traject 1.1.0 → 2.0.0.rc.1

Files changed (51)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +20 -0
  3. data/README.md +85 -73
  4. data/doc/batch_execution.md +2 -6
  5. data/doc/other_commands.md +3 -5
  6. data/doc/settings.md +27 -38
  7. data/lib/traject/command_line.rb +1 -1
  8. data/lib/traject/csv_writer.rb +34 -0
  9. data/lib/traject/delimited_writer.rb +110 -0
  10. data/lib/traject/indexer.rb +29 -11
  11. data/lib/traject/indexer/settings.rb +39 -13
  12. data/lib/traject/line_writer.rb +10 -6
  13. data/lib/traject/marc_reader.rb +2 -1
  14. data/lib/traject/solr_json_writer.rb +277 -0
  15. data/lib/traject/thread_pool.rb +38 -48
  16. data/lib/traject/translation_map.rb +3 -0
  17. data/lib/traject/util.rb +13 -51
  18. data/lib/traject/version.rb +1 -1
  19. data/lib/translation_maps/marc_geographic.yaml +2 -2
  20. data/test/delimited_writer_test.rb +104 -0
  21. data/test/indexer/read_write_test.rb +0 -22
  22. data/test/indexer/settings_test.rb +24 -0
  23. data/test/solr_json_writer_test.rb +248 -0
  24. data/test/test_helper.rb +5 -3
  25. data/test/test_support/demo_config.rb +0 -5
  26. data/test/translation_map_test.rb +9 -0
  27. data/traject.gemspec +18 -5
  28. metadata +77 -87
  29. data/lib/traject/marc4j_reader.rb +0 -153
  30. data/lib/traject/solrj_writer.rb +0 -351
  31. data/test/marc4j_reader_test.rb +0 -136
  32. data/test/solrj_writer_test.rb +0 -209
  33. data/vendor/solrj/README +0 -8
  34. data/vendor/solrj/build.xml +0 -39
  35. data/vendor/solrj/ivy.xml +0 -16
  36. data/vendor/solrj/lib/commons-codec-1.7.jar +0 -0
  37. data/vendor/solrj/lib/commons-io-2.1.jar +0 -0
  38. data/vendor/solrj/lib/httpclient-4.2.3.jar +0 -0
  39. data/vendor/solrj/lib/httpcore-4.2.2.jar +0 -0
  40. data/vendor/solrj/lib/httpmime-4.2.3.jar +0 -0
  41. data/vendor/solrj/lib/jcl-over-slf4j-1.6.6.jar +0 -0
  42. data/vendor/solrj/lib/jul-to-slf4j-1.6.6.jar +0 -0
  43. data/vendor/solrj/lib/log4j-1.2.16.jar +0 -0
  44. data/vendor/solrj/lib/noggit-0.5.jar +0 -0
  45. data/vendor/solrj/lib/slf4j-api-1.6.6.jar +0 -0
  46. data/vendor/solrj/lib/slf4j-log4j12-1.6.6.jar +0 -0
  47. data/vendor/solrj/lib/solr-solrj-4.3.1-javadoc.jar +0 -0
  48. data/vendor/solrj/lib/solr-solrj-4.3.1-sources.jar +0 -0
  49. data/vendor/solrj/lib/solr-solrj-4.3.1.jar +0 -0
  50. data/vendor/solrj/lib/wstx-asl-3.2.7.jar +0 -0
  51. data/vendor/solrj/lib/zookeeper-3.4.5.jar +0 -0
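
The headline change in this diff: the JRuby-only Traject::SolrJWriter and its vendored SolrJ jars are removed, replaced by the new pure-ruby Traject::SolrJsonWriter (data/lib/traject/solr_json_writer.rb, +277), which posts JSON to Solr over HTTP and runs under MRI as well as JRuby. As a rough migration sketch -- the solr_writer.* setting names below follow the traject 2.x convention, but verify them against the shipped README before relying on this:

    settings do
      # SolrJsonWriter is the traject 2.x default writer; named explicitly for clarity
      provide "writer_class_name", "Traject::SolrJsonWriter"
      provide "solr.url", "http://localhost:8983/solr/collection1"

      # replaces the solrj_writer.batch_size / solrj_writer.commit_on_close
      # settings documented in the removed file below
      provide "solr_writer.batch_size", 100
      provide "solr_writer.commit_on_close", "true"
    end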
data/lib/traject/solrj_writer.rb
@@ -1,351 +0,0 @@
- require 'yell'
-
- require 'traject'
- require 'traject/util'
- require 'traject/qualified_const_get'
- require 'traject/thread_pool'
-
- require 'uri'
- require 'thread' # for Mutex
-
- #
- # Writes to Solr using SolrJ and the SolrJ HttpSolrServer.
- #
- # After you call #close, you can check #skipped_record_count if you want
- # an integer count of skipped records.
- #
- # For fatal errors that raise... async processing with thread_pool means that
- # you may not get a raise immediately after calling #put; you may get it on
- # a FUTURE #put or #close. You should get it eventually, though.
- #
- # ## Settings
- #
- # * solr.url: Your solr url (required)
- #
- # * solrj_writer.server_class_name: Defaults to "HttpSolrServer". You can specify
- #   another SolrServer sub-class, but it has to take a one-arg url constructor.
- #   Otherwise, you may subclass this writer class and override
- #   instantiate_solr_server!
- #
- # * solrj.jar_dir: Custom directory containing all of the SolrJ jars. All
- #   jars in this dir will be loaded. Otherwise, we load our own packaged
- #   solrj jars. This setting can't really be used differently in the same
- #   app instance, since jars are loaded globally.
- #
- # * solrj_writer.parser_class_name: A String name of a class in package
- #   org.apache.solr.client.solrj.impl. We'll instantiate one with a zero-arg
- #   constructor, and pass it as an arg to setParser on the SolrServer
- #   instance, if present. NOTE: For contacting a Solr 1.x server with the
- #   recent version of SolrJ used by default, set to "XMLResponseParser".
- #
- # * solrj_writer.commit_on_close: If true (or string 'true'), send a commit to solr
- #   at end of #process.
- #
- # * solrj_writer.batch_size: If non-nil and more than 1, send documents to
- #   solr in batches of solrj_writer.batch_size. If nil/1, however, an http
- #   transaction with solr will be done per doc. DEFAULTS to 100, which seems
- #   to be a sweet spot.
- #
- # * solrj_writer.thread_pool: Defaults to 1. A thread pool is used for submitting docs
- #   to solr. Set to 0 or nil to disable threading. Set to 1, there will
- #   still be a single bg thread doing the adds. For very fast Solr servers
- #   and very fast indexing processes, it may make sense to increase this
- #   value to throw docs at Solr as fast as it can catch them.
- #
- # ## Example
- #
- #   settings do
- #     provide "writer_class_name", "Traject::SolrJWriter"
- #
- #     # This is just regular ruby, so don't be afraid to have conditionals!
- #     # Switch on hostname, for test and production server differences
- #     if Socket.gethostname =~ /devhost/
- #       provide "solr.url", "http://my.dev.machine:9033/catalog"
- #     else
- #       provide "solr.url", "http://my.production.machine:9033/catalog"
- #     end
- #
- #     provide "solrj_writer.parser_class_name", "BinaryResponseParser" # for Solr 4.x
- #     # provide "solrj_writer.parser_class_name", "XMLResponseParser" # for Solr 1.x or 3.x
- #
- #     provide "solrj_writer.commit_on_close", "true"
- #   end
- class Traject::SolrJWriter
-   # just a tuple of a SolrInputDocument
-   # and the Traject::Indexer::Context it came from
-   class UpdatePackage
-     attr_accessor :solr_document, :context
-     def initialize(doc, ctx)
-       self.solr_document = doc
-       self.context = ctx
-     end
-   end
-
-   include Traject::QualifiedConstGet
-
-   attr_reader :settings
-
-   attr_reader :batched_queue
-
-   def initialize(argSettings)
-     @settings = Traject::Indexer::Settings.new(argSettings)
-     settings_check!(settings)
-
-     ensure_solrj_loaded!
-
-     solr_server # init
-
-     @batched_queue = java.util.concurrent.LinkedBlockingQueue.new
-
-     # When multi-threaded, exceptions raised in threads are held here.
-     # We need a HIGH performance queue here to try and avoid slowing things down,
-     # since we need to check it frequently.
-     @async_exception_queue = java.util.concurrent.ConcurrentLinkedQueue.new
-
-     # Store error count in an AtomicInteger, so multiple threads can increment
-     # it safely, if we're threaded.
-     @skipped_record_incrementer = java.util.concurrent.atomic.AtomicInteger.new(0)
-
-     # If our thread pool setting is 0, it'll just create a null threadpool that
-     # executes in the calling context.
-     @thread_pool = Traject::ThreadPool.new( @settings["solrj_writer.thread_pool"].to_i )
-
-     @debug_ascii_progress = (@settings["debug_ascii_progress"].to_s == "true")
-
-     logger.info("   #{self.class.name} writing to '#{settings['solr.url']}'")
-   end
-
-   # Loads SolrJ if not already loaded, by loading all jars found
-   # in settings["solrj.jar_dir"]
-   def ensure_solrj_loaded!
-     unless defined?(HttpSolrServer) && defined?(SolrInputDocument)
-       Traject::Util.require_solrj_jars(settings)
-     end
-
-     # And for now, SILENCE SolrJ logging
-     org.apache.log4j.Logger.getRootLogger().addAppender(org.apache.log4j.varia.NullAppender.new)
-   end
-
-   # Method IS thread-safe, can be called concurrently by multiple threads.
-   #
-   # Why? If not using batched add, we just use the SolrServer, which is already
-   # thread safe itself.
-   #
-   # If we are using batch add, we surround all access to our shared-state batch queue
-   # in a mutex -- just a naive implementation. May be able to improve performance
-   # with a more sophisticated java.util.concurrent data structure (blocking queue etc).
-   # I did try a java ArrayBlockingQueue or LinkedBlockingQueue instead of our own
-   # mutex -- I did not see consistently different performance. May want to
-   # change so it doesn't use a mutex at all if multiple mapping threads aren't
-   # being used.
-   #
-   # This class does not at present use any threads itself; all work will be done
-   # in the calling thread, including actual http transactions to solr via the
-   # solrj SolrServer. If using batches, then not every #put is an http transaction,
-   # but when it is, it's in the calling thread, synchronously.
-   def put(context)
-     @thread_pool.raise_collected_exception!
-
-     # Package the SolrInputDocument along with the context, so we have
-     # the context for error reporting when we actually add.
-
-     package = UpdatePackage.new(hash_to_solr_document(context.output_hash), context)
-
-     if settings["solrj_writer.batch_size"].to_i > 1
-       ready_batch = []
-
-       batched_queue.add(package)
-       if batched_queue.size >= settings["solrj_writer.batch_size"].to_i
-         batched_queue.drain_to(ready_batch)
-       end
-
-       if ready_batch.length > 0
-         if @debug_ascii_progress
-           $stderr.write("^")
-           if @thread_pool.queue && (@thread_pool.queue.size >= @thread_pool.queue_capacity)
-             $stderr.write "!"
-           end
-         end
-
-         @thread_pool.maybe_in_thread_pool { batch_add_document_packages(ready_batch) }
-       end
-     else # non-batched add, add one at a time.
-       @thread_pool.maybe_in_thread_pool { add_one_document_package(package) }
-     end
-   end
-
-   def hash_to_solr_document(hash)
-     doc = SolrInputDocument.new
-     hash.each_pair do |key, value_array|
-       value_array.each do |value|
-         doc.addField( key, value )
-       end
-     end
-     return doc
-   end
-
-   # Takes an array and batch adds it to solr -- an array of UpdatePackage
-   # tuples of SolrInputDocument and context.
-   #
-   # Catches error in batch add, logs, and re-tries docs individually.
-   #
-   # Is thread-safe, because SolrServer is thread-safe, and we aren't
-   # referencing any other shared state. Important that CALLER passes
-   # in a doc array that is not shared state, extracting it from
-   # shared state batched_queue in a mutex.
-   def batch_add_document_packages(current_batch)
-     begin
-       a = current_batch.collect {|package| package.solr_document }
-       solr_server.add( a )
-
-       $stderr.write "%" if @debug_ascii_progress
-     rescue Exception => e
-       # Error in batch, none of the docs got added; let's try to re-add
-       # 'em all individually, so those that CAN get added get added, and
-       # those that can't get individually logged.
-       logger.warn "Error encountered in batch solr add, will re-try documents individually, at a performance penalty...\n" + Traject::Util.exception_to_log_message(e)
-       current_batch.each do |package|
-         add_one_document_package(package)
-       end
-     end
-   end
-
-
-   # Adds a single SolrInputDocument passed in as an UpdatePackage combo of
-   # SolrInputDocument and context.
-   #
-   # Rescues exceptions thrown by SolrServer.add, logs them, and then raises them
-   # again if deemed fatal and indexing should stop. Only intended to be used on a
-   # SINGLE document add. If we get an exception on a multi-doc batch add, we need
-   # to recover differently.
-   def add_one_document_package(package)
-     begin
-       solr_server.add(package.solr_document)
-     # Honestly not sure what the difference is between those types, but SolrJ raises both
-     rescue org.apache.solr.common.SolrException, org.apache.solr.client.solrj.SolrServerException => e
-       id = package.context.source_record && package.context.source_record['001'] && package.context.source_record['001'].value
-       id_str = id ? "001:#{id}" : ""
-
-       position = package.context.position
-       position_str = position ? "at file position #{position} (starting at 1)" : ""
-
-       logger.error("Could not index record #{id_str} #{position_str}\n" + Traject::Util.exception_to_log_message(e) )
-       logger.debug(package.context.source_record.to_s)
-
-       @skipped_record_incrementer.getAndIncrement() # AtomicInteger, thread-safe increment.
-
-       if fatal_exception? e
-         logger.fatal("SolrJ exception judged fatal, raising...")
-         raise e
-       end
-     end
-   end
-
-   def logger
-     settings["logger"] ||= Yell.new(STDERR, :level => "gt.fatal") # null logger
-   end
-
-   # If an exception is encountered talking to Solr, is it one we should
-   # entirely give up on? SolrJ doesn't use a useful exception class hierarchy;
-   # we have to look into its details and guess.
-   def fatal_exception?(e)
-
-     root_cause = e.respond_to?(:getRootCause) && e.getRootCause
-
-     # Various kinds of inability to actually talk to the
-     # server look like this:
-     if root_cause.kind_of? java.io.IOException
-       return true
-     end
-
-     # Consider Solr server returning HTTP 500 Internal Server Error to be fatal.
-     # This can mean, for instance, that disk space is exhausted on the solr server.
-     if e.kind_of?(Java::OrgApacheSolrCommon::SolrException) && e.code == 500
-       return true
-     end
-
-     return false
-   end
-
-   def close
-     @thread_pool.raise_collected_exception!
-
-     # Any leftovers in batch buffer? Send 'em to the threadpool too.
-     if batched_queue.length > 0
-       packages = []
-       batched_queue.drain_to(packages)
-
-       # We do it in the thread pool for consistency, and so
-       # it goes to the end of the queue behind any outstanding
-       # work in the pool.
-       @thread_pool.maybe_in_thread_pool { batch_add_document_packages( packages ) }
-     end
-
-     # Wait for shutdown, and time it.
-     logger.debug "SolrJWriter: Shutting down thread pool, waiting if needed..."
-     elapsed = @thread_pool.shutdown_and_wait
-     if elapsed > 60
-       logger.warn "Waited #{elapsed} seconds for all SolrJWriter threads, you may want to increase solrj_writer.thread_pool (currently #{@settings["solrj_writer.thread_pool"]})"
-     end
-     logger.debug "SolrJWriter: Thread pool shutdown complete"
-     logger.warn "SolrJWriter: #{skipped_record_count} skipped records" if skipped_record_count > 0
-
-     # Check again now that we've waited; there could still be some
-     # that didn't show up before.
-     @thread_pool.raise_collected_exception!
-
-     if settings["solrj_writer.commit_on_close"].to_s == "true"
-       logger.info "SolrJWriter: Sending commit to solr..."
-       solr_server.commit
-     end
-
-     solr_server.shutdown
-     @solr_server = nil
-   end
-
-   # Return count of encountered skipped records. Most accurate to call
-   # it after #close, in which case it should include the full count, even
-   # under an async thread_pool.
-   def skipped_record_count
-     @skipped_record_incrementer.get
-   end
-
-
-   def solr_server
-     @solr_server ||= instantiate_solr_server!
-   end
-   attr_writer :solr_server # mainly for testing
-
-   # Instantiates a solr server of class settings["solrj_writer.server_class_name"]
-   # or "HttpSolrServer", and initializes it with settings["solr.url"]
-   def instantiate_solr_server!
-     server_class = qualified_const_get( settings["solrj_writer.server_class_name"] || "HttpSolrServer" )
-     server       = server_class.new( settings["solr.url"].to_s )
-
-     if parser_name = settings["solrj_writer.parser_class_name"]
-       parser = Java::JavaClass.for_name("org.apache.solr.client.solrj.impl.#{parser_name}").ruby_class.new
-       server.setParser( parser )
-     end
-
-     server
-   end
-
-   def settings_check!(settings)
-     unless settings.has_key?("solr.url") && ! settings["solr.url"].nil?
-       raise ArgumentError.new("SolrJWriter requires a 'solr.url' solr url in settings")
-     end
-
-     unless settings["solr.url"] =~ /^#{URI::regexp}$/
-       raise ArgumentError.new("SolrJWriter requires a 'solr.url' setting that looks like a URL, not: `#{settings['solr.url']}`")
-     end
-   end
-
- end
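
Also removed in this release: the Marc4J-based reader (data/lib/traject/marc4j_reader.rb, +0 -153) and its test, shown below. If you still depend on Marc4J under JRuby, the reader is understood to have been extracted to the separate traject-marc4j_reader gem; a hypothetical opt-in sketch, assuming that gem is installed:

    # Gemfile: gem 'traject-marc4j_reader', platform: :jruby
    settings do
      provide "reader_class_name", "Traject::Marc4JReader"
      provide "marc_source.type", "binary"
      provide "marc_source.encoding", "MARC8" # as exercised in the tests below
    end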
data/test/marc4j_reader_test.rb
@@ -1,136 +0,0 @@
- # Encoding: UTF-8
-
- require 'test_helper'
-
- require 'traject'
- require 'traject/indexer'
- require 'traject/marc4j_reader'
-
- require 'marc'
-
- describe "Marc4JReader" do
-   it "reads Marc binary" do
-     file = File.new(support_file_path("test_data.utf8.mrc"))
-     settings = Traject::Indexer::Settings.new() # binary type is default
-     reader = Traject::Marc4JReader.new(file, settings)
-
-     array = reader.to_a
-
-     assert_equal 30, array.length
-     first = array.first
-
-     assert_kind_of MARC::Record, first
-     assert_equal "UTF-8", first['245']['a'].encoding.name
-   end
-
-   it "can skip a bad subfield code" do
-     file = File.new(support_file_path("bad_subfield_code.marc"))
-     settings = Traject::Indexer::Settings.new() # binary type is default
-     reader = Traject::Marc4JReader.new(file, settings)
-
-     array = reader.to_a
-
-     assert_equal 1, array.length
-     assert_kind_of MARC::Record, array.first
-     assert_length 2, array.first['260'].subfields
-   end
-
-   it "reads Marc binary in Marc8 encoding" do
-     file = File.new(support_file_path("one-marc8.mrc"))
-     settings = Traject::Indexer::Settings.new("marc_source.encoding" => "MARC8")
-     reader = Traject::Marc4JReader.new(file, settings)
-
-     array = reader.to_a
-
-     assert_length 1, array
-
-     assert_kind_of MARC::Record, array.first
-     a245a = array.first['245']['a']
-
-     assert_equal "UTF-8", a245a.encoding.name
-     assert a245a.valid_encoding?
-     # marc4j converts to denormalized unicode, bah. Although
-     # it's legal, it probably looks weird as a string literal
-     # below, depending on your editor.
-     assert_equal "Por uma outra globalização :", a245a
-
-     # Leader byte 9 set properly for unicode
-     assert_equal 'a', array.first.leader[9]
-   end
-
-
-   it "reads XML" do
-     file = File.new(support_file_path "test_data.utf8.marc.xml")
-     settings = Traject::Indexer::Settings.new("marc_source.type" => "xml")
-     reader = Traject::Marc4JReader.new(file, settings)
-
-     array = reader.to_a
-
-     assert_equal 30, array.length
-
-     first = array.first
-
-     assert_kind_of MARC::Record, first
-     assert_equal "UTF-8", first['245']['a'].encoding.name
-     assert_equal "Fikr-i Ayāz /", first['245']['a']
-   end
-
-   it "keeps marc4j object when asked" do
-     file = File.new(support_file_path "test_data.utf8.marc.xml")
-     settings = Traject::Indexer::Settings.new("marc_source.type" => "xml", 'marc4j_reader.keep_marc4j' => true)
-     record = Traject::Marc4JReader.new(file, settings).to_a.first
-     assert_kind_of MARC::Record, record
-     assert_kind_of Java::org.marc4j.marc.impl::RecordImpl, record.original_marc4j
-   end
-
-   it "replaces unicode character reference in Marc8 transcode" do
-     file = File.new(support_file_path "escaped_character_reference.marc8.marc")
-     # Due to marc4j idiosyncrasies, this test will NOT pass with the default
-     # source_encoding of "BESTGUESS"; it only works if you explicitly set it
-     # to MARC8. Doh.
-     settings = Traject::Indexer::Settings.new("marc_source.encoding" => "MARC8") # binary type is default
-     record = Traject::Marc4JReader.new(file, settings).to_a.first
-
-     assert_equal "Rio de Janeiro escaped replacement char: \uFFFD .", record['260']['a']
-   end
-
-   describe "Marc4J Java Permissive Stream Reader" do
-     # Needed as a sanity check when our tests fail, to see if Marc4J
-     # is not behaving how we think it should.
-     it "converts character references" do
-       file = File.new(support_file_path "escaped_character_reference.marc8.marc")
-       reader = MarcPermissiveStreamReader.new(file.to_inputstream, true, true, "MARC-8")
-       record = reader.next
-
-       field = record.getVariableField("260")
-       subfield = field.getSubfield('a'.ord)
-       value = subfield.getData
-
-       assert_equal "Rio de Janeiro escaped replacement char: \uFFFD .", value
-     end
-   end
-
-   it "replaces bad byte in UTF8 marc" do
-     skip "Marc4J needs fixing on its end" # Marc4J won't do this in 'permissive' mode, gah.
-
-     # Note this only works because the marc file DOES correctly
-     # have leader byte 9 set to 'a' for UTF8; otherwise Marc4J can't do it.
-     file = File.new(support_file_path "bad_utf_byte.utf8.marc")
-
-     settings = Traject::Indexer::Settings.new() # binary UTF8 type is default
-     reader = Traject::Marc4JReader.new(file, settings)
-
-     record = reader.to_a.first
-
-     value = record['300']['a']
-
-     assert_equal "UTF-8", value.encoding.name
-     assert value.valid_encoding?, "Has valid encoding"
-     assert_equal "This is a bad byte: '\uFFFD' and another: '\uFFFD'", record['300']['a']
-   end
-
- end