traject 1.1.0 → 2.0.0.rc.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +20 -0
  3. data/README.md +85 -73
  4. data/doc/batch_execution.md +2 -6
  5. data/doc/other_commands.md +3 -5
  6. data/doc/settings.md +27 -38
  7. data/lib/traject/command_line.rb +1 -1
  8. data/lib/traject/csv_writer.rb +34 -0
  9. data/lib/traject/delimited_writer.rb +110 -0
  10. data/lib/traject/indexer.rb +29 -11
  11. data/lib/traject/indexer/settings.rb +39 -13
  12. data/lib/traject/line_writer.rb +10 -6
  13. data/lib/traject/marc_reader.rb +2 -1
  14. data/lib/traject/solr_json_writer.rb +277 -0
  15. data/lib/traject/thread_pool.rb +38 -48
  16. data/lib/traject/translation_map.rb +3 -0
  17. data/lib/traject/util.rb +13 -51
  18. data/lib/traject/version.rb +1 -1
  19. data/lib/translation_maps/marc_geographic.yaml +2 -2
  20. data/test/delimited_writer_test.rb +104 -0
  21. data/test/indexer/read_write_test.rb +0 -22
  22. data/test/indexer/settings_test.rb +24 -0
  23. data/test/solr_json_writer_test.rb +248 -0
  24. data/test/test_helper.rb +5 -3
  25. data/test/test_support/demo_config.rb +0 -5
  26. data/test/translation_map_test.rb +9 -0
  27. data/traject.gemspec +18 -5
  28. metadata +77 -87
  29. data/lib/traject/marc4j_reader.rb +0 -153
  30. data/lib/traject/solrj_writer.rb +0 -351
  31. data/test/marc4j_reader_test.rb +0 -136
  32. data/test/solrj_writer_test.rb +0 -209
  33. data/vendor/solrj/README +0 -8
  34. data/vendor/solrj/build.xml +0 -39
  35. data/vendor/solrj/ivy.xml +0 -16
  36. data/vendor/solrj/lib/commons-codec-1.7.jar +0 -0
  37. data/vendor/solrj/lib/commons-io-2.1.jar +0 -0
  38. data/vendor/solrj/lib/httpclient-4.2.3.jar +0 -0
  39. data/vendor/solrj/lib/httpcore-4.2.2.jar +0 -0
  40. data/vendor/solrj/lib/httpmime-4.2.3.jar +0 -0
  41. data/vendor/solrj/lib/jcl-over-slf4j-1.6.6.jar +0 -0
  42. data/vendor/solrj/lib/jul-to-slf4j-1.6.6.jar +0 -0
  43. data/vendor/solrj/lib/log4j-1.2.16.jar +0 -0
  44. data/vendor/solrj/lib/noggit-0.5.jar +0 -0
  45. data/vendor/solrj/lib/slf4j-api-1.6.6.jar +0 -0
  46. data/vendor/solrj/lib/slf4j-log4j12-1.6.6.jar +0 -0
  47. data/vendor/solrj/lib/solr-solrj-4.3.1-javadoc.jar +0 -0
  48. data/vendor/solrj/lib/solr-solrj-4.3.1-sources.jar +0 -0
  49. data/vendor/solrj/lib/solr-solrj-4.3.1.jar +0 -0
  50. data/vendor/solrj/lib/wstx-asl-3.2.7.jar +0 -0
  51. data/vendor/solrj/lib/zookeeper-3.4.5.jar +0 -0
@@ -1,351 +0,0 @@
1
- require 'yell'
2
-
3
- require 'traject'
4
- require 'traject/util'
5
- require 'traject/qualified_const_get'
6
- require 'traject/thread_pool'
7
-
8
- require 'uri'
9
- require 'thread' # for Mutex
10
-
11
- #
12
- # Writes to a Solr using SolrJ, and the SolrJ HttpSolrServer.
13
- #
14
- # After you call #close, you can check #skipped_record_count if you want
15
- # for an integer count of skipped records.
16
- #
17
- # For fatal errors that raise... async processing with thread_pool means that
18
- # you may not get a raise immediately after calling #put, you may get it on
19
- # a FUTURE #put or #close. You should get it eventually though.
20
- #
21
- # ## Settings
22
- #
23
- # * solr.url: Your solr url (required)
24
- #
25
- # * solrj_writer.server_class_name: Defaults to "HttpSolrServer". You can specify
26
- # another Solr Server sub-class, but it has
27
- # to take a one-arg url constructor. Maybe
28
- # subclass this writer class and overwrite
29
- # instantiate_solr_server! otherwise
30
- #
31
- # * solrj.jar_dir: Custom directory containing all of the SolrJ jars. All
32
- # jars in this dir will be loaded. Otherwise,
33
- # we load our own packaged solrj jars. This setting
34
- # can't really be used differently in the same app instance,
35
- # since jars are loaded globally.
36
- #
37
- # * solrj_writer.parser_class_name: A String name of a class in package
38
- # org.apache.solr.client.solrj.impl,
39
- # we'll instantiate one with a zero-arg
40
- # constructor, and pass it as an arg to setParser on
41
- # the SolrServer instance, if present.
42
- # NOTE: For contacting a Solr 1.x server, with the
43
- # recent version of SolrJ used by default, set to
44
- # "XMLResponseParser"
45
- #
46
- # * solrj_writer.commit_on_close: If true (or string 'true'), send a commit to solr
47
- # at end of #process.
48
- #
49
- # * solrj_writer.batch_size: If non-nil and more than 1, send documents to
50
- # solr in batches of solrj_writer.batch_size. If nil/1,
51
- # however, an http transaction with solr will be done
52
- # per doc. DEFAULT to 100, which seems to be a sweet spot.
53
- #
54
- # * solrj_writer.thread_pool: Defaults to 1. A thread pool is used for submitting docs
55
- # to solr. Set to 0 or nil to disable threading. Set to 1,
56
- # there will still be a single bg thread doing the adds. For
57
- # very fast Solr servers and very fast indexing processes, may
58
- # make sense to increase this value to throw at Solr as fast as it
59
- # can catch.
60
- #
61
- # ## Example
62
- #
63
- # settings do
64
- # provide "writer_class_name", "Traject::SolrJWriter"
65
- #
66
- # # This is just regular ruby, so don't be afraid to have conditionals!
67
- # # Switch on hostname, for test and production server differences
68
- # if Socket.gethostname =~ /devhost/
69
- # provide "solr.url", "http://my.dev.machine:9033/catalog"
70
- # else
71
- # provide "solr.url", "http://my.production.machine:9033/catalog"
72
- # end
73
- #
74
- # provide "solrj_writer.parser_class_name", "BinaryResponseParser" # for Solr 4.x
75
- # # provide "solrj_writer.parser_class_name", "XMLResponseParser" # For solr 1.x or 3.x
76
- #
77
- # provide "solrj_writer.commit_on_close", "true"
78
- # end
79
- class Traject::SolrJWriter
80
- # just a tuple of a SolrInputDocument
81
- # and a Traject::Indexer::Context it came from
82
- class UpdatePackage
83
- attr_accessor :solr_document, :context
84
- def initialize(doc, ctx)
85
- self.solr_document = doc
86
- self.context = ctx
87
- end
88
- end
89
-
90
- include Traject::QualifiedConstGet
91
-
92
- attr_reader :settings
93
-
94
- attr_reader :batched_queue
95
-
96
- def initialize(argSettings)
97
- @settings = Traject::Indexer::Settings.new(argSettings)
98
- settings_check!(settings)
99
-
100
- ensure_solrj_loaded!
101
-
102
- solr_server # init
103
-
104
- @batched_queue = java.util.concurrent.LinkedBlockingQueue.new
105
-
106
- # when multi-threaded exceptions raised in threads are held here
107
- # we need a HIGH performance queue here to try and avoid slowing things down,
108
- # since we need to check it frequently.
109
- @async_exception_queue = java.util.concurrent.ConcurrentLinkedQueue.new
110
-
111
- # Store error count in an AtomicInteger, so multi threads can increment
112
- # it safely, if we're threaded.
113
- @skipped_record_incrementer = java.util.concurrent.atomic.AtomicInteger.new(0)
114
-
115
- # if our thread pool settings are 0, it'll just create a null threadpool that
116
- # executes in calling context.
117
- @thread_pool = Traject::ThreadPool.new( @settings["solrj_writer.thread_pool"].to_i )
118
-
119
- @debug_ascii_progress = (@settings["debug_ascii_progress"].to_s == "true")
120
-
121
- logger.info(" #{self.class.name} writing to '#{settings['solr.url']}'")
122
- end
123
-
124
- # Loads solrj if not already loaded. By loading all jars found
125
- # in settings["solrj.jar_dir"]
126
- def ensure_solrj_loaded!
127
- unless defined?(HttpSolrServer) && defined?(SolrInputDocument)
128
- Traject::Util.require_solrj_jars(settings)
129
- end
130
-
131
- # And for now, SILENCE SolrJ logging
132
- org.apache.log4j.Logger.getRootLogger().addAppender(org.apache.log4j.varia.NullAppender.new)
133
- end
134
-
135
- # Method IS thread-safe, can be called concurrently by multi-threads.
136
- #
137
- # Why? If not using batched add, we just use the SolrServer, which is already
138
- # thread safe itself.
139
- #
140
- # If we are using batch add, we surround all access to our shared state batch queue
141
- # in a mutex -- just a naive implementation. May be able to improve performance
142
- # with more sophisticated java.util.concurrent data structure (blocking queue etc)
143
- # I did try a java ArrayBlockingQueue or LinkedBlockingQueue instead of our own
144
- # mutex -- I did not see consistently different performance. May want to
145
- # change so doesn't use a mutex at all if multiple mapping threads aren't being
146
- # used.
147
- #
148
- # this class does not at present use any threads itself, all work will be done
149
- # in the calling thread, including actual http transactions to solr via solrj SolrServer
150
- # if using batches, then not every #put is a http transaction, but when it is,
151
- # it's in the calling thread, synchronously.
152
- def put(context)
153
- @thread_pool.raise_collected_exception!
154
-
155
- # package the SolrInputDocument along with the context, so we have
156
- # the context for error reporting when we actually add.
157
-
158
- package = UpdatePackage.new(hash_to_solr_document(context.output_hash), context)
159
-
160
- if settings["solrj_writer.batch_size"].to_i > 1
161
- ready_batch = []
162
-
163
- batched_queue.add(package)
164
- if batched_queue.size >= settings["solrj_writer.batch_size"].to_i
165
- batched_queue.drain_to(ready_batch)
166
- end
167
-
168
- if ready_batch.length > 0
169
- if @debug_ascii_progress
170
- $stderr.write("^")
171
- if @thread_pool.queue && (@thread_pool.queue.size >= @thread_pool.queue_capacity)
172
- $stderr.write "!"
173
- end
174
- end
175
-
176
- @thread_pool.maybe_in_thread_pool { batch_add_document_packages(ready_batch) }
177
- end
178
- else # non-batched add, add one at a time.
179
- @thread_pool.maybe_in_thread_pool { add_one_document_package(package) }
180
- end
181
- end
182
-
183
- def hash_to_solr_document(hash)
184
- doc = SolrInputDocument.new
185
- hash.each_pair do |key, value_array|
186
- value_array.each do |value|
187
- doc.addField( key, value )
188
- end
189
- end
190
- return doc
191
- end
192
-
193
- # Takes array and batch adds it to solr -- array of UpdatePackage tuples of
194
- # SolrInputDocument and context.
195
- #
196
- # Catches error in batch add, logs, and re-tries docs individually
197
- #
198
- # Is thread-safe, because SolrServer is thread-safe, and we aren't
199
- # referencing any other shared state. Important that CALLER passes
200
- # in a doc array that is not shared state, extracting it from
201
- # shared state batched_queue in a mutex.
202
- def batch_add_document_packages(current_batch)
203
- begin
204
- a = current_batch.collect {|package| package.solr_document }
205
- solr_server.add( a )
206
-
207
- $stderr.write "%" if @debug_ascii_progress
208
- rescue Exception => e
209
- # Error in batch, none of the docs got added, let's try to re-add
210
- # em all individually, so those that CAN get added get added, and those
211
- # that can't get individually logged.
212
- logger.warn "Error encountered in batch solr add, will re-try documents individually, at a performance penalty...\n" + Traject::Util.exception_to_log_message(e)
213
- current_batch.each do |package|
214
- add_one_document_package(package)
215
- end
216
- end
217
- end
218
-
219
-
220
- # Adds a single SolrInputDocument passed in as an UpdatePackage combo of SolrInputDocument
221
- # and context.
222
- #
223
- # Rescues exceptions thrown by SolrServer.add, logs them, and then raises them
224
- # again if deemed fatal and should stop indexing. Only intended to be used on a SINGLE
225
- # document add. If we get an exception on a multi-doc batch add, we need to recover
226
- # differently.
227
- def add_one_document_package(package)
228
- begin
229
- solr_server.add(package.solr_document)
230
- # Honestly not sure what the difference is between those types, but SolrJ raises both
231
- rescue org.apache.solr.common.SolrException, org.apache.solr.client.solrj.SolrServerException => e
232
- id = package.context.source_record && package.context.source_record['001'] && package.context.source_record['001'].value
233
- id_str = id ? "001:#{id}" : ""
234
-
235
- position = package.context.position
236
- position_str = position ? "at file position #{position} (starting at 1)" : ""
237
-
238
- logger.error("Could not index record #{id_str} #{position_str}\n" + Traject::Util.exception_to_log_message(e) )
239
- logger.debug(package.context.source_record.to_s)
240
-
241
- @skipped_record_incrementer.getAndIncrement() # AtomicInteger, thread-safe increment.
242
-
243
- if fatal_exception? e
244
- logger.fatal ("SolrJ exception judged fatal, raising...")
245
- raise e
246
- end
247
- end
248
- end
249
-
250
- def logger
251
- settings["logger"] ||= Yell.new(STDERR, :level => "gt.fatal") # null logger
252
- end
253
-
254
- # If an exception is encountered talking to Solr, is it one we should
255
- # entirely give up on? SolrJ doesn't use a useful exception class hieararchy,
256
- # we have to look into it's details and guess.
257
- def fatal_exception?(e)
258
-
259
-
260
- root_cause = e.respond_to?(:getRootCause) && e.getRootCause
261
-
262
- # Various kinds of inability to actually talk to the
263
- # server look like this:
264
- if root_cause.kind_of? java.io.IOException
265
- return true
266
- end
267
-
268
- # Consider Solr server returning HTTP 500 Internal Server Error to be fatal.
269
- # This can mean, for instance, that disk space is exhausted on solr server.
270
- if e.kind_of?(Java::OrgApacheSolrCommon::SolrException) && e.code == 500
271
- return true
272
- end
273
-
274
- return false
275
- end
276
-
277
- def close
278
- @thread_pool.raise_collected_exception!
279
-
280
- # Any leftovers in batch buffer? Send em to the threadpool too.
281
- if batched_queue.length > 0
282
- packages = []
283
- batched_queue.drain_to(packages)
284
-
285
- # we do it in the thread pool for consistency, and so
286
- # it goes to the end of the queue behind any outstanding
287
- # work in the pool.
288
- @thread_pool.maybe_in_thread_pool { batch_add_document_packages( packages ) }
289
- end
290
-
291
- # Wait for shutdown, and time it.
292
- logger.debug "SolrJWriter: Shutting down thread pool, waiting if needed..."
293
- elapsed = @thread_pool.shutdown_and_wait
294
- if elapsed > 60
295
- logger.warn "Waited #{elapsed} seconds for all SolrJWriter threads, you may want to increase solrj_writer.thread_pool (currently #{@settings["solrj_writer.thread_pool"]})"
296
- end
297
- logger.debug "SolrJWriter: Thread pool shutdown complete"
298
- logger.warn "SolrJWriter: #{skipped_record_count} skipped records" if skipped_record_count > 0
299
-
300
- # check again now that we've waited, there could still be some
301
- # that didn't show up before.
302
- @thread_pool.raise_collected_exception!
303
-
304
- if settings["solrj_writer.commit_on_close"].to_s == "true"
305
- logger.info "SolrJWriter: Sending commit to solr..."
306
- solr_server.commit
307
- end
308
-
309
- solr_server.shutdown
310
- @solr_server = nil
311
- end
312
-
313
- # Return count of encountered skipped records. Most accurate to call
314
- # it after #close, in which case it should include full count, even
315
- # under async thread_pool.
316
- def skipped_record_count
317
- @skipped_record_incrementer.get
318
- end
319
-
320
-
321
- def solr_server
322
- @solr_server ||= instantiate_solr_server!
323
- end
324
- attr_writer :solr_server # mainly for testing
325
-
326
- # Instantiates a solr server of class settings["solrj_writer.server_class_name"] or "HttpSolrServer"
327
- # and initializes it with settings["solr.url"]
328
- def instantiate_solr_server!
329
- server_class = qualified_const_get( settings["solrj_writer.server_class_name"] || "HttpSolrServer" )
330
- server = server_class.new( settings["solr.url"].to_s );
331
-
332
- if parser_name = settings["solrj_writer.parser_class_name"]
333
- #parser = org.apache.solr.client.solrj.impl.const_get(parser_name).new
334
- parser = Java::JavaClass.for_name("org.apache.solr.client.solrj.impl.#{parser_name}").ruby_class.new
335
- server.setParser( parser )
336
- end
337
-
338
- server
339
- end
340
-
341
- def settings_check!(settings)
342
- unless settings.has_key?("solr.url") && ! settings["solr.url"].nil?
343
- raise ArgumentError.new("SolrJWriter requires a 'solr.url' solr url in settings")
344
- end
345
-
346
- unless settings["solr.url"] =~ /^#{URI::regexp}$/
347
- raise ArgumentError.new("SolrJWriter requires a 'solr.url' setting that looks like a URL, not: `#{settings['solr.url']}`")
348
- end
349
- end
350
-
351
- end
@@ -1,136 +0,0 @@
1
- # Encoding: UTF-8
2
-
3
- require 'test_helper'
4
-
5
- require 'traject'
6
- require 'traject/indexer'
7
- require 'traject/marc4j_reader'
8
-
9
- require 'marc'
10
-
11
- describe "Marc4JReader" do
12
- it "reads Marc binary" do
13
- file = File.new(support_file_path("test_data.utf8.mrc"))
14
- settings = Traject::Indexer::Settings.new() # binary type is default
15
- reader = Traject::Marc4JReader.new(file, settings)
16
-
17
- array = reader.to_a
18
-
19
- assert_equal 30, array.length
20
- first = array.first
21
-
22
- assert_kind_of MARC::Record, first
23
- assert_equal first['245']['a'].encoding.name, "UTF-8"
24
- end
25
-
26
- it "can skip a bad subfield code" do
27
- file = File.new(support_file_path("bad_subfield_code.marc"))
28
- settings = Traject::Indexer::Settings.new() # binary type is default
29
- reader = Traject::Marc4JReader.new(file, settings)
30
-
31
- array = reader.to_a
32
-
33
- assert_equal 1, array.length
34
- assert_kind_of MARC::Record, array.first
35
- assert_length 2, array.first['260'].subfields
36
- end
37
-
38
- it "reads Marc binary in Marc8 encoding" do
39
- file = File.new(support_file_path("one-marc8.mrc"))
40
- settings = Traject::Indexer::Settings.new("marc_source.encoding" => "MARC8")
41
- reader = Traject::Marc4JReader.new(file, settings)
42
-
43
- array = reader.to_a
44
-
45
- assert_length 1, array
46
-
47
-
48
- assert_kind_of MARC::Record, array.first
49
- a245a = array.first['245']['a']
50
-
51
- assert a245a.encoding.name, "UTF-8"
52
- assert a245a.valid_encoding?
53
- # marc4j converts to denormalized unicode, bah. Although
54
- # it's legal, it probably looks weird as a string literal
55
- # below, depending on your editor.
56
- assert_equal "Por uma outra globalização :", a245a
57
-
58
- # Set leader byte to proper for unicode
59
- assert_equal 'a', array.first.leader[9]
60
- end
61
-
62
-
63
- it "reads XML" do
64
- file = File.new(support_file_path "test_data.utf8.marc.xml")
65
- settings = Traject::Indexer::Settings.new("marc_source.type" => "xml")
66
- reader = Traject::Marc4JReader.new(file, settings)
67
-
68
- array = reader.to_a
69
-
70
- assert_equal 30, array.length
71
-
72
- first = array.first
73
-
74
- assert_kind_of MARC::Record, first
75
- assert first['245']['a'].encoding.name, "UTF-8"
76
- assert_equal "Fikr-i Ayāz /", first['245']['a']
77
- end
78
-
79
- it "keeps marc4j object when asked" do
80
- file = File.new(support_file_path "test_data.utf8.marc.xml")
81
- settings = Traject::Indexer::Settings.new("marc_source.type" => "xml", 'marc4j_reader.keep_marc4j' => true)
82
- record = Traject::Marc4JReader.new(file, settings).to_a.first
83
- assert_kind_of MARC::Record, record
84
- assert_kind_of Java::org.marc4j.marc.impl::RecordImpl, record.original_marc4j
85
- end
86
-
87
- it "replaces unicode character reference in Marc8 transcode" do
88
- file = File.new(support_file_path "escaped_character_reference.marc8.marc")
89
- # due to marc4j idiosyncracies, this test will NOT pass with default source_encoding
90
- # of "BESTGUESS", it only works if you explicitly set to MARC8. Doh.
91
- settings = Traject::Indexer::Settings.new("marc_source.encoding" => "MARC8") # binary type is default
92
- record = Traject::Marc4JReader.new(file, settings).to_a.first
93
-
94
- assert_equal "Rio de Janeiro escaped replacement char: \uFFFD .", record['260']['a']
95
- end
96
-
97
- describe "Marc4J Java Permissive Stream Reader" do
98
- # needed for sanity check when our tests fail to see if Marc4J
99
- # is not behaving how we think it should.
100
- it "converts character references" do
101
- file = File.new(support_file_path "escaped_character_reference.marc8.marc")
102
- reader = MarcPermissiveStreamReader.new(file.to_inputstream, true, true, "MARC-8")
103
- record = reader.next
104
-
105
- field = record.getVariableField("260")
106
- subfield = field.getSubfield('a'.ord)
107
- value = subfield.getData
108
-
109
- assert_equal "Rio de Janeiro escaped replacement char: \uFFFD .", value
110
- end
111
- end
112
-
113
- it "replaces bad byte in UTF8 marc" do
114
- skip "Marc4J needs fixing on it's end" # Marc4J won't do this in 'permissive' mode, gah.
115
-
116
- # Note this only works because the marc file DOES correctly
117
- # have leader byte 9 set to 'a' for UTF8, otherwise Marc4J can't do it.
118
- file = File.new(support_file_path "bad_utf_byte.utf8.marc")
119
-
120
- settings = Traject::Indexer::Settings.new() # binary UTF8 type is default
121
- reader = Traject::Marc4JReader.new(file, settings)
122
-
123
- record = reader.to_a.first
124
-
125
- value = record['300']['a']
126
-
127
- assert_equal value.encoding.name, "UTF-8"
128
- assert value.valid_encoding?, "Has valid encoding"
129
- assert_equal "This is a bad byte: '\uFFFD' and another: '\uFFFD'", record['300']['a']
130
- end
131
-
132
-
133
-
134
-
135
-
136
- end