traject 0.0.2 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. data/Gemfile +4 -0
  2. data/README.md +85 -61
  3. data/Rakefile +5 -0
  4. data/bin/traject +31 -3
  5. data/doc/settings.md +74 -13
  6. data/lib/tasks/load_maps.rake +48 -0
  7. data/lib/traject/indexer/settings.rb +75 -0
  8. data/lib/traject/indexer.rb +255 -45
  9. data/lib/traject/json_writer.rb +4 -2
  10. data/lib/traject/macros/marc21.rb +18 -6
  11. data/lib/traject/macros/marc21_semantics.rb +405 -0
  12. data/lib/traject/macros/marc_format_classifier.rb +180 -0
  13. data/lib/traject/marc4j_reader.rb +160 -0
  14. data/lib/traject/marc_extractor.rb +33 -17
  15. data/lib/traject/marc_reader.rb +14 -11
  16. data/lib/traject/solrj_writer.rb +247 -9
  17. data/lib/traject/thread_pool.rb +154 -0
  18. data/lib/traject/translation_map.rb +46 -4
  19. data/lib/traject/util.rb +30 -0
  20. data/lib/traject/version.rb +1 -1
  21. data/lib/translation_maps/lcc_top_level.yaml +26 -0
  22. data/lib/translation_maps/marc_genre_007.yaml +9 -0
  23. data/lib/translation_maps/marc_genre_leader.yaml +22 -0
  24. data/lib/translation_maps/marc_geographic.yaml +589 -0
  25. data/lib/translation_maps/marc_instruments.yaml +102 -0
  26. data/lib/translation_maps/marc_languages.yaml +490 -0
  27. data/test/indexer/each_record_test.rb +34 -0
  28. data/test/indexer/macros_marc21_semantics_test.rb +206 -0
  29. data/test/indexer/macros_marc21_test.rb +10 -1
  30. data/test/indexer/map_record_test.rb +78 -8
  31. data/test/indexer/read_write_test.rb +43 -10
  32. data/test/indexer/settings_test.rb +60 -4
  33. data/test/indexer/to_field_test.rb +39 -0
  34. data/test/marc4j_reader_test.rb +75 -0
  35. data/test/marc_extractor_test.rb +62 -0
  36. data/test/marc_format_classifier_test.rb +91 -0
  37. data/test/marc_reader_test.rb +12 -0
  38. data/test/solrj_writer_test.rb +146 -43
  39. data/test/test_helper.rb +50 -0
  40. data/test/test_support/245_no_ab.marc +1 -0
  41. data/test/test_support/880_with_no_6.utf8.marc +1 -0
  42. data/test/test_support/bad_subfield_code.marc +1 -0
  43. data/test/test_support/date_resort_to_260.marc +1 -0
  44. data/test/test_support/date_type_r_missing_date2.marc +1 -0
  45. data/test/test_support/date_with_u.marc +1 -0
  46. data/test/test_support/demo_config.rb +153 -0
  47. data/test/test_support/emptyish_record.marc +1 -0
  48. data/test/test_support/louis_armstrong.marc +1 -0
  49. data/test/test_support/manuscript_online_thesis.marc +1 -0
  50. data/test/test_support/microform_online_conference.marc +1 -0
  51. data/test/test_support/multi_era.marc +1 -0
  52. data/test/test_support/multi_geo.marc +1 -0
  53. data/test/test_support/musical_cage.marc +1 -0
  54. data/test/test_support/one-marc8.mrc +1 -0
  55. data/test/test_support/online_only.marc +1 -0
  56. data/test/test_support/packed_041a_lang.marc +1 -0
  57. data/test/test_support/the_business_ren.marc +1 -0
  58. data/test/translation_map_test.rb +8 -0
  59. data/test/translation_maps/properties_map.properties +5 -0
  60. data/traject.gemspec +1 -1
  61. data/vendor/marc4j/README.md +17 -0
  62. data/vendor/marc4j/lib/marc4j-2.5.1-beta.jar +0 -0
  63. metadata +81 -2
data/lib/traject/marc4j_reader.rb (new file)
@@ -0,0 +1,160 @@
+ require 'traject'
+ require 'marc'
+
+ # Uses Marc4J to read the MARC records, but then translates them to
+ # ruby-marc before delivering them, so Marc4J stays inside the black
+ # box.
+ #
+ # This is one way to get the ability to transcode from MARC8. Records it
+ # delivers are ALWAYS in UTF-8, transcoded if needed.
+ #
+ # We also hope it gives us some performance benefit.
+ #
+ # Uses the Marc4J MarcPermissiveStreamReader for binary, but sometimes
+ # in non-permissive mode, according to settings. Uses the Marc4J MarcXmlReader
+ # for xml.
+ #
+ # NOTE: If you aren't reading in binary records encoded in MARC8, you may
+ # find the pure-ruby Traject::MarcReader faster; the extra step to read
+ # Marc4J but translate to ruby MARC::Record adds some overhead.
+ #
+ # Settings:
+ #
+ # * marc_source.type: serialization type. default 'binary', also 'xml' (TODO: json/marc-in-json)
+ #
+ # * marc4j_reader.permissive: default true, false to turn off permissive reading. Used as
+ #   the value of the 'permissive' arg of the MarcPermissiveStreamReader constructor.
+ #   Only used for 'binary'.
+ #
+ # * marc4j_reader.source_encoding: Only used for 'binary', otherwise always UTF-8.
+ #   A string from the values MarcPermissiveStreamReader accepts:
+ #   * BESTGUESS (tries to use the MARC leader and believe it, I think)
+ #   * ISO8859_1
+ #   * UTF-8
+ #   * MARC8
+ #   Default 'BESTGUESS', but MARC records in the wild are so often wrong here
+ #   that we recommend setting it explicitly.
+ #   (Records will ALWAYS be transcoded to UTF-8 on the way out. We insist.)
+ #
+ # * marc4j_reader.jar_dir: Path to a directory containing the Marc4J jar file to use.
+ #   All .jars in the dir will be loaded. If unset, uses the marc4j.jar bundled with traject.
+ class Traject::Marc4JReader
+   include Enumerable
+
+   attr_reader :settings, :input_stream
+
+   def initialize(input_stream, settings)
+     @settings = Traject::Indexer::Settings.new settings
+     @input_stream = input_stream
+
+     ensure_marc4j_loaded!
+   end
+
+   # Loads Marc4J if not already loaded, by loading all jars found
+   # in settings["marc4j_reader.jar_dir"]
+   def ensure_marc4j_loaded!
+     unless defined?(MarcPermissiveStreamReader)
+       require 'java'
+
+       tries = 0
+       begin
+         tries += 1
+         java_import org.marc4j.MarcPermissiveStreamReader
+         java_import org.marc4j.MarcXmlReader
+       rescue NameError => e
+         include_jar_dir = File.expand_path("../../vendor/marc4j/lib", File.dirname(__FILE__))
+
+         jardir = settings["marc4j_reader.jar_dir"] || include_jar_dir
+         Dir.glob("#{jardir}/*.jar") do |x|
+           require x
+         end
+
+         if tries > 1
+           raise LoadError.new("Cannot find Marc4J java classes")
+         else
+           retry
+         end
+       end
+     end
+   end
+
+   def internal_reader
+     @internal_reader ||= create_marc_reader!
+   end
+
+   def input_type
+     # maybe later add some guessing somehow
+     settings["marc_source.type"]
+   end
+
+   def create_marc_reader!
+     case input_type
+     when "binary"
+       permissive = settings["marc4j_reader.permissive"].to_s == "true"
+
+       # #to_inputstream turns our ruby IO into a Java InputStream
+       # third arg means 'convert to UTF-8, yes'
+       MarcPermissiveStreamReader.new(input_stream.to_inputstream, permissive, true, settings["marc4j_reader.source_encoding"])
+     when "xml"
+       MarcXmlReader.new(input_stream.to_inputstream)
+     else
+       raise ArgumentError.new("Unrecognized marc_source.type: #{input_type}")
+     end
+   end
+
+   def each
+     while (internal_reader.hasNext)
+       begin
+         marc4j   = internal_reader.next
+         rubymarc = convert_marc4j_to_rubymarc(marc4j)
+       rescue Exception => e
+         msg = "MARC4JReader: Error reading MARC, fatal, re-raising"
+         if marc4j
+           msg += "\n    001 id: #{marc4j.getControlNumber}"
+         end
+         msg += "\n    #{Traject::Util.exception_to_log_message(e)}"
+         logger.fatal msg
+         raise e
+       end
+
+       yield rubymarc
+     end
+   end
+
+   def logger
+     @logger ||= (settings[:logger] || Yell.new(STDERR, :level => "gt.fatal")) # null logger
+   end
+
+   def convert_marc4j_to_rubymarc(marc4j)
+     rmarc = MARC::Record.new
+     rmarc.leader = marc4j.getLeader.marshal
+
+     marc4j.getControlFields.each do |marc4j_control|
+       rmarc.append( MARC::ControlField.new(marc4j_control.getTag(), marc4j_control.getData) )
+     end
+
+     marc4j.getDataFields.each do |marc4j_data|
+       rdata = MARC::DataField.new( marc4j_data.getTag, marc4j_data.getIndicator1.chr, marc4j_data.getIndicator2.chr )
+
+       marc4j_data.getSubfields.each do |subfield|
+         # We assume Marc21; skip corrupted data.
+         # If subfield.getCode is more than 255, the subsequent .chr
+         # would raise.
+         if subfield.getCode > 255
+           logger.warn("Marc4JReader: Corrupted MARC data, record id #{marc4j.getControlNumber}, field #{marc4j_data.tag}, corrupt subfield code byte #{subfield.getCode}. Skipping subfield, but continuing with record.")
+           next
+         end
+
+         rsubfield = MARC::Subfield.new(subfield.getCode.chr, subfield.getData)
+         rdata.append rsubfield
+       end
+
+       rmarc.append rdata
+     end
+
+     return rmarc
+   end
+
+ end
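
Note: to put the new reader's settings in context, here is a minimal sketch of a traject configuration file that selects Marc4JReader for MARC8 binary input. It assumes the reader_class_name setting documented in doc/settings.md; the field mapping is hypothetical.

    # config.rb -- illustrative sketch only
    require 'traject'

    settings do
      provide "reader_class_name", "Traject::Marc4JReader"
      provide "marc_source.type", "binary"
      provide "marc4j_reader.permissive", "true"
      # Leader encoding bytes in the wild are unreliable, so be explicit:
      provide "marc4j_reader.source_encoding", "MARC8"
    end

    to_field "id", extract_marc("001", :first => true)

Records come out as ruby MARC::Record objects, already transcoded to UTF-8.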

data/lib/traject/marc_extractor.rb
@@ -26,11 +26,7 @@ module Traject
     # Third arg is an optional options hash that will be passed as
     # third arg of MarcExtractor constructor.
     def self.extract_by_spec(marc_record, specification, options = {})
-      (raise IllegalArgument, "first argument must not be nil") if marc_record.nil?
-
-      unless specification.kind_of? Hash
-        specification = self.parse_string_spec(specification)
-      end
+      (raise ArgumentError, "first argument must not be nil") if marc_record.nil?

       Traject::MarcExtractor.new(marc_record, specification, options).extract
     end
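
Note: with the hash coercion moved into the constructor (see the following hunks), extract_by_spec now accepts either form directly. A quick sketch, with a hypothetical `record`:

    # String spec, parsed for you:
    values = Traject::MarcExtractor.extract_by_spec(record, "245abc:700a")

    # Or pre-parse once and reuse the hash across many records:
    spec   = Traject::MarcExtractor.parse_string_spec("245abc:700a")
    values = Traject::MarcExtractor.extract_by_spec(record, spec)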
@@ -38,6 +34,10 @@ module Traject
     # Take a hash that's the output of #parse_string_spec, return
     # an array of strings extracted from a marc record accordingly
     #
+    # Second arg can either be a string specification that will be passed
+    # to MarcExtractor.parse_string_spec, or a Hash that's
+    # already been created by it.
+    #
     # options:
     #
     # [:seperator] default ' ' (space), what to use to separate
@@ -47,16 +47,15 @@ module Traject
     # that match spec. Also:
     #  * false => do not include.
     #  * :only => only include linked 880s, not original
-    def initialize(marc_record, spec_hash, options = {})
+    def initialize(marc_record, spec, options = {})
       self.options = {
         :seperator => ' ',
         :alternate_script => :include
       }.merge(options)

-      raise IllegalArgumentException("second arg to MarcExtractor.new must be a Hash specification object") unless spec_hash.kind_of? Hash
-
       self.marc_record = marc_record
-      self.spec_hash = spec_hash
+
+      self.spec_hash = spec.kind_of?(Hash) ? spec : self.class.parse_string_spec(spec)
     end

     # Converts from a string marc spec like "245abc:700a" to a nested hash used internally
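
Note: a sketch of the loosened constructor, using the option names documented above (record and values hypothetical):

    extractor = Traject::MarcExtractor.new(record, "245abc:700a",
      :seperator        => "; ",     # note traject's spelling of this option key
      :alternate_script => :include)
    extractor.extract   # => array of strings, possibly empty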
@@ -129,7 +128,7 @@ module Traject
     end


-    # Returns array of strings, extracted values
+    # Returns array of strings, extracted values. May be an empty array.
     def extract
       results = []

@@ -145,26 +144,46 @@ module Traject
     end

     # Yields a block for every line in source record that matches
-    # spec. First arg to block is MARC::Field (control or data), second
+    # spec. First arg to block is MARC::DataField or ControlField, second
     # is the hash specification that it matched on. May take account
     # of options such as :alternate_script
+    #
+    # Third (optional) arg to block is self, the MarcExtractor object, useful for custom
+    # implementations.
     def each_matching_line
       self.marc_record.each do |field|
         if (spec = spec_covering_field(field)) && matches_indicators(field, spec)
-          yield(field, spec)
+          yield(field, spec, self)
         end
       end
     end

+    # Like each_matching_line, takes a block to process each matching line,
+    # but collects results of block into an array -- flattens any subarrays for you!
+    #
+    # Useful for re-use of this class for custom processing
+    def collect_matching_lines
+      results = []
+      self.each_matching_line do |field, spec, extractor|
+        results.concat [yield(field, spec, extractor)].flatten
+      end
+      return results
+    end
+
+
     # Pass in a marc data field and a hash spec, returns
     # an ARRAY of one or more strings, subfields extracted
     # and processed per spec. Takes account of options such
     # as :seperator
+    #
+    # Always returns an array, sometimes an empty array.
     def collect_subfields(field, spec)
       subfields = field.subfields.collect do |subfield|
         subfield.value if spec[:subfields].nil? || spec[:subfields].include?(subfield.code)
       end.compact

+      return subfields if subfields.empty? # empty array, just return it.
+
       return options[:seperator] ? [ subfields.join( options[:seperator]) ] : subfields
     end

@@ -175,13 +194,10 @@ module Traject
     # otherwise will always return nil for 880s, you have to handle :alternate_script :include
     # elsewhere, to add in the 880 in the right order
     def spec_covering_field(field)
-      #require 'pry'
-      #binding.pry if field.tag == "880"
-
-      if field.tag == "880" && options[:alternate_script] != false
+      if field.tag == "880" && field['6'] && options[:alternate_script] != false
         # pull out the spec for the original marc tag this 880 corresponds to
         # Due to bug in jruby https://github.com/jruby/jruby/issues/886 , we need
-        # to do this weird encode gymnastics, which fixes it for mysterious reasons.
+        # to do this weird encode gymnastics, which fixes it for mysterious reasons.
         orig_field = field["6"].encode(field["6"].encoding).byteslice(0,3)
         field["6"] && self.spec_hash[ orig_field ]
       elsif options[:alternate_script] != :only
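
Note: to make the 880 guard concrete: a linked 880 carries its original tag in the first three bytes of subfield $6, and that is the key looked up in spec_hash. The new field['6'] check skips malformed 880s lacking a $6 entirely (see the new 880_with_no_6.utf8.marc fixture in the file list above). A sketch of the relevant shape:

    field = MARC::DataField.new('880', '1', '0',
      MARC::Subfield.new('6', '245-01'),
      MARC::Subfield.new('a', 'Title in original script'))
    field['6'].byteslice(0, 3)   # => "245", the spec key for this 880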

data/lib/traject/marc_reader.rb
@@ -1,31 +1,34 @@
 require 'marc'

 # A Reader class that can be used with Traject::Indexer.reader, to read
-# MARC records.
+# MARC records.
 #
-# Includes Enumerable for convenience.
+# Includes Enumerable for convenience.
 #
 # Reads in Marc records using ruby marc. Depends on config variables to
 # determine what serialization type to expect, and other parameters controlling
-# de-serialization.
+# de-serialization.
+#
+# NOTE: MarcReader can not handle Marc8 encoding. If you need to read binary
+# records in MARC8, use Traject::Marc4JReader instead.
 #
 # Settings:
 # ["marc_source.type"]  serialization type. default 'binary'
-#   * "binary". Actual marc.
+#   * "binary". Actual marc.
 #   * "xml", MarcXML
 #   * "json". (NOT YET IMPLEMENTED) The "marc-in-json" format, encoded as newline-separated
 #     json. A simplistic newline-separated json, with no comments
 #     allowed, and no unescaped internal newlines allowed in the json
 #     objects -- we just read line by line, and assume each line is a
 #     marc-in-json. http://dilettantes.code4lib.org/blog/2010/09/a-proposal-to-serialize-marc-in-json/
-# ["marc_source.xml_parser"] For XML type, which XML parser to tell Marc::Reader
+# ["marc_reader.xml_parser"] For XML type, which XML parser to tell Marc::Reader
 #                            to use. Anything recognized by Marc::Reader's :parser
 #                            argument. By default, asks Marc::Reader to take
-#                            its best guess as to the highest-performance available
-#                            installed option.
+#                            its best guess as to the highest-performance available
+#                            installed option.
 #
 #
-# Can NOT yet read Marc8, input is always assumed UTF8.
+# Can NOT yet read Marc8, input is always assumed UTF8.
 class Traject::MarcReader
   include Enumerable
@@ -34,18 +37,18 @@ class Traject::MarcReader
   @@best_xml_parser = MARC::XMLReader.best_available

   def initialize(input_stream, settings)
-    @settings = settings
+    @settings = Traject::Indexer::Settings.new settings
     @input_stream = input_stream
   end

   # Creates the proper kind of ruby MARC reader, depending
   # on settings or guesses.
   def internal_reader
-    unless defined? @internal_reader
-      @internal_reader =
+    unless defined? @internal_reader
+      @internal_reader =
         case settings["marc_source.type"]
         when "xml"
-          parser = settings["marc_source.xml_parser"] || @@best_xml_parser
+          parser = settings["marc_reader.xml_parser"] || @@best_xml_parser
           MARC::XMLReader.new(self.input_stream, :parser => parser)
         else
           MARC::Reader.new(self.input_stream)

data/lib/traject/solrj_writer.rb
@@ -1,10 +1,40 @@
+ # TODO: THREAD POOL
+ #
+ # 1) Exception handling in threads: what's the right thing to do?
+ # 2) General count of failed records in a thread-safe way, so we can report
+ #    it back from 'close', so the process can report it back, and a non-zero exit
+ #    code can be emitted from the command line.
+ # 3) Back pressure on the thread pool. Give it a bounded blocking queue instead,
+ #    to make sure thousands of add tasks don't build up, waiting until the end.
+ #    Or does that even matter? So what if they build up in the queue and only
+ #    get taken care of at the end, is that okay? I do emit a warning right now
+ #    if it takes more than 60 seconds to process the remaining thread pool task queue
+ #    at the end.
+ # 4) No tests yet that actually test thread pool stuff; additionally, may make
+ #    some of the batch tests fail in non-deterministic ways, since batch tests
+ #    assume order of add (and our Mock solr server is not thread safe yet!)
+
+ require 'yell'
+
  require 'traject'
+ require 'traject/util'
  require 'traject/qualified_const_get'
+ require 'traject/thread_pool'
+
+ require 'uri'
+ require 'thread' # for Mutex

  #
  # Writes to Solr using SolrJ, and the SolrJ HttpSolrServer.
  # (sub-class later for the ConcurrentUpdate server?)
  #
+ # After you call #close, you can check #skipped_record_count if you want
+ # an integer count of skipped records.
+ #
+ # For fatal errors that raise... async processing with thread_pool means that
+ # you may not get a raise immediately after calling #put, you may get it on
+ # a FUTURE #put or #close. You should get it eventually though.
+ #
  # settings:
  # [solr.url] Your solr url (required)
  # [solrj_writer.server_class_name] Defaults to "HttpSolrServer". You can specify
@@ -27,18 +57,57 @@ require 'traject/qualified_const_get'
  #                                  "XMLResponseParser"
  # [solrj_writer.commit_on_close]  If true (or string 'true'), send a commit to solr
  #                                  at end of #process.
+ # [solrj_writer.batch_size]       If non-nil and more than 1, send documents to
+ #                                  solr in batches of solrj_writer.batch_size. If nil/1,
+ #                                  however, an http transaction with solr will be done
+ #                                  per doc. DEFAULTS to 100, which seems to be a sweet spot.
+ # [solrj_writer.thread_pool]      Defaults to 4. A thread pool is used for submitting docs
+ #                                  to solr. Set to 0 or nil to disable threading. Even set to 1,
+ #                                  there will still be a single bg thread doing the adds.
+ #                                  May make sense to set higher than the number of cores on your
+ #                                  indexing machine, as these threads will mostly be waiting
+ #                                  on Solr. Speed/capacity of your solr is more relevant.
  class Traject::SolrJWriter
+   # just a tuple of a SolrInputDocument
+   # and the Traject::Indexer::Context it came from
+   class UpdatePackage
+     attr_accessor :solr_document, :context
+     def initialize(doc, ctx)
+       self.solr_document = doc
+       self.context = ctx
+     end
+   end
+
    include Traject::QualifiedConstGet

    attr_reader :settings

+   attr_reader :batched_queue
+
    def initialize(argSettings)
-     @settings = argSettings
+     @settings = Traject::Indexer::Settings.new(argSettings)
      settings_check!(settings)

      ensure_solrj_loaded!

      solr_server # init
+
+     @batched_queue = java.util.concurrent.LinkedBlockingQueue.new
+
+     # When multi-threaded, exceptions raised in threads are held here.
+     # We need a HIGH performance queue here to try and avoid slowing things down,
+     # since we need to check it frequently.
+     @async_exception_queue = java.util.concurrent.ConcurrentLinkedQueue.new
+
+     # Store the error count in an AtomicInteger, so multiple threads can increment
+     # it safely, if we're threaded.
+     @skipped_record_incrementer = java.util.concurrent.atomic.AtomicInteger.new(0)
+
+     # If our thread pool setting is 0, it'll just create a null threadpool that
+     # executes in the calling context.
+     @thread_pool = Traject::ThreadPool.new( @settings["solrj_writer.thread_pool"].to_i )
+
+     @debug_ascii_progress = (@settings["debug_ascii_progress"].to_s == "true")
    end

    # Loads solrj if not already loaded. By loading all jars found
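
Note: pulling the new writer settings together, a hypothetical configuration sketch (the solr URL is a placeholder):

    writer = Traject::SolrJWriter.new(
      "solr.url"                     => "http://localhost:8983/solr",
      "solrj_writer.batch_size"      => 100,   # one http add per 100 docs
      "solrj_writer.thread_pool"     => 4,     # bg threads submitting to solr
      "solrj_writer.commit_on_close" => "true"
    )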
@@ -68,29 +137,193 @@ class Traject::SolrJWriter
        end
      end
    end
+
+    # And for now, SILENCE SolrJ logging
+    org.apache.log4j.Logger.getRootLogger().addAppender(org.apache.log4j.varia.NullAppender.new)
+  end
+
+  # Method IS thread-safe, can be called concurrently by multiple threads.
+  #
+  # Why? If not using batched add, we just use the SolrServer, which is already
+  # thread-safe itself.
+  #
+  # If we are using batch add, we surround all access to our shared-state batch queue
+  # in a mutex -- just a naive implementation. May be able to improve performance
+  # with a more sophisticated java.util.concurrent data structure (blocking queue etc.)
+  # I did try a java ArrayBlockingQueue or LinkedBlockingQueue instead of our own
+  # mutex -- I did not see consistently different performance. May want to
+  # change so it doesn't use a mutex at all if multiple mapping threads aren't being
+  # used.
+  #
+  # This class does not at present use any threads itself; all work will be done
+  # in the calling thread, including actual http transactions to solr via the solrj SolrServer.
+  # If using batches, then not every #put is an http transaction, but when it is,
+  # it's in the calling thread, synchronously.
+  def put(context)
+    @thread_pool.raise_collected_exception!
+
+    # package the SolrInputDocument along with the context, so we have
+    # the context for error reporting when we actually add.
+
+    package = UpdatePackage.new(hash_to_solr_document(context.output_hash), context)
+
+    if settings["solrj_writer.batch_size"].to_i > 1
+      ready_batch = []
+
+      # Synchronize access to our shared batched_queue state,
+      # but once we've pulled out what we want in local var
+      # `ready_batch`, we don't need to synchronize anymore.
+      batched_queue.add(package)
+      if batched_queue.size >= settings["solrj_writer.batch_size"].to_i
+        batched_queue.drain_to(ready_batch)
+      end
+
+      if ready_batch.length > 0
+        if @debug_ascii_progress
+          $stderr.write("^")
+          if @thread_pool.queue && (@thread_pool.queue.size >= @thread_pool.queue_capacity)
+            $stderr.write "!"
+          end
+        end
+
+        @thread_pool.maybe_in_thread_pool { batch_add_document_packages(ready_batch) }
+      end
+    else # non-batched add, add one at a time.
+      @thread_pool.maybe_in_thread_pool { add_one_document_package(package) }
+    end
  end

-  def put(hash)
+  def hash_to_solr_document(hash)
    doc = SolrInputDocument.new
-
    hash.each_pair do |key, value_array|
      value_array.each do |value|
        doc.addField( key, value )
      end
    end
+    return doc
+  end

-    # TODO: Buffer docs internally, add in arrays, one http
-    # transaction per array. Is what solrj wiki recommends.
-    solr_server.add(doc)
+  # Takes an array and batch adds it to solr -- an array of UpdatePackage tuples of
+  # SolrInputDocument and context.
+  #
+  # Catches errors in batch add, logs, and re-tries the docs individually.
+  #
+  # Is thread-safe, because SolrServer is thread-safe, and we aren't
+  # referencing any other shared state. Important that the CALLER passes
+  # in a doc array that is not shared state, extracting it from
+  # shared-state batched_queue in a mutex.
+  def batch_add_document_packages(current_batch)
+    begin
+      a = current_batch.collect {|package| package.solr_document }
+      solr_server.add( a )
+
+      $stderr.write "%" if @debug_ascii_progress
+    rescue Exception => e
+      # Error in batch, none of the docs got added; let's try to re-add
+      # them all individually, so those that CAN get added get added, and those
+      # that can't get individually logged.
+      logger.warn "Error encountered in batch solr add, will re-try documents individually, at a performance penalty...\n" + Traject::Util.exception_to_log_message(e)
+      current_batch.each do |package|
+        add_one_document_package(package)
+      end
+    end
+  end
+
+
+  # Adds a single SolrInputDocument passed in as an UpdatePackage combo of SolrInputDocument
+  # and context.
+  #
+  # Rescues exceptions thrown by SolrServer.add, logs them, and then raises them
+  # again if deemed fatal and indexing should stop. Only intended to be used on a SINGLE
+  # document add. If we get an exception on a multi-doc batch add, we need to recover
+  # differently.
+  def add_one_document_package(package)
+    begin
+      solr_server.add(package.solr_document)
+    # Honestly not sure what the difference is between these two types, but SolrJ raises both
+    rescue org.apache.solr.common.SolrException, org.apache.solr.client.solrj.SolrServerException => e
+      id     = package.context.source_record && package.context.source_record['001'] && package.context.source_record['001'].value
+      id_str = id ? "001:#{id}" : ""
+
+      position     = package.context.position
+      position_str = position ? "at file position #{position} (starting at 1)" : ""
+
+      logger.error("Could not index record #{id_str} #{position_str}\n" + Traject::Util.exception_to_log_message(e) )
+      logger.debug(package.context.source_record.to_s)
+
+      @skipped_record_incrementer.getAndIncrement() # AtomicInteger, thread-safe increment.
+
+      if fatal_exception? e
+        logger.fatal("SolrJ exception judged fatal, raising...")
+        raise e
+      end
+    end
+  end
+
+  def logger
+    settings["logger"] ||= Yell.new(STDERR, :level => "gt.fatal") # null logger
+  end
+
+  # If an exception is encountered talking to Solr, is it one we should
+  # entirely give up on? SolrJ doesn't use a useful exception class hierarchy;
+  # we have to look into its details and guess.
+  def fatal_exception?(e)
+    root_cause = e.respond_to?(:getRootCause) && e.getRootCause
+
+    # Various kinds of inability to actually talk to the
+    # server look like this:
+    if root_cause.kind_of? java.io.IOException
+      return true
+    end
+
+    return false
  end

  def close
-    solr_server.commit if settings["solrj_writer.commit_on_close"].to_s == "true"
+    @thread_pool.raise_collected_exception!
+
+    # Any leftovers in the batch buffer? Send 'em to the thread pool too.
+    if batched_queue.length > 0
+      packages = []
+      batched_queue.drain_to(packages)
+
+      # We do it in the thread pool for consistency, and so
+      # it goes to the end of the queue behind any outstanding
+      # work in the pool.
+      @thread_pool.maybe_in_thread_pool { batch_add_document_packages( packages ) }
+    end
+
+    # Wait for shutdown, and time it.
+    logger.debug "SolrJWriter: Shutting down thread pool, waiting if needed..."
+    elapsed = @thread_pool.shutdown_and_wait
+    if elapsed > 60
+      logger.warn "Waited #{elapsed} seconds for all SolrJWriter threads, you may want to increase solrj_writer.thread_pool (currently #{@settings["solrj_writer.thread_pool"]})"
+    end
+    logger.debug "SolrJWriter: Thread pool shutdown complete"
+    logger.warn "SolrJWriter: #{skipped_record_count} skipped records" if skipped_record_count > 0
+
+    # Check again now that we've waited; there could still be some
+    # that didn't show up before.
+    @thread_pool.raise_collected_exception!
+
+    if settings["solrj_writer.commit_on_close"].to_s == "true"
+      logger.info "SolrJWriter: Sending commit to solr..."
+      solr_server.commit
+    end

    solr_server.shutdown
    @solr_server = nil
  end

+  # Return a count of skipped records encountered. Most accurate to call
+  # it after #close, in which case it should include the full count, even
+  # under an async thread_pool.
+  def skipped_record_count
+    @skipped_record_incrementer.get
+  end
+

  def solr_server
    @solr_server ||= instantiate_solr_server!
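
Note: given the async error behavior described above, the put/close lifecycle looks roughly like this (the contexts would come from Traject::Indexer in real use):

    contexts.each { |context| writer.put(context) }  # may raise an error collected earlier
    writer.close   # drains the batch queue, waits on the pool, maybe commits
    if writer.skipped_record_count > 0
      warn "#{writer.skipped_record_count} records skipped"
    end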
@@ -104,7 +337,8 @@ class Traject::SolrJWriter
    server = server_class.new( settings["solr.url"].to_s );

    if parser_name = settings["solrj_writer.parser_class_name"]
-      parser = org.apache.solr.client.solrj.impl.const_get(parser_name).new
+      #parser = org.apache.solr.client.solrj.impl.const_get(parser_name).new
+      parser = Java::JavaClass.for_name("org.apache.solr.client.solrj.impl.#{parser_name}").ruby_class.new
      server.setParser( parser )
    end

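
Note: the switch to Java::JavaClass.for_name works around JRuby package objects not supporting const_get the way Ruby modules do. The runtime lookup, sketched here with the XMLResponseParser value mentioned in the settings docs:

    klass  = Java::JavaClass.for_name(
      "org.apache.solr.client.solrj.impl.XMLResponseParser").ruby_class
    parser = klass.new   # equivalent to java_import plus XMLResponseParser.new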
@@ -115,6 +349,10 @@ class Traject::SolrJWriter
    unless settings.has_key?("solr.url") && ! settings["solr.url"].nil?
      raise ArgumentError.new("SolrJWriter requires a 'solr.url' solr url in settings")
    end
+
+    unless settings["solr.url"] =~ /^#{URI::regexp}$/
+      raise ArgumentError.new("SolrJWriter requires a 'solr.url' setting that looks like a URL, not: `#{settings['solr.url']}`")
+    end
  end

- end
+ end