traject 0.0.2 → 0.9.1
This diff shows the content of publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- data/Gemfile +4 -0
- data/README.md +85 -61
- data/Rakefile +5 -0
- data/bin/traject +31 -3
- data/doc/settings.md +74 -13
- data/lib/tasks/load_maps.rake +48 -0
- data/lib/traject/indexer/settings.rb +75 -0
- data/lib/traject/indexer.rb +255 -45
- data/lib/traject/json_writer.rb +4 -2
- data/lib/traject/macros/marc21.rb +18 -6
- data/lib/traject/macros/marc21_semantics.rb +405 -0
- data/lib/traject/macros/marc_format_classifier.rb +180 -0
- data/lib/traject/marc4j_reader.rb +160 -0
- data/lib/traject/marc_extractor.rb +33 -17
- data/lib/traject/marc_reader.rb +14 -11
- data/lib/traject/solrj_writer.rb +247 -9
- data/lib/traject/thread_pool.rb +154 -0
- data/lib/traject/translation_map.rb +46 -4
- data/lib/traject/util.rb +30 -0
- data/lib/traject/version.rb +1 -1
- data/lib/translation_maps/lcc_top_level.yaml +26 -0
- data/lib/translation_maps/marc_genre_007.yaml +9 -0
- data/lib/translation_maps/marc_genre_leader.yaml +22 -0
- data/lib/translation_maps/marc_geographic.yaml +589 -0
- data/lib/translation_maps/marc_instruments.yaml +102 -0
- data/lib/translation_maps/marc_languages.yaml +490 -0
- data/test/indexer/each_record_test.rb +34 -0
- data/test/indexer/macros_marc21_semantics_test.rb +206 -0
- data/test/indexer/macros_marc21_test.rb +10 -1
- data/test/indexer/map_record_test.rb +78 -8
- data/test/indexer/read_write_test.rb +43 -10
- data/test/indexer/settings_test.rb +60 -4
- data/test/indexer/to_field_test.rb +39 -0
- data/test/marc4j_reader_test.rb +75 -0
- data/test/marc_extractor_test.rb +62 -0
- data/test/marc_format_classifier_test.rb +91 -0
- data/test/marc_reader_test.rb +12 -0
- data/test/solrj_writer_test.rb +146 -43
- data/test/test_helper.rb +50 -0
- data/test/test_support/245_no_ab.marc +1 -0
- data/test/test_support/880_with_no_6.utf8.marc +1 -0
- data/test/test_support/bad_subfield_code.marc +1 -0
- data/test/test_support/date_resort_to_260.marc +1 -0
- data/test/test_support/date_type_r_missing_date2.marc +1 -0
- data/test/test_support/date_with_u.marc +1 -0
- data/test/test_support/demo_config.rb +153 -0
- data/test/test_support/emptyish_record.marc +1 -0
- data/test/test_support/louis_armstrong.marc +1 -0
- data/test/test_support/manuscript_online_thesis.marc +1 -0
- data/test/test_support/microform_online_conference.marc +1 -0
- data/test/test_support/multi_era.marc +1 -0
- data/test/test_support/multi_geo.marc +1 -0
- data/test/test_support/musical_cage.marc +1 -0
- data/test/test_support/one-marc8.mrc +1 -0
- data/test/test_support/online_only.marc +1 -0
- data/test/test_support/packed_041a_lang.marc +1 -0
- data/test/test_support/the_business_ren.marc +1 -0
- data/test/translation_map_test.rb +8 -0
- data/test/translation_maps/properties_map.properties +5 -0
- data/traject.gemspec +1 -1
- data/vendor/marc4j/README.md +17 -0
- data/vendor/marc4j/lib/marc4j-2.5.1-beta.jar +0 -0
- metadata +81 -2
data/lib/traject/marc4j_reader.rb ADDED
@@ -0,0 +1,160 @@
+require 'traject'
+require 'marc'
+
+# Uses Marc4J to read the marc records, but then translates them to
+# ruby-marc before delivering them still, Marc4J is just inside the black
+# box.
+#
+# But one way to get ability to transcode from Marc8. Records it delivers
+# are ALWAYS in UTF8, will be transcoded if needed.
+#
+# Also hope it gives us some performance benefit.
+#
+# Uses the Marc4J MarcPermissiveStreamReader for binary, but sometimes
+# in non-permissive mode, according to settings. Uses the Marc4j MarcXmlReader
+# for xml.
+#
+# NOTE: If you aren't reading in binary records encoded in MARC8, you may
+# find the pure-ruby Traject::MarcReader faster; the extra step to read
+# Marc4J but translate to ruby MARC::Record adds some overhead.
+#
+# Settings:
+#
+# * marc_source.type: serialization type. default 'binary', also 'xml' (TODO: json/marc-in-json)
+#
+# * marc4j_reader.permissive: default true, false to turn off permissive reading. Used as
+#   value to 'permissive' arg of MarcPermissiveStreamReader constructor.
+#   Only used for 'binary'
+#
+# * marc4j_reader.source_encoding: Only used for 'binary', otherwise always UTF-8.
+#   String of the values MarcPermissiveStreamReader accepts:
+#   * BESTGUESS (tries to use MARC leader and believe it, I think)
+#   * ISO8859_1
+#   * UTF-8
+#   * MARC8
+#   Default 'BESTGUESS', but marc records in the wild are so wrong here, recommend setting.
+#   (will ALWAYS be transcoded to UTF-8 on the way out. We insist.)
+#
+# * marc4j_reader.jar_dir: Path to a directory containing Marc4J jar file to use. All .jar's in dir will
+#   be loaded. If unset, uses marc4j.jar bundled with traject.
+class Traject::Marc4JReader
+  include Enumerable
+
+  attr_reader :settings, :input_stream
+
+  def initialize(input_stream, settings)
+    @settings = Traject::Indexer::Settings.new settings
+    @input_stream = input_stream
+
+    ensure_marc4j_loaded!
+  end
+
+  # Loads Marc4J if not already loaded, by loading all jars found
+  # in settings["marc4j_reader.jar_dir"]
+  def ensure_marc4j_loaded!
+    unless defined?(MarcPermissiveStreamReader)
+      require 'java'
+
+      tries = 0
+      begin
+        tries += 1
+        java_import org.marc4j.MarcPermissiveStreamReader
+        java_import org.marc4j.MarcXmlReader
+      rescue NameError => e
+        # /Users/jrochkind/code/solrj-gem/lib"
+
+        include_jar_dir = File.expand_path("../../vendor/marc4j/lib", File.dirname(__FILE__))
+
+        jardir = settings["marc4j_reader.jar_dir"] || include_jar_dir
+        Dir.glob("#{jardir}/*.jar") do |x|
+          require x
+        end
+
+        if tries > 1
+          raise LoadError.new("Can not find Marc4J java classes")
+        else
+          retry
+        end
+      end
+    end
+  end
+
+  def internal_reader
+    @internal_reader ||= create_marc_reader!
+  end
+
+  def input_type
+    # maybe later add some guessing somehow
+    settings["marc_source.type"]
+  end
+
+  def create_marc_reader!
+    case input_type
+    when "binary"
+      permissive = settings["marc4j_reader.permissive"].to_s == "true"
+
+      # #to_inputstream turns our ruby IO into a Java InputStream
+      # third arg means 'convert to UTF-8, yes'
+      MarcPermissiveStreamReader.new(input_stream.to_inputstream, permissive, true, settings["marc4j_reader.source_encoding"])
+    when "xml"
+      MarcXmlReader.new(input_stream.to_inputstream)
+    else
+      raise ArgumentError.new("Unrecognized marc_source.type: #{input_type}")
+    end
+  end
+
+  def each
+    while (internal_reader.hasNext)
+      begin
+        marc4j = internal_reader.next
+        rubymarc = convert_marc4j_to_rubymarc(marc4j)
+      rescue Exception => e
+        msg = "MARC4JReader: Error reading MARC, fatal, re-raising"
+        if marc4j
+          msg += "\n    001 id: #{marc4j.getControlNumber}"
+        end
+        msg += "\n    #{Traject::Util.exception_to_log_message(e)}"
+        logger.fatal msg
+        raise e
+      end
+
+      yield rubymarc
+    end
+  end
+
+  def logger
+    @logger ||= (settings[:logger] || Yell.new(STDERR, :level => "gt.fatal")) # null logger
+  end
+
+  def convert_marc4j_to_rubymarc(marc4j)
+    rmarc = MARC::Record.new
+    rmarc.leader = marc4j.getLeader.marshal
+
+    marc4j.getControlFields.each do |marc4j_control|
+      rmarc.append( MARC::ControlField.new(marc4j_control.getTag(), marc4j_control.getData ) )
+    end
+
+    marc4j.getDataFields.each do |marc4j_data|
+      rdata = MARC::DataField.new( marc4j_data.getTag, marc4j_data.getIndicator1.chr, marc4j_data.getIndicator2.chr )
+
+      marc4j_data.getSubfields.each do |subfield|
+
+        # We assume Marc21; skip corrupted data:
+        # if subfield.getCode is more than 255, subsequent .chr
+        # would raise.
+        if subfield.getCode > 255
+          logger.warn("Marc4JReader: Corrupted MARC data, record id #{marc4j.getControlNumber}, field #{marc4j_data.tag}, corrupt subfield code byte #{subfield.getCode}. Skipping subfield, but continuing with record.")
+          next
+        end
+
+        rsubfield = MARC::Subfield.new(subfield.getCode.chr, subfield.getData)
+        rdata.append rsubfield
+      end
+
+      rmarc.append rdata
+    end
+
+    return rmarc
+  end
+
+end
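The new reader's whole surface is visible above: an `(input_stream, settings)` constructor and `#each` yielding ruby `MARC::Record`s. A minimal usage sketch, grounded only in what this diff shows; the input file path is a hypothetical placeholder, and it runs only under JRuby since Marc4J is a Java library:

```ruby
# Sketch only: "records.mrc" is a hypothetical input file; requires JRuby
# so the bundled marc4j jar can be loaded.
require 'traject'
require 'traject/marc4j_reader'

settings = {
  "marc_source.type"              => "binary",
  "marc4j_reader.permissive"      => "true",
  # Per the header comment, don't trust BESTGUESS on records in the wild:
  "marc4j_reader.source_encoding" => "MARC8"
}

reader = Traject::Marc4JReader.new(File.open("records.mrc", "rb"), settings)
reader.each do |record|
  # Records arrive as ruby MARC::Record, already transcoded to UTF-8.
  puts record['001'] && record['001'].value
end
```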
data/lib/traject/marc_extractor.rb CHANGED
@@ -26,11 +26,7 @@ module Traject
   # Third arg is an optional options hash that will be passed as
   # third arg of MarcExtractor constructor.
   def self.extract_by_spec(marc_record, specification, options = {})
-    (raise
-
-    unless specification.kind_of? Hash
-      specification = self.parse_string_spec(specification)
-    end
+    (raise ArgumentError, "first argument must not be nil") if marc_record.nil?
 
     Traject::MarcExtractor.new(marc_record, specification, options).extract
   end
@@ -38,6 +34,10 @@ module Traject
   # Take a hash that's the output of #parse_string_spec, return
   # an array of strings extracted from a marc record accordingly
   #
+  # Second arg can either be a string specification that will be passed
+  # to MarcExtractor.parse_string_spec, or a Hash that's
+  # already been created by it.
+  #
   # options:
   #
   # [:seperator] default ' ' (space), what to use to seperate
@@ -47,16 +47,15 @@ module Traject
   #   that match spec. Also:
   #   * false => do not include.
   #   * :only => only include linked 880s, not original
-  def initialize(marc_record,
+  def initialize(marc_record, spec, options = {})
    self.options = {
      :seperator => ' ',
      :alternate_script => :include
    }.merge(options)
 
-    raise IllegalArgumentException("second arg to MarcExtractor.new must be a Hash specification object") unless spec_hash.kind_of? Hash
-
    self.marc_record = marc_record
-
+
+    self.spec_hash = spec.kind_of?(Hash) ? spec : self.class.parse_string_spec(spec)
   end
 
   # Converts from a string marc spec like "245abc:700a" to a nested hash used internally
@@ -129,7 +128,7 @@ module Traject
   end
 
 
-  # Returns array of strings, extracted values
+  # Returns array of strings, extracted values. Maybe empty array.
   def extract
     results = []
 
@@ -145,26 +144,46 @@ module Traject
   end
 
   # Yields a block for every line in source record that matches
-  # spec. First arg to block is MARC::
+  # spec. First arg to block is MARC::DataField or ControlField, second
   # is the hash specification that it matched on. May take account
   # of options such as :alternate_script
+  #
+  # Third (optional) arg to block is self, the MarcExtractor object, useful for custom
+  # implementations.
   def each_matching_line
     self.marc_record.each do |field|
      if (spec = spec_covering_field(field)) && matches_indicators(field, spec)
-        yield(field, spec)
+        yield(field, spec, self)
      end
    end
   end
 
+  # like each_matching_line, takes a block to process each matching line,
+  # but collects results of block into an array -- flattens any subarrays for you!
+  #
+  # Useful for re-use of this class for custom processing
+  def collect_matching_lines
+    results = []
+    self.each_matching_line do |field, spec, extractor|
+      results.concat [yield(field, spec, extractor)].flatten
+    end
+    return results
+  end
+
+
   # Pass in a marc data field and a hash spec, returns
   # an ARRAY of one or more strings, subfields extracted
   # and processed per spec. Takes account of options such
   # as :seperator
+  #
+  # Always returns array, sometimes empty array.
  def collect_subfields(field, spec)
    subfields = field.subfields.collect do |subfield|
      subfield.value if spec[:subfields].nil? || spec[:subfields].include?(subfield.code)
    end.compact
 
+    return subfields if subfields.empty? # empty array, just return it.
+
    return options[:seperator] ? [ subfields.join( options[:seperator]) ] : subfields
  end
 
@@ -175,13 +194,10 @@ module Traject
   # otherwise will always return nil for 880s, you have to handle :alternate_script :include
   # elsewhere, to add in the 880 in the right order
   def spec_covering_field(field)
-
-    #binding.pry if field.tag == "880"
-
-    if field.tag == "880" && options[:alternate_script] != false
+    if field.tag == "880" && field['6'] && options[:alternate_script] != false
      # pull out the spec for corresponding original marc tag this 880 corresponds to
      # Due to bug in jruby https://github.com/jruby/jruby/issues/886 , we need
-      # to do this weird encode gymnastics, which fixes it for mysterious reasons.
+      # to do this weird encode gymnastics, which fixes it for mysterious reasons.
      orig_field = field["6"].encode(field["6"].encoding).byteslice(0,3)
      field["6"] && self.spec_hash[ orig_field ]
    elsif options[:alternate_script] != :only
data/lib/traject/marc_reader.rb CHANGED
@@ -1,31 +1,34 @@
 require 'marc'
 
 # A Reader class that can be used with Traject::Indexer.reader, to read
-# MARC records.
+# MARC records.
 #
-# Includes Enumerable for convenience.
+# Includes Enumerable for convenience.
 #
 # Reads in Marc records using ruby marc. Depends on config variables to
 # determine what serialization type to expect, and other parameters controlling
-# de-serialization.
+# de-serialization.
+#
+# NOTE: MarcReader can not handle Marc8 encoding. If you need to read binary
+# records in MARC8, use Traject::Marc4JReader instead.
 #
 # Settings:
 # ["marc_source.type"]  serialization type. default 'binary'
-#   * "binary". Actual marc.
+#   * "binary". Actual marc.
 #   * "xml", MarcXML
 #   * "json". (NOT YET IMPLEMENTED) The "marc-in-json" format, encoded as newline-seperated
 #     json. A simplistic newline-seperated json, with no comments
 #     allowed, and no unescpaed internal newlines allowed in the json
 #     objects -- we just read line by line, and assume each line is a
 #     marc-in-json. http://dilettantes.code4lib.org/blog/2010/09/a-proposal-to-serialize-marc-in-json/
-# ["
+# ["marc_reader.xml_parser"] For XML type, which XML parser to tell Marc::Reader
 #     to use. Anything recognized by Marc::Reader :parser
 #     argument. By default, asks Marc::Reader to take
 #     it's best guess as to highest performance available
-#     installed option.
+#     installed option.
 #
 #
-# Can NOT yet read Marc8, input is always assumed UTF8.
+# Can NOT yet read Marc8, input is always assumed UTF8.
 class Traject::MarcReader
   include Enumerable
 
@@ -34,18 +37,18 @@ class Traject::MarcReader
   @@best_xml_parser = MARC::XMLReader.best_available
 
   def initialize(input_stream, settings)
-    @settings = settings
+    @settings = Traject::Indexer::Settings.new settings
     @input_stream = input_stream
   end
 
   # Creates proper kind of ruby MARC reader, depending
   # on settings or guesses.
   def internal_reader
-    unless defined? @internal_reader
-      @internal_reader =
+    unless defined? @internal_reader
+      @internal_reader =
       case settings["marc_source.type"]
       when "xml"
-        parser = settings["
+        parser = settings["marc_reader.xml_parser"] || @@best_xml_parser
        MARC::XMLReader.new(self.input_stream, :parser=> parser)
      else
        MARC::Reader.new(self.input_stream)
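As the notes above say, this pure-ruby reader is usually the faster choice when input is already UTF-8. A hedged sketch of reading MarcXML with it; the file path is hypothetical and the parser value is assumed to be one of the options `MARC::XMLReader` recognizes:

```ruby
require 'traject'
require 'traject/marc_reader'

settings = {
  "marc_source.type"       => "xml",
  # Optional; if omitted, the MARC::XMLReader.best_available guess is used:
  "marc_reader.xml_parser" => "nokogiri"
}

reader = Traject::MarcReader.new(File.open("records.xml"), settings)
reader.each { |record| puts record.leader }
```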
data/lib/traject/solrj_writer.rb CHANGED
@@ -1,10 +1,40 @@
+# TODO: THREAD POOL
+#
+# 1) Exception handling in threads, what's the right thing to do
+# 2) General count of failed records in a thread safe way, so we can report
+#    it back from 'close', so process can report it back, and non-zero exit
+#    code can be emitted from command-line.
+# 3) back pressure on thread pool. give it a bounded blocking queue instead,
+#    to make sure thousands of add tasks don't build up, waiting until the end.
+#    or does that even matter? So what if they build up in the queue and only
+#    get taken care of at the end, is that okay? I do emit a warning right now
+#    if it takes more than 60 seconds to process remaining thread pool task queue
+#    at end.
+# 4) No tests yet that actually test thread pool stuff; additionally, may make
+#    some of the batch tests fail in non-deterministic ways, since batch tests
+#    assume order of add (and our Mock solr server is not thread safe yet!)
+
+require 'yell'
+
 require 'traject'
+require 'traject/util'
 require 'traject/qualified_const_get'
+require 'traject/thread_pool'
+
+require 'uri'
+require 'thread' # for Mutex
 
 #
 # Writes to a Solr using SolrJ, and the SolrJ HttpSolrServer.
 # (sub-class later for the ConcurrentUpdate server?)
 #
+# After you call #close, you can check #skipped_record_count if you want
+# for an integer count of skipped records.
+#
+# For fatal errors that raise... async processing with thread_pool means that
+# you may not get a raise immediately after calling #put, you may get it on
+# a FUTURE #put or #close. You should get it eventually though.
+#
 # settings:
 #   [solr.url] Your solr url (required)
 #   [solrj_writer.server_class_name]  Defaults to "HttpSolrServer". You can specify
@@ -27,18 +57,57 @@ require 'traject/qualified_const_get'
 #                                     "XMLResponseParser"
 #   [solrj_writer.commit_on_close]  If true (or string 'true'), send a commit to solr
 #                                   at end of #process.
+#   [solrj_writer.batch_size]       If non-nil and more than 1, send documents to
+#                                   solr in batches of solrj_writer.batch_size. If nil/1,
+#                                   however, an http transaction with solr will be done
+#                                   per doc. DEFAULT to 100, which seems to be a sweet spot.
+#   [solrj_writer.thread_pool]      Defaults to 4. A thread pool is used for submitting docs
+#                                   to solr. Set to 0 or nil to disable threading. Set to 1,
+#                                   there will still be a single bg thread doing the adds.
+#                                   May make sense to set higher than number of cores on your
+#                                   indexing machine, as these threads will mostly be waiting
+#                                   on Solr. Speed/capacity of your solr is more relevant.
 class Traject::SolrJWriter
+  # just a tuple of a SolrInputDocument
+  # and a Traject::Indexer::Context it came from
+  class UpdatePackage
+    attr_accessor :solr_document, :context
+    def initialize(doc, ctx)
+      self.solr_document = doc
+      self.context = ctx
+    end
+  end
+
   include Traject::QualifiedConstGet
 
   attr_reader :settings
 
+  attr_reader :batched_queue
+
   def initialize(argSettings)
-    @settings = argSettings
+    @settings = Traject::Indexer::Settings.new(argSettings)
     settings_check!(settings)
 
     ensure_solrj_loaded!
 
     solr_server # init
+
+    @batched_queue = java.util.concurrent.LinkedBlockingQueue.new
+
+    # when multi-threaded exceptions raised in threads are held here
+    # we need a HIGH performance queue here to try and avoid slowing things down,
+    # since we need to check it frequently.
+    @async_exception_queue = java.util.concurrent.ConcurrentLinkedQueue.new
+
+    # Store error count in an AtomicInteger, so multi threads can increment
+    # it safely, if we're threaded.
+    @skipped_record_incrementer = java.util.concurrent.atomic.AtomicInteger.new(0)
+
+    # if our thread pool settings are 0, it'll just create a null threadpool that
+    # executes in calling context.
+    @thread_pool = Traject::ThreadPool.new( @settings["solrj_writer.thread_pool"].to_i )
+
+    @debug_ascii_progress = (@settings["debug_ascii_progress"].to_s == "true")
   end
 
   # Loads solrj if not already loaded. By loading all jars found
@@ -68,29 +137,193 @@ class Traject::SolrJWriter
         end
       end
     end
+
+    # And for now, SILENCE SolrJ logging
+    org.apache.log4j.Logger.getRootLogger().addAppender(org.apache.log4j.varia.NullAppender.new)
+  end
+
+  # Method IS thread-safe, can be called concurrently by multi-threads.
+  #
+  # Why? If not using batched add, we just use the SolrServer, which is already
+  # thread safe itself.
+  #
+  # If we are using batch add, we surround all access to our shared state batch queue
+  # in a mutex -- just a naive implementation. May be able to improve performance
+  # with more sophisticated java.util.concurrent data structure (blocking queue etc)
+  # I did try a java ArrayBlockingQueue or LinkedBlockingQueue instead of our own
+  # mutex -- I did not see consistently different performance. May want to
+  # change so doesn't use a mutex at all if multiple mapping threads aren't being
+  # used.
+  #
+  # this class does not at present use any threads itself, all work will be done
+  # in the calling thread, including actual http transactions to solr via solrj SolrServer
+  # if using batches, then not every #put is a http transaction, but when it is,
+  # it's in the calling thread, synchronously.
+  def put(context)
+    @thread_pool.raise_collected_exception!
+
+    # package the SolrInputDocument along with the context, so we have
+    # the context for error reporting when we actually add.
+
+    package = UpdatePackage.new(hash_to_solr_document(context.output_hash), context)
+
+    if settings["solrj_writer.batch_size"].to_i > 1
+      ready_batch = []
+
+      # Synchronize access to our shared batched_queue state,
+      # but once we've pulled out what we want in local var
+      # `ready_batch`, don't need to synchronize anymore.
+      batched_queue.add(package)
+      if batched_queue.size >= settings["solrj_writer.batch_size"].to_i
+        batched_queue.drain_to(ready_batch)
+      end
+
+      if ready_batch.length > 0
+        if @debug_ascii_progress
+          $stderr.write("^")
+          if @thread_pool.queue && (@thread_pool.queue.size >= @thread_pool.queue_capacity)
+            $stderr.write "!"
+          end
+        end
+
+        @thread_pool.maybe_in_thread_pool { batch_add_document_packages(ready_batch) }
+      end
+    else # non-batched add, add one at a time.
+      @thread_pool.maybe_in_thread_pool { add_one_document_package(package) }
+    end
   end
 
-  def
+  def hash_to_solr_document(hash)
     doc = SolrInputDocument.new
-
     hash.each_pair do |key, value_array|
       value_array.each do |value|
         doc.addField( key, value )
       end
     end
+    return doc
+  end
 
-
-
-
+  # Takes array and batch adds it to solr -- array of UpdatePackage tuples of
+  # SolrInputDocument and context.
+  #
+  # Catches error in batch add, logs, and re-tries docs individually
+  #
+  # Is thread-safe, because SolrServer is thread-safe, and we aren't
+  # referencing any other shared state. Important that CALLER passes
+  # in a doc array that is not shared state, extracting it from
+  # shared state batched_queue in a mutex.
+  def batch_add_document_packages(current_batch)
+    begin
+      a = current_batch.collect {|package| package.solr_document }
+      solr_server.add( a )
+
+      $stderr.write "%" if @debug_ascii_progress
+    rescue Exception => e
+      # Error in batch, none of the docs got added, let's try to re-add
+      # em all individually, so those that CAN get added get added, and those
+      # that can't get individually logged.
+      logger.warn "Error encountered in batch solr add, will re-try documents individually, at a performance penalty...\n" + Traject::Util.exception_to_log_message(e)
+      current_batch.each do |package|
+        add_one_document_package(package)
+      end
+    end
+  end
+
+
+  # Adds a single SolrInputDocument passed in as an UpdatePackage combo of SolrInputDocument
+  # and context.
+  #
+  # Rescues exceptions thrown by SolrServer.add, logs them, and then raises them
+  # again if deemed fatal and should stop indexing. Only intended to be used on a SINGLE
+  # document add. If we get an exception on a multi-doc batch add, we need to recover
+  # differently.
+  def add_one_document_package(package)
+    begin
+      solr_server.add(package.solr_document)
+    # Honestly not sure what the difference is between those types, but SolrJ raises both
+    rescue org.apache.solr.common.SolrException, org.apache.solr.client.solrj.SolrServerException => e
+      id = package.context.source_record && package.context.source_record['001'] && package.context.source_record['001'].value
+      id_str = id ? "001:#{id}" : ""
+
+      position = package.context.position
+      position_str = position ? "at file position #{position} (starting at 1)" : ""
+
+      logger.error("Could not index record #{id_str} #{position_str}\n" + Traject::Util.exception_to_log_message(e) )
+      logger.debug(package.context.source_record.to_s)
+
+      @skipped_record_incrementer.getAndIncrement() # AtomicInteger, thread-safe increment.
+
+      if fatal_exception? e
+        logger.fatal("SolrJ exception judged fatal, raising...")
+        raise e
+      end
+    end
+  end
+
+  def logger
+    settings["logger"] ||= Yell.new(STDERR, :level => "gt.fatal") # null logger
+  end
+
+  # If an exception is encountered talking to Solr, is it one we should
+  # entirely give up on? SolrJ doesn't use a useful exception class hierarchy,
+  # we have to look into its details and guess.
+  def fatal_exception?(e)
+
+
+    root_cause = e.respond_to?(:getRootCause) && e.getRootCause
+
+    # Various kinds of inability to actually talk to the
+    # server look like this:
+    if root_cause.kind_of? java.io.IOException
+      return true
+    end
+
+    return false
   end
 
   def close
-
+    @thread_pool.raise_collected_exception!
+
+    # Any leftovers in batch buffer? Send em to the threadpool too.
+    if batched_queue.length > 0
+      packages = []
+      batched_queue.drain_to(packages)
+
+      # we do it in the thread pool for consistency, and so
+      # it goes to the end of the queue behind any outstanding
+      # work in the pool.
+      @thread_pool.maybe_in_thread_pool { batch_add_document_packages( packages ) }
+    end
+
+    # Wait for shutdown, and time it.
+    logger.debug "SolrJWriter: Shutting down thread pool, waiting if needed..."
+    elapsed = @thread_pool.shutdown_and_wait
+    if elapsed > 60
+      logger.warn "Waited #{elapsed} seconds for all SolrJWriter threads, you may want to increase solrj_writer.thread_pool (currently #{@settings["solrj_writer.thread_pool"]})"
+    end
+    logger.debug "SolrJWriter: Thread pool shutdown complete"
+    logger.warn "SolrJWriter: #{skipped_record_count} skipped records" if skipped_record_count > 0
+
+    # check again now that we've waited, there could still be some
+    # that didn't show up before.
+    @thread_pool.raise_collected_exception!
+
+    if settings["solrj_writer.commit_on_close"].to_s == "true"
+      logger.info "SolrJWriter: Sending commit to solr..."
+      solr_server.commit
+    end
 
     solr_server.shutdown
     @solr_server = nil
   end
 
+  # Return count of encountered skipped records. Most accurate to call
+  # it after #close, in which case it should include full count, even
+  # under async thread_pool.
+  def skipped_record_count
+    @skipped_record_incrementer.get
+  end
+
 
   def solr_server
     @solr_server ||= instantiate_solr_server!
@@ -104,7 +337,8 @@ class Traject::SolrJWriter
     server = server_class.new( settings["solr.url"].to_s );
 
     if parser_name = settings["solrj_writer.parser_class_name"]
-      parser = org.apache.solr.client.solrj.impl.const_get(parser_name).new
+      #parser = org.apache.solr.client.solrj.impl.const_get(parser_name).new
+      parser = Java::JavaClass.for_name("org.apache.solr.client.solrj.impl.#{parser_name}").ruby_class.new
       server.setParser( parser )
     end
 
@@ -115,6 +349,10 @@ class Traject::SolrJWriter
     unless settings.has_key?("solr.url") && ! settings["solr.url"].nil?
       raise ArgumentError.new("SolrJWriter requires a 'solr.url' solr url in settings")
     end
+
+    unless settings["solr.url"] =~ /^#{URI::regexp}$/
+      raise ArgumentError.new("SolrJWriter requires a 'solr.url' setting that looks like a URL, not: `#{settings['solr.url']}`")
+    end
   end
 
-end
+end
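Pulling the new settings together, here is a hedged standalone sketch (JRuby only; the solr url is a placeholder). Normally `Traject::Indexer#process` drives `#put` and `#close` for you, so the `put` call is left commented:

```ruby
require 'traject'
require 'traject/solrj_writer'

writer = Traject::SolrJWriter.new(
  "solr.url"                     => "http://localhost:8983/solr",  # placeholder
  "solrj_writer.batch_size"      => 100,   # the documented sweet spot
  "solrj_writer.thread_pool"     => 4,     # background add threads
  "solrj_writer.commit_on_close" => "true"
)

# writer.put(context)  # context is a Traject::Indexer::Context per record.
# NOTE: with the thread pool, an exception raised by one put may surface
# on a later #put or on #close, per the comments above.

writer.close
puts "Skipped #{writer.skipped_record_count} records" if writer.skipped_record_count > 0
```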