traject 3.0.0 → 3.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +3 -4
- data/CHANGES.md +65 -0
- data/README.md +9 -4
- data/doc/indexing_rules.md +5 -6
- data/doc/programmatic_use.md +25 -1
- data/doc/settings.md +4 -0
- data/doc/xml.md +12 -0
- data/lib/traject/indexer.rb +40 -4
- data/lib/traject/indexer/context.rb +45 -0
- data/lib/traject/indexer/step.rb +8 -12
- data/lib/traject/line_writer.rb +36 -4
- data/lib/traject/macros/marc21.rb +2 -2
- data/lib/traject/macros/marc21_semantics.rb +15 -12
- data/lib/traject/macros/nokogiri_macros.rb +9 -3
- data/lib/traject/nokogiri_reader.rb +17 -19
- data/lib/traject/oai_pmh_nokogiri_reader.rb +9 -3
- data/lib/traject/solr_json_writer.rb +167 -29
- data/lib/traject/version.rb +1 -1
- data/lib/translation_maps/marc_languages.yaml +77 -48
- data/test/delimited_writer_test.rb +14 -16
- data/test/indexer/class_level_configuration_test.rb +127 -0
- data/test/indexer/context_test.rb +64 -1
- data/test/indexer/error_handler_test.rb +18 -0
- data/test/indexer/macros/macros_marc21_semantics_test.rb +4 -0
- data/test/indexer/nokogiri_indexer_test.rb +35 -0
- data/test/nokogiri_reader_test.rb +66 -3
- data/test/solr_json_writer_test.rb +175 -7
- data/test/test_support/date_resort_to_264.marc +1 -0
- data/traject.gemspec +4 -4
- metadata +37 -16
@@ -42,11 +42,11 @@ module Traject::Macros
|
|
42
42
|
#
|
43
43
|
# * :translation_map => String: translate with named translation map looked up in load
|
44
44
|
# path, uses Traject::TranslationMap.new(translation_map_arg).
|
45
|
-
# **Instead**, use `extract_marc(whatever), translation_map(translation_map_arg)
|
45
|
+
# **Instead**, use `extract_marc(whatever), translation_map(translation_map_arg)`
|
46
46
|
#
|
47
47
|
# * :trim_punctuation => true; trims leading/trailing punctuation using standard algorithms that
|
48
48
|
# have shown themselves useful with Marc, using Marc21.trim_punctuation. **Instead**, use
|
49
|
-
# `extract_marc(whatever), trim_punctuation
|
49
|
+
# `extract_marc(whatever), trim_punctuation`
|
50
50
|
#
|
51
51
|
# * :default => String: if otherwise empty, add default value. **Instead**, use `extract_marc(whatever), default("default value")`
|
52
52
|
#
|
@@ -26,19 +26,19 @@ module Traject::Macros
|
|
26
26
|
accumulator.concat list.uniq if list
|
27
27
|
end
|
28
28
|
end
|
29
|
-
|
29
|
+
|
30
30
|
# If a num begins with a known OCLC prefix, return it without the prefix.
|
31
31
|
# otherwise nil.
|
32
32
|
#
|
33
|
-
# Allow (OCoLC) and/or ocn/ocm/on
|
34
|
-
|
33
|
+
# Allow (OCoLC) and/or ocn/ocm/on
|
34
|
+
|
35
35
|
OCLCPAT = /
|
36
36
|
\A\s*
|
37
37
|
(?:(?:\(OCoLC\)) |
|
38
38
|
(?:\(OCoLC\))?(?:(?:ocm)|(?:ocn)|(?:on))
|
39
39
|
)(\d+)
|
40
40
|
/x
|
41
|
-
|
41
|
+
|
42
42
|
def self.oclcnum_extract(num)
|
43
43
|
if m = OCLCPAT.match(num)
|
44
44
|
return m[1]
|
@@ -364,13 +364,16 @@ module Traject::Macros
|
|
364
364
|
end
|
365
365
|
end
|
366
366
|
end
|
367
|
-
# Okay, nothing from 008, try 260
|
367
|
+
# Okay, nothing from 008, first try 264, then try 260
|
368
368
|
if found_date.nil?
|
369
|
+
v264c = MarcExtractor.cached("264c", :separator => nil).extract(record).first
|
369
370
|
v260c = MarcExtractor.cached("260c", :separator => nil).extract(record).first
|
370
371
|
# just try to take the first four digits out of there, we're not going to try
|
371
372
|
# anything crazy.
|
372
|
-
if m = /(\d{4})/.match(
|
373
|
+
if m = /(\d{4})/.match(v264c)
|
373
374
|
found_date = m[1].to_i
|
375
|
+
elsif m = /(\d{4})/.match(v260c)
|
376
|
+
found_date = m[1].to_i
|
374
377
|
end
|
375
378
|
end
|
376
379
|
|
@@ -519,11 +522,11 @@ module Traject::Macros
|
|
519
522
|
|
520
523
|
# Extracts LCSH-carrying fields, and formatting them
|
521
524
|
# as a pre-coordinated LCSH string, for instance suitable for including
|
522
|
-
# in a facet.
|
525
|
+
# in a facet.
|
523
526
|
#
|
524
527
|
# You can supply your own list of fields as a spec, but for significant
|
525
528
|
# customization you probably just want to write your own method in
|
526
|
-
# terms of the Marc21Semantics.assemble_lcsh method.
|
529
|
+
# terms of the Marc21Semantics.assemble_lcsh method.
|
527
530
|
def marc_lcsh_formatted(options = {})
|
528
531
|
spec = options[:spec] || "600:610:611:630:648:650:651:654:662"
|
529
532
|
subd_separator = options[:subdivison_separator] || " — "
|
@@ -540,17 +543,17 @@ module Traject::Macros
|
|
540
543
|
end
|
541
544
|
|
542
545
|
# Takes a MARC::Field and formats it into a pre-coordinated LCSH string
|
543
|
-
# with subdivision seperators in the right place.
|
546
|
+
# with subdivision separators in the right place.
|
544
547
|
#
|
545
548
|
# For 600 fields especially, need to not just join with subdivision separator
|
546
549
|
# to take account of $a$d$t -- for other fields, might be able to just
|
547
|
-
# join subfields, not sure.
|
550
|
+
# join subfields, not sure.
|
548
551
|
#
|
549
552
|
# WILL strip trailing period from generated string, contrary to some LCSH practice.
|
550
553
|
# Our data is inconsistent on whether it has period or not, this was
|
551
|
-
# the easiest way to standardize.
|
554
|
+
# the easiest way to standardize.
|
552
555
|
#
|
553
|
-
# Default subdivision seperator is em-dash with spaces, set to '--' if you want.
|
556
|
+
# Default subdivision separator is em-dash with spaces, set to '--' if you want.
|
554
557
|
#
|
555
558
|
# Cite: "Dash (-) that precedes a subdivision in an extended 600 subject heading
|
556
559
|
# is not carried in the MARC record. It may be system generated as a display constant
|
@@ -26,9 +26,15 @@ module Traject
|
|
26
26
|
# Make sure to avoid text content that was all blank, which is "between the children"
|
27
27
|
# whitespace.
|
28
28
|
result = result.collect do |n|
|
29
|
-
n.
|
30
|
-
|
31
|
-
|
29
|
+
if n.kind_of?(Nokogiri::XML::Attr)
|
30
|
+
# attribute value
|
31
|
+
n.value
|
32
|
+
else
|
33
|
+
# text from node
|
34
|
+
n.xpath('.//text()').collect(&:text).tap do |arr|
|
35
|
+
arr.reject! { |s| s =~ (/\A\s+\z/) }
|
36
|
+
end.join(" ")
|
37
|
+
end
|
32
38
|
end
|
33
39
|
else
|
34
40
|
# just put all matches in accumulator as Nokogiri::XML::Node's
|
@@ -21,6 +21,9 @@ module Traject
|
|
21
21
|
# If you need to use namespaces here, you need to have them registered with
|
22
22
|
# `nokogiri.default_namespaces`. If your source docs use namespaces, you DO need
|
23
23
|
# to use them in your each_record_xpath.
|
24
|
+
# * nokogiri.strict_mode: if set to `true` or `"true"`, ask Nokogiri to parse in 'strict'
|
25
|
+
# mode, it will raise a `Nokogiri::XML::SyntaxError` if the XML is not well-formed, instead
|
26
|
+
# of trying to take its best-guess correction. https://nokogiri.org/tutorials/ensuring_well_formed_markup.html
|
24
27
|
# * nokogiri_reader.extra_xpath_hooks: Experimental in progress, see below.
|
25
28
|
#
|
26
29
|
# ## nokogiri_reader.extra_xpath_hooks: For handling nodes outside of your each_record_xpath
|
@@ -87,7 +90,11 @@ module Traject
|
|
87
90
|
end
|
88
91
|
|
89
92
|
def each
|
90
|
-
|
93
|
+
config_proc = if settings["nokogiri.strict_mode"]
|
94
|
+
proc { |config| config.strict }
|
95
|
+
end
|
96
|
+
|
97
|
+
whole_input_doc = Nokogiri::XML.parse(input_stream, &config_proc)
|
91
98
|
|
92
99
|
if each_record_xpath
|
93
100
|
whole_input_doc.xpath(each_record_xpath, default_namespaces).each do |matching_node|
|
@@ -118,35 +125,26 @@ module Traject
|
|
118
125
|
private
|
119
126
|
|
120
127
|
|
121
|
-
#
|
128
|
+
# We simply do `new_parent_doc.root = node`
|
122
129
|
# It seemed maybe safer to dup the node as well as remove the original from the original doc,
|
123
130
|
# but I believe this will result in double memory usage, as unlinked nodes aren't GC'd until
|
124
131
|
# their doc is. I am hoping this pattern results in less memory usage.
|
125
132
|
# https://github.com/sparklemotion/nokogiri/issues/1703
|
126
133
|
#
|
127
|
-
#
|
128
|
-
# when re-parenting a node.
|
134
|
+
# We used to have to do something different in Jruby to work around bug:
|
129
135
|
# https://github.com/sparklemotion/nokogiri/issues/1774
|
130
136
|
#
|
131
|
-
#
|
132
|
-
#
|
133
|
-
#
|
134
|
-
#
|
137
|
+
# But as of nokogiri 1.9, that does not work, and is not necessary if we accept
|
138
|
+
# that Jruby nokogiri may put xmlns declarations on different elements than MRI,
|
139
|
+
# although it should be semantically equivalent for a namespace-aware parser.
|
140
|
+
# https://github.com/sparklemotion/nokogiri/issues/1875
|
141
|
+
#
|
142
|
+
# This as a separate method now exists largely as a historical artifact, and for this
|
143
|
+
# documentation.
|
135
144
|
def reparent_node_to_root(new_parent_doc, node)
|
136
|
-
if Traject::Util.is_jruby?
|
137
|
-
original_ns_scopes = node.namespace_scopes
|
138
|
-
end
|
139
145
|
|
140
146
|
new_parent_doc.root = node
|
141
147
|
|
142
|
-
if Traject::Util.is_jruby?
|
143
|
-
original_ns_scopes.each do |ns|
|
144
|
-
if new_parent_doc.at_xpath("//#{ns.prefix}:*", ns.prefix => ns.href)
|
145
|
-
new_parent_doc.root.add_namespace(ns.prefix, ns.href)
|
146
|
-
end
|
147
|
-
end
|
148
|
-
end
|
149
|
-
|
150
148
|
return new_parent_doc
|
151
149
|
end
|
152
150
|
|
@@ -115,9 +115,15 @@ module Traject
|
|
115
115
|
# @returns [HTTP::Client] from http.rb gem
|
116
116
|
def http_client
|
117
117
|
@http_client ||= begin
|
118
|
-
|
119
|
-
|
120
|
-
|
118
|
+
client = nil
|
119
|
+
|
120
|
+
if HTTP::VERSION.split(".").first.to_i > 3
|
121
|
+
client = HTTP.timeout(timeout)
|
122
|
+
else
|
123
|
+
# timeout settings on http.rb 3.x are a bit of a mess.
|
124
|
+
# https://github.com/httprb/http/issues/488
|
125
|
+
client = HTTP.timeout(:global, write: timeout / 3, connect: timeout / 3, read: timeout / 3)
|
126
|
+
end
|
121
127
|
|
122
128
|
if settings["oai_pmh.try_gzip"]
|
123
129
|
client = client.use(:auto_inflate).headers("accept-encoding" => "gzip;q=1.0, identity;q=0.5")
|
@@ -16,7 +16,30 @@ require 'concurrent' # for atomic_fixnum
|
|
16
16
|
# This should work under both MRI and JRuby, with JRuby getting much
|
17
17
|
# better performance due to the threading model.
|
18
18
|
#
|
19
|
-
#
|
19
|
+
# Solr updates are by default sent with no commit params. This will definitely
|
20
|
+
# maximize your performance, and *especially* for bulk/batch indexing is recommended --
|
21
|
+
# use Solr auto commit in your Solr configuration instead, possibly with `commit_on_close`
|
22
|
+
# setting here.
|
23
|
+
#
|
24
|
+
# However, if you want the writer to send `commitWithin=1000`, `commit=true`,
|
25
|
+
# `softCommit=true`, or any other URL parameters valid for Solr update handlers,
|
26
|
+
# you can configure this with `solr_writer.solr_update_args` setting. See:
|
27
|
+
# https://lucene.apache.org/solr/guide/7_0/near-real-time-searching.html#passing-commit-and-commitwithin-parameters-as-part-of-the-url
|
28
|
+
# Eg:
|
29
|
+
#
|
30
|
+
# settings do
|
31
|
+
# provide "solr_writer.solr_update_args", { commitWithin: 1000 }
|
32
|
+
# end
|
33
|
+
#
|
34
|
+
# (That it's a hash makes it infeasible to set/override on command line, if this is
|
35
|
+
# annoying for you let us know)
|
36
|
+
#
|
37
|
+
# `solr_update_args` will apply to batch and individual update requests, but
|
38
|
+
# not to commit sent if `commit_on_close`. You can also instead set
|
39
|
+
# `solr_writer.commit_solr_update_args` for that (or pass in an arg to #commit if calling
|
40
|
+
# manually)
|
41
|
+
#
|
42
|
+
# ## Relevant settings
|
20
43
|
#
|
21
44
|
# * solr.url (optional if solr.update_url is set) The URL to the solr core to index into
|
22
45
|
#
|
@@ -35,19 +58,32 @@ require 'concurrent' # for atomic_fixnum
|
|
35
58
|
#
|
36
59
|
# * solr_writer.skippable_exceptions: List of classes that will be rescued internal to
|
37
60
|
# SolrJsonWriter, and handled with max_skipped logic. Defaults to
|
38
|
-
# `[HTTPClient::TimeoutError, SocketError, Errno::ECONNREFUSED]`
|
61
|
+
# `[HTTPClient::TimeoutError, SocketError, Errno::ECONNREFUSED, Traject::SolrJsonWriter::BadHttpResponse]`
|
62
|
+
#
|
63
|
+
# * solr_writer.solr_update_args: A _hash_ of query params to send to solr update url.
|
64
|
+
# Will be sent with every update request. Eg `{ softCommit: true }` or `{ commitWithin: 1000 }`.
|
65
|
+
# See also `solr_writer.commit_solr_update_args`
|
39
66
|
#
|
40
67
|
# * solr_writer.commit_on_close: Set to true (or "true") if you want to commit at the
|
41
68
|
# end of the indexing run. (Old "solrj_writer.commit_on_close" supported for backwards
|
42
69
|
# compat only.)
|
43
70
|
#
|
71
|
+
# * solr_writer.commit_solr_update_args: A hash of query params to send when committing.
|
72
|
+
# Will be used for automatic `commit_on_close`, as well as any manual calls to #commit.
|
73
|
+
# If set, must include {"commit" => "true"} or { "softCommit" => "true" } if you actually
|
74
|
+
# want commits to happen when SolrJsonWriter tries to commit! But can be used to switch to softCommits
|
75
|
+
# (hard commits default), or specify additional params like optimize etc.
|
76
|
+
#
|
77
|
+
# * solr_writer.http_timeout: Value in seconds, will be set on the httpclient as connect/receive/send
|
78
|
+
# timeout. No way to set them individually at present. Default nil, use HTTPClient defaults
|
79
|
+
# (60 for connect/receive, 120 for send).
|
80
|
+
#
|
44
81
|
# * solr_writer.commit_timeout: If commit_on_close, how long to wait for Solr before
|
45
|
-
# giving up as a timeout. Default 10 minutes. Solr can be slow.
|
82
|
+
# giving up as a timeout (http client receive_timeout). Default 10 minutes. Solr can be slow at commits. Overrides solr_writer.http_timeout
|
46
83
|
#
|
47
84
|
# * solr_json_writer.http_client Mainly intended for testing, set your own HTTPClient
|
48
85
|
# or mock object to be used for HTTP.
|
49
|
-
|
50
|
-
|
86
|
+
#
|
51
87
|
class Traject::SolrJsonWriter
|
52
88
|
include Traject::QualifiedConstGet
|
53
89
|
|
@@ -71,7 +107,21 @@ class Traject::SolrJsonWriter
|
|
71
107
|
@max_skipped = nil
|
72
108
|
end
|
73
109
|
|
74
|
-
@http_client = @settings["solr_json_writer.http_client"]
|
110
|
+
@http_client = if @settings["solr_json_writer.http_client"]
|
111
|
+
@settings["solr_json_writer.http_client"]
|
112
|
+
else
|
113
|
+
client = HTTPClient.new
|
114
|
+
if @settings["solr_writer.http_timeout"]
|
115
|
+
client.connect_timeout = client.receive_timeout = client.send_timeout = @settings["solr_writer.http_timeout"]
|
116
|
+
end
|
117
|
+
|
118
|
+
if @settings["solr_writer.basic_auth_user"] &&
|
119
|
+
@settings["solr_writer.basic_auth_password"]
|
120
|
+
client.set_auth(@settings["solr.url"], @settings["solr_writer.basic_auth_user"], @settings["solr_writer.basic_auth_password"])
|
121
|
+
end
|
122
|
+
|
123
|
+
client
|
124
|
+
end
|
75
125
|
|
76
126
|
@batch_size = (settings["solr_writer.batch_size"] || DEFAULT_BATCH_SIZE).to_i
|
77
127
|
@batch_size = 1 if @batch_size < 1
|
@@ -96,6 +146,9 @@ class Traject::SolrJsonWriter
|
|
96
146
|
# Figure out where to send updates
|
97
147
|
@solr_update_url = self.determine_solr_update_url
|
98
148
|
|
149
|
+
@solr_update_args = settings["solr_writer.solr_update_args"]
|
150
|
+
@commit_solr_update_args = settings["solr_writer.commit_solr_update_args"]
|
151
|
+
|
99
152
|
logger.info(" #{self.class.name} writing to '#{@solr_update_url}' in batches of #{@batch_size} with #{@thread_pool_size} bg threads")
|
100
153
|
end
|
101
154
|
|
@@ -123,14 +176,28 @@ class Traject::SolrJsonWriter
|
|
123
176
|
send_batch( Traject::Util.drain_queue(@batched_queue) )
|
124
177
|
end
|
125
178
|
|
179
|
+
# configured update url, with either settings @solr_update_args or passed in
|
180
|
+
# query_params added to it
|
181
|
+
def solr_update_url_with_query(query_params)
|
182
|
+
if query_params
|
183
|
+
@solr_update_url + '?' + URI.encode_www_form(query_params)
|
184
|
+
else
|
185
|
+
@solr_update_url
|
186
|
+
end
|
187
|
+
end
|
188
|
+
|
126
189
|
# Send the given batch of contexts. If something goes wrong, send
|
127
190
|
# them one at a time.
|
128
191
|
# @param [Array<Traject::Indexer::Context>] an array of contexts
|
129
192
|
def send_batch(batch)
|
130
193
|
return if batch.empty?
|
194
|
+
|
195
|
+
logger.debug("#{self.class.name}: sending batch of #{batch.size} to Solr")
|
196
|
+
|
131
197
|
json_package = JSON.generate(batch.map { |c| c.output_hash })
|
198
|
+
|
132
199
|
begin
|
133
|
-
resp = @http_client.post @
|
200
|
+
resp = @http_client.post solr_update_url_with_query(@solr_update_args), json_package, "Content-type" => "application/json"
|
134
201
|
rescue StandardError => exception
|
135
202
|
end
|
136
203
|
|
@@ -151,34 +218,71 @@ class Traject::SolrJsonWriter
|
|
151
218
|
# Send a single context to Solr, logging an error if need be
|
152
219
|
# @param [Traject::Indexer::Context] c The context whose document you want to send
|
153
220
|
def send_single(c)
|
221
|
+
logger.debug("#{self.class.name}: sending single record to Solr: #{c.output_hash}")
|
222
|
+
|
154
223
|
json_package = JSON.generate([c.output_hash])
|
155
224
|
begin
|
156
|
-
|
157
|
-
|
158
|
-
# allow unexpected errors to propagate up.
|
159
|
-
rescue *skippable_exceptions => exception
|
160
|
-
# no body, local variable exception set above will be used below
|
161
|
-
end
|
225
|
+
post_url = solr_update_url_with_query(@solr_update_args)
|
226
|
+
resp = @http_client.post post_url, json_package, "Content-type" => "application/json"
|
162
227
|
|
163
|
-
|
164
|
-
|
165
|
-
|
228
|
+
unless resp.status == 200
|
229
|
+
raise BadHttpResponse.new("Unexpected HTTP response status #{resp.status} from POST #{post_url}", resp)
|
230
|
+
end
|
231
|
+
|
232
|
+
# Catch Timeouts and network errors -- as well as non-200 http responses --
|
233
|
+
# as skipped records, but otherwise allow unexpected errors to propagate up.
|
234
|
+
rescue *skippable_exceptions => exception
|
235
|
+
msg = if exception.kind_of?(BadHttpResponse)
|
236
|
+
"Solr error response: #{exception.response.status}: #{exception.response.body}"
|
166
237
|
else
|
167
|
-
|
238
|
+
Traject::Util.exception_to_log_message(exception)
|
168
239
|
end
|
240
|
+
|
169
241
|
logger.error "Could not add record #{c.record_inspect}: #{msg}"
|
170
242
|
logger.debug("\t" + exception.backtrace.join("\n\t")) if exception
|
171
243
|
logger.debug(c.source_record.to_s) if c.source_record
|
172
244
|
|
173
245
|
@skipped_record_incrementer.increment
|
174
246
|
if @max_skipped and skipped_record_count > @max_skipped
|
175
|
-
|
247
|
+
# re-raising in rescue means the last encountered error will be available as #cause
|
248
|
+
# on raised exception, a feature in ruby 2.1+.
|
249
|
+
raise MaxSkippedRecordsExceeded.new("#{self.class.name}: Exceeded maximum number of skipped records (#{@max_skipped}): aborting: #{exception.message}")
|
176
250
|
end
|
177
|
-
|
178
251
|
end
|
252
|
+
end
|
253
|
+
|
179
254
|
|
255
|
+
# Very beginning of a delete implementation. POSTs a delete request to solr
|
256
|
+
# for id in arg (value of Solr UniqueID field, usually `id` field).
|
257
|
+
#
|
258
|
+
# Right now, does it inline and immediately, no use of background threads or batching.
|
259
|
+
# This could change.
|
260
|
+
#
|
261
|
+
# Right now, if unsuccessful for any reason, will raise immediately out of here.
|
262
|
+
# Could raise any of the `skippable_exceptions` (timeouts, network errors), an
|
263
|
+
# exception will be raised right out of here.
|
264
|
+
#
|
265
|
+
# Will use `solr_writer.solr_update_args` settings.
|
266
|
+
#
|
267
|
+
# There is no built-in way to direct a record to be deleted from an indexing config
|
268
|
+
# file at the moment, this is just a loose method on the writer.
|
269
|
+
def delete(id)
|
270
|
+
logger.debug("#{self.class.name}: Sending delete to Solr for #{id}")
|
271
|
+
|
272
|
+
json_package = {delete: id}
|
273
|
+
resp = @http_client.post solr_update_url_with_query(@solr_update_args), JSON.generate(json_package), "Content-type" => "application/json"
|
274
|
+
if resp.status != 200
|
275
|
+
raise RuntimeError.new("Could not delete #{id.inspect}, http response #{resp.status}: #{resp.body}")
|
276
|
+
end
|
180
277
|
end
|
181
278
|
|
279
|
+
# Send a delete all query.
|
280
|
+
#
|
281
|
+
# This method takes no params and will not automatically commit the deletes.
|
282
|
+
# @example @writer.delete_all!
|
283
|
+
def delete_all!
|
284
|
+
delete(query: "*:*")
|
285
|
+
end
|
182
286
|
|
183
287
|
# Get the logger from the settings, or default to an effectively null logger
|
184
288
|
def logger
|
@@ -199,14 +303,16 @@ class Traject::SolrJsonWriter
|
|
199
303
|
@thread_pool.maybe_in_thread_pool { send_batch(batch) }
|
200
304
|
end
|
201
305
|
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
306
|
+
if @thread_pool_size && @thread_pool_size > 0
|
307
|
+
# Wait for shutdown, and time it.
|
308
|
+
logger.debug "#{self.class.name}: Shutting down thread pool, waiting if needed..."
|
309
|
+
elapsed = @thread_pool.shutdown_and_wait
|
310
|
+
if elapsed > 60
|
311
|
+
logger.warn "Waited #{elapsed} seconds for all threads, you may want to increase solr_writer.thread_pool (currently #{@settings["solr_writer.thread_pool"]})"
|
312
|
+
end
|
313
|
+
logger.debug "#{self.class.name}: Thread pool shutdown complete"
|
314
|
+
logger.warn "#{self.class.name}: #{skipped_record_count} skipped records" if skipped_record_count > 0
|
207
315
|
end
|
208
|
-
logger.debug "#{self.class.name}: Thread pool shutdown complete"
|
209
|
-
logger.warn "#{self.class.name}: #{skipped_record_count} skipped records" if skipped_record_count > 0
|
210
316
|
|
211
317
|
# check again now that we've waited, there could still be some
|
212
318
|
# that didn't show up before.
|
@@ -220,14 +326,32 @@ class Traject::SolrJsonWriter
|
|
220
326
|
|
221
327
|
|
222
328
|
# Send a commit
|
223
|
-
|
329
|
+
#
|
330
|
+
# Called automatically by the `commit_on_close` setting, but also can be called manually.
|
331
|
+
#
|
332
|
+
# If settings `solr_writer.commit_solr_update_args` is set, will be used by default.
|
333
|
+
# That setting needs `{ commit: true }` or `{softCommit: true}` if you want it to
|
334
|
+
# actually do a commit!
|
335
|
+
#
|
336
|
+
# Optional query_params argument is the actual args to send, you must be sure
|
337
|
+
# to make it include "commit: true" or "softCommit: true" for it to actually commit!
|
338
|
+
# But you may want to include other params too, like optimize etc. query_param
|
339
|
+
# argument replaces setting `solr_writer.commit_solr_update_args`, they are not merged.
|
340
|
+
#
|
341
|
+
# @param [Hash] query_params optional query params to send to solr update. Default {"commit" => "true"}
|
342
|
+
#
|
343
|
+
# @example @writer.commit
|
344
|
+
# @example @writer.commit(softCommit: true)
|
345
|
+
# @example @writer.commit(commit: true, optimize: true, waitFlush: false)
|
346
|
+
def commit(query_params = nil)
|
347
|
+
query_params ||= @commit_solr_update_args || {"commit" => "true"}
|
224
348
|
logger.info "#{self.class.name} sending commit to solr at url #{@solr_update_url}..."
|
225
349
|
|
226
350
|
original_timeout = @http_client.receive_timeout
|
227
351
|
|
228
352
|
@http_client.receive_timeout = (settings["commit_timeout"] || (10 * 60)).to_i
|
229
353
|
|
230
|
-
resp = @http_client.get(
|
354
|
+
resp = @http_client.get(solr_update_url_with_query(query_params))
|
231
355
|
unless resp.status == 200
|
232
356
|
raise RuntimeError.new("Could not commit to Solr: #{resp.status} #{resp.body}")
|
233
357
|
end
|
@@ -279,10 +403,24 @@ class Traject::SolrJsonWriter
|
|
279
403
|
|
280
404
|
class MaxSkippedRecordsExceeded < RuntimeError ; end
|
281
405
|
|
406
|
+
# Adapted from HTTPClient::BadResponseError.
|
407
|
+
# It's got a #response accessor that will give you the HTTPClient
|
408
|
+
# Response object that had a bad status, although relying on that
|
409
|
+
# would tie you to our HTTPClient implementation that maybe should
|
410
|
+
# be considered an implementation detail, so I dunno.
|
411
|
+
class BadHttpResponse < RuntimeError
|
412
|
+
# HTTP::Message:: a response
|
413
|
+
attr_reader :response
|
414
|
+
|
415
|
+
def initialize(msg, response = nil) # :nodoc:
|
416
|
+
super(msg)
|
417
|
+
@response = response
|
418
|
+
end
|
419
|
+
end
|
282
420
|
|
283
421
|
private
|
284
422
|
|
285
423
|
def skippable_exceptions
|
286
|
-
@skippable_exceptions ||= (settings["solr_writer.skippable_exceptions"] || [HTTPClient::TimeoutError, SocketError, Errno::ECONNREFUSED])
|
424
|
+
@skippable_exceptions ||= (settings["solr_writer.skippable_exceptions"] || [HTTPClient::TimeoutError, SocketError, Errno::ECONNREFUSED, Traject::SolrJsonWriter::BadHttpResponse])
|
287
425
|
end
|
288
426
|
end
|