traject 3.1.0.rc1 → 3.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGES.md +4 -0
- data/README.md +2 -0
- data/lib/traject/solr_json_writer.rb +20 -10
- data/lib/traject/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7bc0afc820efb8a2479d96913ae67552556978d264e20a55d0a4d4cef5ff9a2a
|
4
|
+
data.tar.gz: 773529d885d46d2ba0afd323d680724678225bd3616a720dba92809479395dce
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 395f1a3cf62cfd0cccbdcd428830feb098586c9ca1edc7adecff31483545a62dbc06631e7c166ea0ce1b229afeddcec7d994245d78871f1a6644c591d6cc432c
|
7
|
+
data.tar.gz: 647f9cfbf0a7a0876cef29b5a90a9fd6d3dd1f6a1010b332d58e132875b5c4a7f8be9bdab1afabb5ec0bc583af9ea0d8a422cc87eec6c04d93c9fe9f4b4d1e9a
|
data/CHANGES.md
CHANGED
@@ -24,6 +24,10 @@
|
|
24
24
|
|
25
25
|
* SolrJsonWriter now respects a `solr_writer.http_timeout` setting, in seconds, to be passed to HTTPClient instance. https://github.com/traject/traject/pull/219
|
26
26
|
|
27
|
+
* Only runs thread pool shutdown code (and logging) if there is a `solr_writer.thread_pool` greater than 0. This keeps it out of the logs when it would have been a no-op anyway.
|
28
|
+
|
29
|
+
* Logs at DEBUG level every time it sends an update request to solr
|
30
|
+
|
27
31
|
* Nokogiri dependency for the NokogiriReader increased to `~> 1.9`. When using Jruby `each_record_xpath`, resulting yielded documents may have xmlns declarations on different nodes than in MRI (and previous versions of nokogiri), but we could find no way around this with nokogiri >= 1.9.0. The documents should still be semantically equivalent for namespace use. This was necessary to keep JRuby Nokogiri XML working with recent Nokogiri releases. https://github.com/traject/traject/pull/209
|
28
32
|
|
29
33
|
* LineWriter guesses better about when to auto-close, and provides an optional explicit setting in case it guesses wrong. (thanks @justinlittman) https://github.com/traject/traject/pull/211
|
data/README.md
CHANGED
@@ -175,6 +175,8 @@ TranslationMap use above is just one example of a transformation macro, that tra
|
|
175
175
|
* `append("--after each value")`
|
176
176
|
* `gsub(/regex/, "replacement")`
|
177
177
|
* `split(" ")`: take values and split them, possibly result in multiple values.
|
178
|
+
* `transform(proc)`: transform each existing value using a proc, kind of like `map`.
|
179
|
+
e.g. `to_field "something", extract_xml("//author"), transform( ->(author) { "#{author.last}, #{author.first}" })`
|
178
180
|
|
179
181
|
You can add on as many transformation macros as you want, they will be applied to output in order.
|
180
182
|
|
@@ -185,6 +185,9 @@ class Traject::SolrJsonWriter
|
|
185
185
|
# @param [Array<Traject::Indexer::Context>] an array of contexts
|
186
186
|
def send_batch(batch)
|
187
187
|
return if batch.empty?
|
188
|
+
|
189
|
+
logger.debug("#{self.class.name}: sending batch of #{batch.size} to Solr")
|
190
|
+
|
188
191
|
json_package = JSON.generate(batch.map { |c| c.output_hash })
|
189
192
|
|
190
193
|
begin
|
@@ -209,12 +212,15 @@ class Traject::SolrJsonWriter
|
|
209
212
|
# Send a single context to Solr, logging an error if need be
|
210
213
|
# @param [Traject::Indexer::Context] c The context whose document you want to send
|
211
214
|
def send_single(c)
|
215
|
+
logger.debug("#{self.class.name}: sending single record to Solr: #{c.output_hash}")
|
216
|
+
|
212
217
|
json_package = JSON.generate([c.output_hash])
|
213
218
|
begin
|
214
|
-
|
219
|
+
post_url = solr_update_url_with_query(@solr_update_args)
|
220
|
+
resp = @http_client.post post_url, json_package, "Content-type" => "application/json"
|
215
221
|
|
216
222
|
unless resp.status == 200
|
217
|
-
raise BadHttpResponse.new("Unexpected HTTP response status #{resp.status}", resp)
|
223
|
+
raise BadHttpResponse.new("Unexpected HTTP response status #{resp.status} from POST #{post_url}", resp)
|
218
224
|
end
|
219
225
|
|
220
226
|
# Catch Timeouts and network errors -- as well as non-200 http responses --
|
@@ -234,7 +240,7 @@ class Traject::SolrJsonWriter
|
|
234
240
|
if @max_skipped and skipped_record_count > @max_skipped
|
235
241
|
# re-raising in rescue means the last encountered error will be available as #cause
|
236
242
|
# on raised exception, a feature in ruby 2.1+.
|
237
|
-
raise MaxSkippedRecordsExceeded.new("#{self.class.name}: Exceeded maximum number of skipped records (#{@max_skipped}): aborting")
|
243
|
+
raise MaxSkippedRecordsExceeded.new("#{self.class.name}: Exceeded maximum number of skipped records (#{@max_skipped}): aborting: #{exception.message}")
|
238
244
|
end
|
239
245
|
end
|
240
246
|
end
|
@@ -255,6 +261,8 @@ class Traject::SolrJsonWriter
|
|
255
261
|
# There is no built-in way to direct a record to be deleted from an indexing config
|
256
262
|
# file at the moment, this is just a loose method on the writer.
|
257
263
|
def delete(id)
|
264
|
+
logger.debug("#{self.class.name}: Sending delete to Solr for #{id}")
|
265
|
+
|
258
266
|
json_package = {delete: id}
|
259
267
|
resp = @http_client.post solr_update_url_with_query(@solr_update_args), JSON.generate(json_package), "Content-type" => "application/json"
|
260
268
|
if resp.status != 200
|
@@ -282,14 +290,16 @@ class Traject::SolrJsonWriter
|
|
282
290
|
@thread_pool.maybe_in_thread_pool { send_batch(batch) }
|
283
291
|
end
|
284
292
|
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
293
|
+
if @thread_pool_size && @thread_pool_size > 0
|
294
|
+
# Wait for shutdown, and time it.
|
295
|
+
logger.debug "#{self.class.name}: Shutting down thread pool, waiting if needed..."
|
296
|
+
elapsed = @thread_pool.shutdown_and_wait
|
297
|
+
if elapsed > 60
|
298
|
+
logger.warn "Waited #{elapsed} seconds for all threads, you may want to increase solr_writer.thread_pool (currently #{@settings["solr_writer.thread_pool"]})"
|
299
|
+
end
|
300
|
+
logger.debug "#{self.class.name}: Thread pool shutdown complete"
|
301
|
+
logger.warn "#{self.class.name}: #{skipped_record_count} skipped records" if skipped_record_count > 0
|
290
302
|
end
|
291
|
-
logger.debug "#{self.class.name}: Thread pool shutdown complete"
|
292
|
-
logger.warn "#{self.class.name}: #{skipped_record_count} skipped records" if skipped_record_count > 0
|
293
303
|
|
294
304
|
# check again now that we've waited, there could still be some
|
295
305
|
# that didn't show up before.
|
data/lib/traject/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: traject
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.1.0.rc1
|
4
|
+
version: 3.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Rochkind
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2019-04-
|
12
|
+
date: 2019-04-22 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: concurrent-ruby
|
@@ -388,9 +388,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
388
388
|
version: '0'
|
389
389
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
390
390
|
requirements:
|
391
|
-
- - "
|
391
|
+
- - ">="
|
392
392
|
- !ruby/object:Gem::Version
|
393
|
-
version:
|
393
|
+
version: '0'
|
394
394
|
requirements: []
|
395
395
|
rubyforge_project:
|
396
396
|
rubygems_version: 2.7.6
|