traject 3.1.0.rc1 → 3.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 06c28d37f9aafafe709a146c7612e5b5d8a5c58a61fd1502823a38dc52b9d05b
4
- data.tar.gz: 2e38b2b8c4030456f3757ae6062231268110d68ef07e10cab722b4074ccd570c
3
+ metadata.gz: 7bc0afc820efb8a2479d96913ae67552556978d264e20a55d0a4d4cef5ff9a2a
4
+ data.tar.gz: 773529d885d46d2ba0afd323d680724678225bd3616a720dba92809479395dce
5
5
  SHA512:
6
- metadata.gz: 04561a77a3e6f2073198983b5bf7d4e35cc9f52bccc1211487cc4c850b0f0b0fc9395a7c87e6ed90061f4a15af57516434d260c649fbc43ea65a0c6435194818
7
- data.tar.gz: c7312156c3be556218e319e35ae76aa97fbae5fad6720dbce2e4a046ec90603f5de34fe2cb055425fb3da499922fba50c7d4a6445858793bb0a4fb26cf8f7b29
6
+ metadata.gz: 395f1a3cf62cfd0cccbdcd428830feb098586c9ca1edc7adecff31483545a62dbc06631e7c166ea0ce1b229afeddcec7d994245d78871f1a6644c591d6cc432c
7
+ data.tar.gz: 647f9cfbf0a7a0876cef29b5a90a9fd6d3dd1f6a1010b332d58e132875b5c4a7f8be9bdab1afabb5ec0bc583af9ea0d8a422cc87eec6c04d93c9fe9f4b4d1e9a
data/CHANGES.md CHANGED
@@ -24,6 +24,10 @@
24
24
 
25
25
  * SolrJsonWriter now respects a `solr_writer.http_timeout` setting, in seconds, to be passed to HTTPClient instance. https://github.com/traject/traject/pull/219
26
26
 
27
+ * SolrJsonWriter only runs thread pool shutdown code (and logging) if `solr_writer.thread_pool` is greater than 0, keeping it out of the logs when it would have been a no-op anyway.
28
+
29
+ * SolrJsonWriter logs at DEBUG level every time it sends an update request to Solr.
30
+
27
31
  * Nokogiri dependency for the NokogiriReader increased to `~> 1.9`. When using JRuby `each_record_xpath`, resulting yielded documents may have xmlns declarations on different nodes than in MRI (and previous versions of nokogiri), but we could find no way around this with nokogiri >= 1.9.0. The documents should still be semantically equivalent for namespace use. This was necessary to keep JRuby Nokogiri XML working with recent Nokogiri releases. https://github.com/traject/traject/pull/209
28
32
 
29
33
  * LineWriter guesses better about when to auto-close, and provides an optional explicit setting in case it guesses wrong. (thanks @justinlittman) https://github.com/traject/traject/pull/211
data/README.md CHANGED
@@ -175,6 +175,8 @@ TranslationMap use above is just one example of a transformation macro, that tra
175
175
  * `append("--after each value")`
176
176
  * `gsub(/regex/, "replacement")`
177
177
  * `split(" ")`: take values and split them, possibly result in multiple values.
178
+ * `transform(proc)`: transform each existing value using a proc, kind of like `map`.
179
+ e.g. `to_field "something", extract_xml("//author"), transform( ->(author) { "#{author.last}, #{author.first}" })`
178
180
 
179
181
  You can add on as many transformation macros as you want, they will be applied to output in order.
180
182
 
@@ -185,6 +185,9 @@ class Traject::SolrJsonWriter
185
185
  # @param [Array<Traject::Indexer::Context>] batch an array of contexts
186
186
  def send_batch(batch)
187
187
  return if batch.empty?
188
+
189
+ logger.debug("#{self.class.name}: sending batch of #{batch.size} to Solr")
190
+
188
191
  json_package = JSON.generate(batch.map { |c| c.output_hash })
189
192
 
190
193
  begin
@@ -209,12 +212,15 @@ class Traject::SolrJsonWriter
209
212
  # Send a single context to Solr, logging an error if need be
210
213
  # @param [Traject::Indexer::Context] c The context whose document you want to send
211
214
  def send_single(c)
215
+ logger.debug("#{self.class.name}: sending single record to Solr: #{c.output_hash}")
216
+
212
217
  json_package = JSON.generate([c.output_hash])
213
218
  begin
214
- resp = @http_client.post solr_update_url_with_query(@solr_update_args), json_package, "Content-type" => "application/json"
219
+ post_url = solr_update_url_with_query(@solr_update_args)
220
+ resp = @http_client.post post_url, json_package, "Content-type" => "application/json"
215
221
 
216
222
  unless resp.status == 200
217
- raise BadHttpResponse.new("Unexpected HTTP response status #{resp.status}", resp)
223
+ raise BadHttpResponse.new("Unexpected HTTP response status #{resp.status} from POST #{post_url}", resp)
218
224
  end
219
225
 
220
226
  # Catch Timeouts and network errors -- as well as non-200 http responses --
@@ -234,7 +240,7 @@ class Traject::SolrJsonWriter
234
240
  if @max_skipped and skipped_record_count > @max_skipped
235
241
  # re-raising in rescue means the last encountered error will be available as #cause
236
242
  # on raised exception, a feature in ruby 2.1+.
237
- raise MaxSkippedRecordsExceeded.new("#{self.class.name}: Exceeded maximum number of skipped records (#{@max_skipped}): aborting")
243
+ raise MaxSkippedRecordsExceeded.new("#{self.class.name}: Exceeded maximum number of skipped records (#{@max_skipped}): aborting: #{exception.message}")
238
244
  end
239
245
  end
240
246
  end
@@ -255,6 +261,8 @@ class Traject::SolrJsonWriter
255
261
  # There is no built-in way to direct a record to be deleted from an indexing config
256
262
  # file at the moment, this is just a loose method on the writer.
257
263
  def delete(id)
264
+ logger.debug("#{self.class.name}: Sending delete to Solr for #{id}")
265
+
258
266
  json_package = {delete: id}
259
267
  resp = @http_client.post solr_update_url_with_query(@solr_update_args), JSON.generate(json_package), "Content-type" => "application/json"
260
268
  if resp.status != 200
@@ -282,14 +290,16 @@ class Traject::SolrJsonWriter
282
290
  @thread_pool.maybe_in_thread_pool { send_batch(batch) }
283
291
  end
284
292
 
285
- # Wait for shutdown, and time it.
286
- logger.debug "#{self.class.name}: Shutting down thread pool, waiting if needed..."
287
- elapsed = @thread_pool.shutdown_and_wait
288
- if elapsed > 60
289
- logger.warn "Waited #{elapsed} seconds for all threads, you may want to increase solr_writer.thread_pool (currently #{@settings["solr_writer.thread_pool"]})"
293
+ if @thread_pool_size && @thread_pool_size > 0
294
+ # Wait for shutdown, and time it.
295
+ logger.debug "#{self.class.name}: Shutting down thread pool, waiting if needed..."
296
+ elapsed = @thread_pool.shutdown_and_wait
297
+ if elapsed > 60
298
+ logger.warn "Waited #{elapsed} seconds for all threads, you may want to increase solr_writer.thread_pool (currently #{@settings["solr_writer.thread_pool"]})"
299
+ end
300
+ logger.debug "#{self.class.name}: Thread pool shutdown complete"
301
+ logger.warn "#{self.class.name}: #{skipped_record_count} skipped records" if skipped_record_count > 0
290
302
  end
291
- logger.debug "#{self.class.name}: Thread pool shutdown complete"
292
- logger.warn "#{self.class.name}: #{skipped_record_count} skipped records" if skipped_record_count > 0
293
303
 
294
304
  # check again now that we've waited, there could still be some
295
305
  # that didn't show up before.
@@ -1,3 +1,3 @@
1
1
  module Traject
2
- VERSION = "3.1.0.rc1"
2
+ VERSION = "3.1.0"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: traject
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.1.0.rc1
4
+ version: 3.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Rochkind
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2019-04-10 00:00:00.000000000 Z
12
+ date: 2019-04-22 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: concurrent-ruby
@@ -388,9 +388,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
388
388
  version: '0'
389
389
  required_rubygems_version: !ruby/object:Gem::Requirement
390
390
  requirements:
391
- - - ">"
391
+ - - ">="
392
392
  - !ruby/object:Gem::Version
393
- version: 1.3.1
393
+ version: '0'
394
394
  requirements: []
395
395
  rubyforge_project:
396
396
  rubygems_version: 2.7.6