traject_horizon 0.9.1 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -109,6 +109,14 @@ module Traject
109
109
  # == Misc
110
110
  #
111
111
  # [horizon.batch_size] Batch size to use for fetching item/copy info on each bib. Default 400.
112
+ #
113
+ # [horizon.thread_pool] Default 2. HorizonReader uses threads to add some concurrency. If
114
+ # set to >0, the work of secondary DB queries for item/copy info and the subsequent
115
+ # yielding to caller will be done in threads. Playing with this number
116
+ # may be able to increase performance -- recommend not setting it to 0
117
+ # except maybe for debugging; having it at least 1 significantly improves
118
+ # throughput.
119
+ #
112
120
  # [debug_ascii_progress] if true, will output a "<" and a ">" to stderr around every copy/item
113
121
  # subsidiary fetch. See description of this setting in docs/settings.md
114
122
  #
@@ -136,6 +144,8 @@ module Traject
136
144
  @settings = Traject::Indexer::Settings.new( self.class.default_settings).merge(settings)
137
145
 
138
146
  require_jars!
147
+
148
+ @thread_pool = Traject::ThreadPool.new( settings["horizon.thread_pool"].to_i )
139
149
  end
140
150
 
141
151
  # Requires marc4j and jtds, and java_import's some classes.
@@ -288,15 +298,22 @@ module Traject
288
298
  # new record! Put old one on batch queue.
289
299
  record_batch << record if record
290
300
 
301
+ # Any exceptions from background threads?
302
+ @thread_pool.raise_collected_exception!
303
+
291
304
  # prepare and yield batch?
292
305
  if (record_count % batch_size == 0)
293
- enhance_batch!(extra_connection, record_batch)
294
- record_batch.each do |r|
295
- # set current_bib_id for error logging
296
- current_bib_id = r['001'].value
297
- yield r
306
+ batch = record_batch
307
+ record_batch = []
308
+
309
+ @thread_pool.maybe_in_thread_pool(batch) do |batch|
310
+ enhance_batch!(extra_connection, batch)
311
+ batch.each do |r|
312
+ # set current_bib_id for error logging
313
+ current_bib_id = r['001'].value
314
+ yield r
315
+ end
298
316
  end
299
- record_batch.clear
300
317
  end
301
318
 
302
319
  # And start new record we've encountered.
@@ -372,6 +389,10 @@ module Traject
372
389
  # last one
373
390
  record_batch << record if record
374
391
 
392
+ logger.debug "HorizonReader: Waiting for threadpool work complete..."
393
+ @thread_pool.shutdown_and_wait
394
+ logger.debug "HorizonReader: threadpool work complete."
395
+
375
396
  # yield last batch
376
397
  enhance_batch!(extra_connection, record_batch)
377
398
  record_batch.each do |r|
@@ -381,6 +402,8 @@ module Traject
381
402
  end
382
403
  record_batch.clear
383
404
 
405
+
406
+
384
407
  rescue Exception => e
385
408
  logger.fatal "HorizonReader, unexpected exception at bib id:#{current_bib_id}: #{Traject::Util.exception_to_log_message(e)}"
386
409
  raise e
@@ -516,6 +539,9 @@ module Traject
516
539
  # It might be higher performance to refactor to re-use the same prepared statement
517
540
  # for each item/copy fetch... but appears to be no great way to do that in JDBC3
518
541
  # where you need to parameterize "IN" values. JDBC4 has got it, but jTDS is just JDBC3.
542
+ #
543
+ # Also, we need to be thread-safe now, so it is probably better to create a new one
544
+ # each time, rather than risk sharing between threads accidentally.
519
545
  pstmt = conn.prepareStatement(sql);
520
546
  rs = pstmt.executeQuery
521
547
 
@@ -598,6 +624,7 @@ module Traject
598
624
  def self.default_settings
599
625
  {
600
626
  "horizon.batch_size" => 400,
627
+ "horizon.thread_pool" => 2,
601
628
 
602
629
  "horizon.public_only" => true,
603
630
 
@@ -1,3 +1,3 @@
1
1
  module TrajectHorizon
2
- VERSION = "0.9.1"
2
+ VERSION = "0.10.0"
3
3
  end
metadata CHANGED
@@ -2,14 +2,14 @@
2
2
  name: traject_horizon
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.9.1
5
+ version: 0.10.0
6
6
  platform: ruby
7
7
  authors:
8
8
  - Jonathan Rochkind
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-09-05 00:00:00.000000000 Z
12
+ date: 2013-09-10 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: traject