traject_horizon 0.9.1 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -109,6 +109,14 @@ module Traject
109
109
  # == Misc
110
110
  #
111
111
  # [horizon.batch_size] Batch size to use for fetching item/copy info on each bib. Default 400.
112
+ #
113
+ # [horizon.thread_pool] Default 2. HorizonReader uses threads to add some concurrency. If
114
+ # set to >0, the work of secondary DB queries for item/copy info and the subsequent
115
+ # yielding to caller will be done in threads. Playing with this number
116
+ # may be able to increase performance -- recommend not setting it to 0
117
+ # except maybe for debugging; having it at least 1 significantly improves
118
+ # throughput.
119
+ #
112
120
  # [debug_ascii_progress] if true, will output a "<" and a ">" to stderr around every copy/item
113
121
  # subsidiary fetch. See description of this setting in docs/settings.md
114
122
  #
@@ -136,6 +144,8 @@ module Traject
136
144
  @settings = Traject::Indexer::Settings.new( self.class.default_settings).merge(settings)
137
145
 
138
146
  require_jars!
147
+
148
+ @thread_pool = Traject::ThreadPool.new( settings["horizon.thread_pool"].to_i )
139
149
  end
140
150
 
141
151
  # Requires marc4j and jtds, and java_import's some classes.
@@ -288,15 +298,22 @@ module Traject
288
298
  # new record! Put old one on batch queue.
289
299
  record_batch << record if record
290
300
 
301
+ # Any exceptions from background threads?
302
+ @thread_pool.raise_collected_exception!
303
+
291
304
  # prepare and yield batch?
292
305
  if (record_count % batch_size == 0)
293
- enhance_batch!(extra_connection, record_batch)
294
- record_batch.each do |r|
295
- # set current_bib_id for error logging
296
- current_bib_id = r['001'].value
297
- yield r
306
+ batch = record_batch
307
+ record_batch = []
308
+
309
+ @thread_pool.maybe_in_thread_pool(batch) do |batch|
310
+ enhance_batch!(extra_connection, batch)
311
+ batch.each do |r|
312
+ # set current_bib_id for error logging
313
+ current_bib_id = r['001'].value
314
+ yield r
315
+ end
298
316
  end
299
- record_batch.clear
300
317
  end
301
318
 
302
319
  # And start new record we've encountered.
@@ -372,6 +389,10 @@ module Traject
372
389
  # last one
373
390
  record_batch << record if record
374
391
 
392
+ logger.debug "HorizonReader: Waiting for threadpool work complete..."
393
+ @thread_pool.shutdown_and_wait
394
+ logger.debug "HorizonReader: threadpool work complete."
395
+
375
396
  # yield last batch
376
397
  enhance_batch!(extra_connection, record_batch)
377
398
  record_batch.each do |r|
@@ -381,6 +402,8 @@ module Traject
381
402
  end
382
403
  record_batch.clear
383
404
 
405
+
406
+
384
407
  rescue Exception => e
385
408
  logger.fatal "HorizonReader, unexpected exception at bib id:#{current_bib_id}: #{Traject::Util.exception_to_log_message(e)}"
386
409
  raise e
@@ -516,6 +539,9 @@ module Traject
516
539
  # It might be higher performance to refactor to re-use the same prepared statement
517
540
  # for each item/copy fetch... but appears to be no great way to do that in JDBC3
518
541
  # where you need to parameterize "IN" values. JDBC4 has got it, but jTDS is just JDBC3.
542
+ #
543
+ # Also, we need to be thread-safe now, so maybe better to create one
544
+ # each time, rather than risk sharing between threads accidentally.
519
545
  pstmt = conn.prepareStatement(sql);
520
546
  rs = pstmt.executeQuery
521
547
 
@@ -598,6 +624,7 @@ module Traject
598
624
  def self.default_settings
599
625
  {
600
626
  "horizon.batch_size" => 400,
627
+ "horizon.thread_pool" => 2,
601
628
 
602
629
  "horizon.public_only" => true,
603
630
 
@@ -1,3 +1,3 @@
1
1
  module TrajectHorizon
2
- VERSION = "0.9.1"
2
+ VERSION = "0.10.0"
3
3
  end
metadata CHANGED
@@ -2,14 +2,14 @@
2
2
  name: traject_horizon
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.9.1
5
+ version: 0.10.0
6
6
  platform: ruby
7
7
  authors:
8
8
  - Jonathan Rochkind
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-09-05 00:00:00.000000000 Z
12
+ date: 2013-09-10 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: traject