RubyGems - traject_horizon - Versions diffs - 0.9.1 → 0.10.0 - Mend

traject_horizon 0.9.1 → 0.10.0

Files changed (3) hide show

data/lib/traject/horizon_reader.rb +33 -6
data/lib/traject_horizon/version.rb +1 -1
metadata +2 -2

data/lib/traject/horizon_reader.rb CHANGED Viewed

@@ -109,6 +109,14 @@ module Traject
   # == Misc
   #
   # [horizon.batch_size] Batch size to use for fetching item/copy info on each bib. Default 400.
+  #
+  # [horizon.thread_pool] Default 2. HorizonReader uses threads to add some concurrency. If
+  #    set to >0, the work of secondary DB queries for item/copy info and the subsequent
+  #    yielding to caller will be done in threads. Playing with this number
+  #    may be able to increase performance -- recommend not setting it to 0
+  #    except maybe for debugging; having it at least 1 significantly improves
+  #    throughput.
+  #
   # [debug_ascii_progress]  if true, will output a "<" and a ">" to stderr around every copy/item
   #           subsidiary fetch. See description of this setting in docs/settings.md
   #
@@ -136,6 +144,8 @@ module Traject
       @settings = Traject::Indexer::Settings.new( self.class.default_settings).merge(settings)
       require_jars!
+      @thread_pool = Traject::ThreadPool.new( settings["horizon.thread_pool"].to_i )
     end
     # Requires marc4j and jtds, and java_import's some classes.
@@ -288,15 +298,22 @@ module Traject
           # new record! Put old one on batch queue.
           record_batch << record if record
+          # Any exceptions from background threads?
+          @thread_pool.raise_collected_exception!
           # prepare and yield batch?
           if (record_count % batch_size == 0)
-            enhance_batch!(extra_connection, record_batch)
-            record_batch.each do |r|
-              # set current_bib_id for error logging
-              current_bib_id = r['001'].value
-              yield r
+            batch = record_batch
+            record_batch = []
+            @thread_pool.maybe_in_thread_pool(batch) do |batch|
+              enhance_batch!(extra_connection, batch)
+              batch.each do |r|
+                # set current_bib_id for error logging
+                current_bib_id = r['001'].value
+                yield r
+              end
             end
-            record_batch.clear
           end
           # And start new record we've encountered.
@@ -372,6 +389,10 @@ module Traject
       # last one
       record_batch << record if record
+      logger.debug "HorizonReader: Waiting for threadpool work complete..."
+      @thread_pool.shutdown_and_wait
+      logger.debug "HorizonReader: threadpool work complete."
       # yield last batch
       enhance_batch!(extra_connection, record_batch)
       record_batch.each do |r|
@@ -381,6 +402,8 @@ module Traject
       end
       record_batch.clear
     rescue Exception => e
       logger.fatal "HorizonReader, unexpected exception at bib id:#{current_bib_id}: #{Traject::Util.exception_to_log_message(e)}"
       raise e
@@ -516,6 +539,9 @@ module Traject
       # It might be higher performance to refactor to re-use the same prepared statement
       # for each item/copy fetch... but appears to be no great way to do that in JDBC3
       # where you need to parameterize "IN" values. JDBC4 has got it, but jTDS is just JDBC3.
+      #
+      # Also, we need to be thread-safe now, so maybe better to create one
+      # each time, rather than risk sharing between threads accidentally.
       pstmt = conn.prepareStatement(sql);
       rs = pstmt.executeQuery
@@ -598,6 +624,7 @@ module Traject
     def self.default_settings
       {
         "horizon.batch_size" => 400,
+        "horizon.thread_pool" => 2,
         "horizon.public_only" => true,

data/lib/traject_horizon/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module TrajectHorizon
-  VERSION = "0.9.1"
+  VERSION = "0.10.0"
 end

metadata CHANGED Viewed

@@ -2,14 +2,14 @@
 name: traject_horizon
 version: !ruby/object:Gem::Version
   prerelease:
-  version: 0.9.1
+  version: 0.10.0
 platform: ruby
 authors:
 - Jonathan Rochkind
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-09-05 00:00:00.000000000 Z
+date: 2013-09-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: traject