traject_horizon 0.9.1 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/traject/horizon_reader.rb +33 -6
 - data/lib/traject_horizon/version.rb +1 -1
 - metadata +2 -2
 
| 
         @@ -109,6 +109,14 @@ module Traject 
     | 
|
| 
       109 
109 
     | 
    
         
             
              # == Misc
         
     | 
| 
       110 
110 
     | 
    
         
             
              #
         
     | 
| 
       111 
111 
     | 
    
         
             
              # [horizon.batch_size] Batch size to use for fetching item/copy info on each bib. Default 400.
         
     | 
| 
      
 112 
     | 
    
         
            +
              #
         
     | 
| 
      
 113 
     | 
    
         
            +
              # [horizon.thread_pool] Default 2. HorizonReader uses threads to add some concurrency. If
         
     | 
| 
      
 114 
     | 
    
         
            +
              #    set to >0, the work of secondary DB queries for item/copy info and the subsequent
         
     | 
| 
      
 115 
     | 
    
         
            +
              #    yielding to caller will be done in threads. Playing with this number
         
     | 
| 
      
 116 
     | 
    
         
            +
              #    may be able to increase performance -- recommend not setting it to 0
         
     | 
| 
      
 117 
     | 
    
         
            +
              #    except maybe for debugging, having it at least 1 significantly improves
         
     | 
| 
      
 118 
     | 
    
         
            +
              #    throughput. 
         
     | 
| 
      
 119 
     | 
    
         
            +
              # 
         
     | 
| 
       112 
120 
     | 
    
         
             
              # [debug_ascii_progress]  if true, will output a "<" and a ">" to stderr around every copy/item
         
     | 
| 
       113 
121 
     | 
    
         
             
              #           subsidiary fetch. See description of this setting in docs/settings.md
         
     | 
| 
       114 
122 
     | 
    
         
             
              #
         
     | 
| 
         @@ -136,6 +144,8 @@ module Traject 
     | 
|
| 
       136 
144 
     | 
    
         
             
                  @settings = Traject::Indexer::Settings.new( self.class.default_settings).merge(settings)
         
     | 
| 
       137 
145 
     | 
    
         | 
| 
       138 
146 
     | 
    
         
             
                  require_jars!
         
     | 
| 
      
 147 
     | 
    
         
            +
             
     | 
| 
      
 148 
     | 
    
         
            +
                  @thread_pool = Traject::ThreadPool.new( settings["horizon.thread_pool"].to_i )
         
     | 
| 
       139 
149 
     | 
    
         
             
                end
         
     | 
| 
       140 
150 
     | 
    
         | 
| 
       141 
151 
     | 
    
         
             
                # Requires marc4j and jtds, and java_import's some classes.
         
     | 
| 
         @@ -288,15 +298,22 @@ module Traject 
     | 
|
| 
       288 
298 
     | 
    
         
             
                      # new record! Put old one on batch queue.
         
     | 
| 
       289 
299 
     | 
    
         
             
                      record_batch << record if record
         
     | 
| 
       290 
300 
     | 
    
         | 
| 
      
 301 
     | 
    
         
            +
                      # Any exceptions from background threads?
         
     | 
| 
      
 302 
     | 
    
         
            +
                      @thread_pool.raise_collected_exception!
         
     | 
| 
      
 303 
     | 
    
         
            +
             
     | 
| 
       291 
304 
     | 
    
         
             
                      # prepare and yield batch?
         
     | 
| 
       292 
305 
     | 
    
         
             
                      if (record_count % batch_size == 0)
         
     | 
| 
       293 
     | 
    
         
            -
                         
     | 
| 
       294 
     | 
    
         
            -
                        record_batch 
     | 
| 
       295 
     | 
    
         
            -
             
     | 
| 
       296 
     | 
    
         
            -
             
     | 
| 
       297 
     | 
    
         
            -
                           
     | 
| 
      
 306 
     | 
    
         
            +
                        batch = record_batch
         
     | 
| 
      
 307 
     | 
    
         
            +
                        record_batch = []
         
     | 
| 
      
 308 
     | 
    
         
            +
             
     | 
| 
      
 309 
     | 
    
         
            +
                        @thread_pool.maybe_in_thread_pool(batch) do |batch|
         
     | 
| 
      
 310 
     | 
    
         
            +
                          enhance_batch!(extra_connection, batch)
         
     | 
| 
      
 311 
     | 
    
         
            +
                          batch.each do |r|
         
     | 
| 
      
 312 
     | 
    
         
            +
                            # set current_bib_id for error logging
         
     | 
| 
      
 313 
     | 
    
         
            +
                            current_bib_id = r['001'].value
         
     | 
| 
      
 314 
     | 
    
         
            +
                            yield r
         
     | 
| 
      
 315 
     | 
    
         
            +
                          end
         
     | 
| 
       298 
316 
     | 
    
         
             
                        end
         
     | 
| 
       299 
     | 
    
         
            -
                        record_batch.clear
         
     | 
| 
       300 
317 
     | 
    
         
             
                      end
         
     | 
| 
       301 
318 
     | 
    
         | 
| 
       302 
319 
     | 
    
         
             
                      # And start new record we've encountered.
         
     | 
| 
         @@ -372,6 +389,10 @@ module Traject 
     | 
|
| 
       372 
389 
     | 
    
         
             
                  # last one
         
     | 
| 
       373 
390 
     | 
    
         
             
                  record_batch << record if record
         
     | 
| 
       374 
391 
     | 
    
         | 
| 
      
 392 
     | 
    
         
            +
                  logger.debug "HorizonReader: Waiting for threadpool work complete..."
         
     | 
| 
      
 393 
     | 
    
         
            +
                  @thread_pool.shutdown_and_wait
         
     | 
| 
      
 394 
     | 
    
         
            +
                  logger.debug "HorizonReader: threadpool work complete."
         
     | 
| 
      
 395 
     | 
    
         
            +
             
     | 
| 
       375 
396 
     | 
    
         
             
                  # yield last batch
         
     | 
| 
       376 
397 
     | 
    
         
             
                  enhance_batch!(extra_connection, record_batch)
         
     | 
| 
       377 
398 
     | 
    
         
             
                  record_batch.each do |r|
         
     | 
| 
         @@ -381,6 +402,8 @@ module Traject 
     | 
|
| 
       381 
402 
     | 
    
         
             
                  end
         
     | 
| 
       382 
403 
     | 
    
         
             
                  record_batch.clear
         
     | 
| 
       383 
404 
     | 
    
         | 
| 
      
 405 
     | 
    
         
            +
             
     | 
| 
      
 406 
     | 
    
         
            +
             
     | 
| 
       384 
407 
     | 
    
         
             
                rescue Exception => e
         
     | 
| 
       385 
408 
     | 
    
         
             
                  logger.fatal "HorizonReader, unexpected exception at bib id:#{current_bib_id}: #{Traject::Util.exception_to_log_message(e)}"
         
     | 
| 
       386 
409 
     | 
    
         
             
                  raise e
         
     | 
| 
         @@ -516,6 +539,9 @@ module Traject 
     | 
|
| 
       516 
539 
     | 
    
         
             
                  # It might be higher performance to refactor to re-use the same prepared statement
         
     | 
| 
       517 
540 
     | 
    
         
             
                  # for each item/copy fetch... but appears to be no great way to do that in JDBC3
         
     | 
| 
       518 
541 
     | 
    
         
             
                  # where you need to parameterize "IN" values. JDBC4 has got it, but jTDS is just JDBC3.
         
     | 
| 
      
 542 
     | 
    
         
            +
                  #
         
     | 
| 
      
 543 
     | 
    
         
            +
                  # Also, we need to be thread-safe now, so maybe better to create one
         
     | 
| 
      
 544 
     | 
    
         
            +
                  # each time, rather than risk sharing between threads accidentally. 
         
     | 
| 
       519 
545 
     | 
    
         
             
                  pstmt = conn.prepareStatement(sql);
         
     | 
| 
       520 
546 
     | 
    
         
             
                  rs = pstmt.executeQuery
         
     | 
| 
       521 
547 
     | 
    
         | 
| 
         @@ -598,6 +624,7 @@ module Traject 
     | 
|
| 
       598 
624 
     | 
    
         
             
                def self.default_settings
         
     | 
| 
       599 
625 
     | 
    
         
             
                  {
         
     | 
| 
       600 
626 
     | 
    
         
             
                    "horizon.batch_size" => 400,
         
     | 
| 
      
 627 
     | 
    
         
            +
                    "horizon.thread_pool" => 2,
         
     | 
| 
       601 
628 
     | 
    
         | 
| 
       602 
629 
     | 
    
         
             
                    "horizon.public_only" => true,
         
     | 
| 
       603 
630 
     | 
    
         | 
    
        metadata
    CHANGED
    
    | 
         @@ -2,14 +2,14 @@ 
     | 
|
| 
       2 
2 
     | 
    
         
             
            name: traject_horizon
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
4 
     | 
    
         
             
              prerelease:
         
     | 
| 
       5 
     | 
    
         
            -
              version: 0.9.1
     | 
| 
      
 5 
     | 
    
         
            +
              version: 0.10.0
         
     | 
| 
       6 
6 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       7 
7 
     | 
    
         
             
            authors:
         
     | 
| 
       8 
8 
     | 
    
         
             
            - Jonathan Rochkind
         
     | 
| 
       9 
9 
     | 
    
         
             
            autorequire:
         
     | 
| 
       10 
10 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       11 
11 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       12 
     | 
    
         
            -
            date: 2013-09- 
     | 
| 
      
 12 
     | 
    
         
            +
            date: 2013-09-10 00:00:00.000000000 Z
         
     | 
| 
       13 
13 
     | 
    
         
             
            dependencies:
         
     | 
| 
       14 
14 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       15 
15 
     | 
    
         
             
              name: traject
         
     |