traject_horizon 0.10.2 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/traject/horizon_reader.rb +68 -81
 - data/lib/traject_horizon/version.rb +1 -1
 - metadata +2 -2
 
| 
         @@ -109,14 +109,6 @@ module Traject 
     | 
|
| 
       109 
109 
     | 
    
         
             
              # == Misc
         
     | 
| 
       110 
110 
     | 
    
         
             
              #
         
     | 
| 
       111 
111 
     | 
    
         
             
              # [horizon.batch_size] Batch size to use for fetching item/copy info on each bib. Default 400.
         
     | 
| 
       112 
     | 
    
         
            -
              #
         
     | 
| 
       113 
     | 
    
         
            -
              # [horizon.thread_pool] Default 2. HorizonReader uses threads to add some concurrency. If
         
     | 
| 
       114 
     | 
    
         
            -
              #    set to >0, the work of secondary DB queries for item/copy info and the subsequent
         
     | 
| 
       115 
     | 
    
         
            -
              #    yielding to caller will be done in threads. Playing with this number
         
     | 
| 
       116 
     | 
    
         
            -
              #    may be able to increase performance -- recommend not setting it to 0
         
     | 
| 
       117 
     | 
    
         
            -
              #    except maybe for debugging, having it at least 1 significantly improves
         
     | 
| 
       118 
     | 
    
         
            -
              #    throughput. 
         
     | 
| 
       119 
     | 
    
         
            -
              # 
         
     | 
| 
       120 
112 
     | 
    
         
             
              # [debug_ascii_progress]  if true, will output a "<" and a ">" to stderr around every copy/item
         
     | 
| 
       121 
113 
     | 
    
         
             
              #           subsidiary fetch. See description of this setting in docs/settings.md
         
     | 
| 
       122 
114 
     | 
    
         
             
              #
         
     | 
| 
         @@ -135,12 +127,6 @@ module Traject 
     | 
|
| 
       135 
127 
     | 
    
         
             
                  @settings = Traject::Indexer::Settings.new( self.class.default_settings).merge(settings)
         
     | 
| 
       136 
128 
     | 
    
         | 
| 
       137 
129 
     | 
    
         
             
                  require_jars!
         
     | 
| 
       138 
     | 
    
         
            -
             
     | 
| 
       139 
     | 
    
         
            -
                  pool_size     = settings["horizon.thread_pool"].to_i
         
     | 
| 
       140 
     | 
    
         
            -
             
     | 
| 
       141 
     | 
    
         
            -
                  logger.debug("HorizonReader with thread pool size #{pool_size}")
         
     | 
| 
       142 
     | 
    
         
            -
             
     | 
| 
       143 
     | 
    
         
            -
                  @thread_pool  = Traject::ThreadPool.new( pool_size )
         
     | 
| 
       144 
130 
     | 
    
         
             
                end
         
     | 
| 
       145 
131 
     | 
    
         | 
| 
       146 
132 
     | 
    
         
             
                # Requires marc4j and jtds, and java_import's some classes.
         
     | 
| 
         @@ -293,20 +279,15 @@ module Traject 
     | 
|
| 
       293 
279 
     | 
    
         
             
                      # new record! Put old one on batch queue.
         
     | 
| 
       294 
280 
     | 
    
         
             
                      record_batch << record if record
         
     | 
| 
       295 
281 
     | 
    
         | 
| 
       296 
     | 
    
         
            -
                      # Any exceptions from background threads?
         
     | 
| 
       297 
     | 
    
         
            -
                      @thread_pool.raise_collected_exception!
         
     | 
| 
       298 
     | 
    
         
            -
             
     | 
| 
       299 
282 
     | 
    
         
             
                      # prepare and yield batch?
         
     | 
| 
       300 
283 
     | 
    
         
             
                      if (record_count % batch_size == 0)
         
     | 
| 
       301 
     | 
    
         
            -
                         
     | 
| 
       302 
     | 
    
         
            -
                        record_batch  
     | 
| 
       303 
     | 
    
         
            -
             
     | 
| 
       304 
     | 
    
         
            -
             
     | 
| 
       305 
     | 
    
         
            -
                           
     | 
| 
       306 
     | 
    
         
            -
                          batch.each do |r|
         
     | 
| 
       307 
     | 
    
         
            -
                            yield r
         
     | 
| 
       308 
     | 
    
         
            -
                          end
         
     | 
| 
      
 284 
     | 
    
         
            +
                        enhance_batch!(extra_connection, record_batch)
         
     | 
| 
      
 285 
     | 
    
         
            +
                        record_batch.each do |r|
         
     | 
| 
      
 286 
     | 
    
         
            +
                          # set current_bib_id for error logging
         
     | 
| 
      
 287 
     | 
    
         
            +
                          current_bib_id = r['001'].value
         
     | 
| 
      
 288 
     | 
    
         
            +
                          yield r
         
     | 
| 
       309 
289 
     | 
    
         
             
                        end
         
     | 
| 
      
 290 
     | 
    
         
            +
                        record_batch.clear
         
     | 
| 
       310 
291 
     | 
    
         
             
                      end
         
     | 
| 
       311 
292 
     | 
    
         | 
| 
       312 
293 
     | 
    
         
             
                      # And start new record we've encountered.
         
     | 
| 
         @@ -316,7 +297,6 @@ module Traject 
     | 
|
| 
       316 
297 
     | 
    
         
             
                      record.append MARC::ControlField.new("001", bib_id.to_s)
         
     | 
| 
       317 
298 
     | 
    
         
             
                    end
         
     | 
| 
       318 
299 
     | 
    
         | 
| 
       319 
     | 
    
         
            -
             
     | 
| 
       320 
300 
     | 
    
         
             
                    tagord      = rs.getInt("tagord");
         
     | 
| 
       321 
301 
     | 
    
         
             
                    tag         = rs.getString("tag")
         
     | 
| 
       322 
302 
     | 
    
         | 
| 
         @@ -324,8 +304,6 @@ module Traject 
     | 
|
| 
       324 
304 
     | 
    
         
             
                    # plus any of our exclude_tags.
         
     | 
| 
       325 
305 
     | 
    
         
             
                    next if tag.nil? || tag == "" || exclude_tags.include?(tag)
         
     | 
| 
       326 
306 
     | 
    
         | 
| 
       327 
     | 
    
         
            -
                    numeric_tag = tag.to_i if tag =~ /\A\d+\Z/
         
     | 
| 
       328 
     | 
    
         
            -
             
     | 
| 
       329 
307 
     | 
    
         
             
                    indicators = rs.getString("indicators")
         
     | 
| 
       330 
308 
     | 
    
         | 
| 
       331 
309 
     | 
    
         
             
                    # a packed byte array could be in various columns, in order of preference...
         
     | 
| 
         @@ -333,61 +311,20 @@ module Traject 
     | 
|
| 
       333 
311 
     | 
    
         
             
                    # Have to get it as bytes and then convert it to String to avoid JDBC messing
         
     | 
| 
       334 
312 
     | 
    
         
             
                    # up the encoding marc8 grr
         
     | 
| 
       335 
313 
     | 
    
         
             
                    authtext = rs.getBytes("xref_longtext") || rs.getBytes("xref_text")
         
     | 
| 
       336 
     | 
    
         
            -
                    if authtext
         
     | 
| 
       337 
     | 
    
         
            -
                      authtext = String.from_java_bytes(authtext)
         
     | 
| 
       338 
     | 
    
         
            -
                      authtext.force_encoding("binary")
         
     | 
| 
       339 
     | 
    
         
            -
                    end
         
     | 
| 
       340 
     | 
    
         
            -
             
     | 
| 
       341 
314 
     | 
    
         
             
                    text     = rs.getBytes("longtext") || rs.getBytes("text")
         
     | 
| 
       342 
     | 
    
         
            -
                    if text
         
     | 
| 
       343 
     | 
    
         
            -
                      text = String.from_java_bytes(text)
         
     | 
| 
       344 
     | 
    
         
            -
                      text.force_encoding("binary")
         
     | 
| 
       345 
     | 
    
         
            -
                    end
         
     | 
| 
       346 
     | 
    
         
            -
             
     | 
| 
       347 
     | 
    
         
            -
                    text = Traject::HorizonBibAuthMerge.new(tag, text, authtext).merge!
         
     | 
| 
       348 
315 
     | 
    
         | 
| 
       349 
     | 
    
         
            -
                     
     | 
| 
       350 
     | 
    
         
            -
             
     | 
| 
       351 
     | 
    
         
            -
                    # convert from MARC8 to UTF8 if needed
         
     | 
| 
       352 
     | 
    
         
            -
                    text = convert_text!(text, error_handler)
         
     | 
| 
       353 
     | 
    
         
            -
             
     | 
| 
       354 
     | 
    
         
            -
                    if numeric_tag && numeric_tag == 0
         
     | 
| 
       355 
     | 
    
         
            -
                      record.leader = text
         
     | 
| 
      
 316 
     | 
    
         
            +
                    if tag == "000"
         
     | 
| 
      
 317 
     | 
    
         
            +
                      record.leader =  String.from_java_bytes text
         
     | 
| 
       356 
318 
     | 
    
         
             
                      fix_leader!(record.leader)
         
     | 
| 
       357 
     | 
    
         
            -
                    elsif  
     | 
| 
       358 
     | 
    
         
            -
                      #  
     | 
| 
       359 
     | 
    
         
            -
             
     | 
| 
       360 
     | 
    
         
            -
                      # control field
         
     | 
| 
       361 
     | 
    
         
            -
                      record.append MARC::ControlField.new(tag, text )
         
     | 
| 
       362 
     | 
    
         
            -
                    else
         
     | 
| 
       363 
     | 
    
         
            -
                      # data field
         
     | 
| 
       364 
     | 
    
         
            -
                      indicator1 = indicators.slice(0)
         
     | 
| 
       365 
     | 
    
         
            -
                      indicator2 = indicators.slice(1)
         
     | 
| 
       366 
     | 
    
         
            -
             
     | 
| 
       367 
     | 
    
         
            -
                      data_field = MARC::DataField.new(  tag,  indicator1, indicator2 )
         
     | 
| 
       368 
     | 
    
         
            -
                      record.append data_field
         
     | 
| 
       369 
     | 
    
         
            -
             
     | 
| 
       370 
     | 
    
         
            -
                      subfields  = text.split("\x1F")
         
     | 
| 
       371 
     | 
    
         
            -
             
     | 
| 
       372 
     | 
    
         
            -
                      subfields.each do |subfield|
         
     | 
| 
       373 
     | 
    
         
            -
                        next if subfield.empty?
         
     | 
| 
       374 
     | 
    
         
            -
             
     | 
| 
       375 
     | 
    
         
            -
                        subfield_code = subfield.slice(0)
         
     | 
| 
       376 
     | 
    
         
            -
                        subfield_text = subfield.slice(1, subfield.length)
         
     | 
| 
       377 
     | 
    
         
            -
             
     | 
| 
       378 
     | 
    
         
            -
                        data_field.append MARC::Subfield.new(subfield_code, subfield_text)
         
     | 
| 
       379 
     | 
    
         
            -
                      end
         
     | 
| 
      
 319 
     | 
    
         
            +
                    elsif tag != "001"
         
     | 
| 
      
 320 
     | 
    
         
            +
                      # we add an 001 ourselves with bib id in another part of code.
         
     | 
| 
      
 321 
     | 
    
         
            +
                      record.append build_marc_field!(error_handler, tag, indicators, text, authtext)
         
     | 
| 
       380 
322 
     | 
    
         
             
                    end
         
     | 
| 
       381 
323 
     | 
    
         
             
                  end
         
     | 
| 
      
 324 
     | 
    
         
            +
             
     | 
| 
       382 
325 
     | 
    
         
             
                  # last one
         
     | 
| 
       383 
326 
     | 
    
         
             
                  record_batch << record if record
         
     | 
| 
       384 
327 
     | 
    
         | 
| 
       385 
     | 
    
         
            -
                  logger.debug "HorizonReader: Waiting for threadpool work complete..."
         
     | 
| 
       386 
     | 
    
         
            -
                  @thread_pool.shutdown_and_wait
         
     | 
| 
       387 
     | 
    
         
            -
                  logger.debug "HorizonReader: threadpool work complete."
         
     | 
| 
       388 
     | 
    
         
            -
                  @thread_pool.raise_collected_exception!
         
     | 
| 
       389 
     | 
    
         
            -
             
     | 
| 
       390 
     | 
    
         
            -
             
     | 
| 
       391 
328 
     | 
    
         
             
                  # yield last batch
         
     | 
| 
       392 
329 
     | 
    
         
             
                  enhance_batch!(extra_connection, record_batch)
         
     | 
| 
       393 
330 
     | 
    
         
             
                  record_batch.each do |r|
         
     | 
| 
         @@ -395,10 +332,9 @@ module Traject 
     | 
|
| 
       395 
332 
     | 
    
         
             
                  end
         
     | 
| 
       396 
333 
     | 
    
         
             
                  record_batch.clear
         
     | 
| 
       397 
334 
     | 
    
         | 
| 
       398 
     | 
    
         
            -
             
     | 
| 
       399 
     | 
    
         
            -
             
     | 
| 
       400 
335 
     | 
    
         
             
                rescue Exception => e
         
     | 
| 
       401 
336 
     | 
    
         
             
                  logger.fatal "HorizonReader, unexpected exception at bib id:#{current_bib_id}: #{Traject::Util.exception_to_log_message(e)}"
         
     | 
| 
      
 337 
     | 
    
         
            +
                  logger.fatal e.backtrace.join("/n")
         
     | 
| 
       402 
338 
     | 
    
         
             
                  raise e
         
     | 
| 
       403 
339 
     | 
    
         
             
                ensure
         
     | 
| 
       404 
340 
     | 
    
         
             
                  logger.info("HorizonReader: Closing all JDBC objects...")
         
     | 
| 
         @@ -421,6 +357,61 @@ module Traject 
     | 
|
| 
       421 
357 
     | 
    
         
             
                  logger.info("HorizonReader: Closed JDBC objects")
         
     | 
| 
       422 
358 
     | 
    
         
             
                end
         
     | 
| 
       423 
359 
     | 
    
         | 
| 
      
 360 
     | 
    
         
            +
                # Returns a DataField or ControlField, can return
         
     | 
| 
      
 361 
     | 
    
         
            +
                # nil if determined no field can/should be created.
         
     | 
| 
      
 362 
     | 
    
         
            +
                #
         
     | 
| 
      
 363 
     | 
    
         
            +
                # Do not call for field '0' (leader) or field 001,
         
     | 
| 
      
 364 
     | 
    
         
            +
                # this doesn't handle those, will just return nil.
         
     | 
| 
      
 365 
     | 
    
         
            +
                #
         
     | 
| 
      
 366 
     | 
    
         
            +
                # First arg is a Marc4J ErrorHandler object, kind of a weird implementation
         
     | 
| 
      
 367 
     | 
    
         
            +
                # detail.
         
     | 
| 
      
 368 
     | 
    
         
            +
                #
         
     | 
| 
      
 369 
     | 
    
         
            +
                # Other args are objects fetched from Horizon db via JDBC --
         
     | 
| 
      
 370 
     | 
    
         
            +
                # text and authtext must be byte arrays.
         
     | 
| 
      
 371 
     | 
    
         
            +
                def build_marc_field!(error_handler, tag, indicators, text, authtext)
         
     | 
| 
      
 372 
     | 
    
         
            +
                  # convert text and authtext from java bytes to a ruby
         
     | 
| 
      
 373 
     | 
    
         
            +
                  # binary string.
         
     | 
| 
      
 374 
     | 
    
         
            +
                  if text
         
     | 
| 
      
 375 
     | 
    
         
            +
                    text = String.from_java_bytes(text)
         
     | 
| 
      
 376 
     | 
    
         
            +
                    text.force_encoding("binary")
         
     | 
| 
      
 377 
     | 
    
         
            +
                  end
         
     | 
| 
      
 378 
     | 
    
         
            +
                  if authtext
         
     | 
| 
      
 379 
     | 
    
         
            +
                    authtext = String.from_java_bytes(authtext)
         
     | 
| 
      
 380 
     | 
    
         
            +
                    authtext.force_encoding("binary")
         
     | 
| 
      
 381 
     | 
    
         
            +
                  end
         
     | 
| 
      
 382 
     | 
    
         
            +
             
     | 
| 
      
 383 
     | 
    
         
            +
                  text = Traject::HorizonBibAuthMerge.new(tag, text, authtext).merge!
         
     | 
| 
      
 384 
     | 
    
         
            +
             
     | 
| 
      
 385 
     | 
    
         
            +
                  return nil if text.nil? # sometimes there's nothing there, skip it.
         
     | 
| 
      
 386 
     | 
    
         
            +
             
     | 
| 
      
 387 
     | 
    
         
            +
                  # convert from MARC8 to UTF8 if needed
         
     | 
| 
      
 388 
     | 
    
         
            +
                  text = convert_text!(text, error_handler)
         
     | 
| 
      
 389 
     | 
    
         
            +
             
     | 
| 
      
 390 
     | 
    
         
            +
                  if MARC::ControlField.control_tag?(tag)
         
     | 
| 
      
 391 
     | 
    
         
            +
                    # control field
         
     | 
| 
      
 392 
     | 
    
         
            +
                    return MARC::ControlField.new(tag, text )
         
     | 
| 
      
 393 
     | 
    
         
            +
                  else
         
     | 
| 
      
 394 
     | 
    
         
            +
                    # data field
         
     | 
| 
      
 395 
     | 
    
         
            +
                    indicator1 = indicators.slice(0)
         
     | 
| 
      
 396 
     | 
    
         
            +
                    indicator2 = indicators.slice(1)
         
     | 
| 
      
 397 
     | 
    
         
            +
             
     | 
| 
      
 398 
     | 
    
         
            +
                    data_field = MARC::DataField.new(  tag,  indicator1, indicator2 )
         
     | 
| 
      
 399 
     | 
    
         
            +
             
     | 
| 
      
 400 
     | 
    
         
            +
                    subfields  = text.split("\x1F")
         
     | 
| 
      
 401 
     | 
    
         
            +
             
     | 
| 
      
 402 
     | 
    
         
            +
                    subfields.each do |subfield|
         
     | 
| 
      
 403 
     | 
    
         
            +
                      next if subfield.empty?
         
     | 
| 
      
 404 
     | 
    
         
            +
             
     | 
| 
      
 405 
     | 
    
         
            +
                      subfield_code = subfield.slice(0)
         
     | 
| 
      
 406 
     | 
    
         
            +
                      subfield_text = subfield.slice(1, subfield.length)
         
     | 
| 
      
 407 
     | 
    
         
            +
             
     | 
| 
      
 408 
     | 
    
         
            +
                      data_field.append MARC::Subfield.new(subfield_code, subfield_text)
         
     | 
| 
      
 409 
     | 
    
         
            +
                    end
         
     | 
| 
      
 410 
     | 
    
         
            +
                    return data_field
         
     | 
| 
      
 411 
     | 
    
         
            +
                  end
         
     | 
| 
      
 412 
     | 
    
         
            +
             
     | 
| 
      
 413 
     | 
    
         
            +
                end
         
     | 
| 
      
 414 
     | 
    
         
            +
             
     | 
| 
       424 
415 
     | 
    
         
             
                # Pass in an array of MARC::Records', adds fields for copy and item
         
     | 
| 
       425 
416 
     | 
    
         
             
                # info if so configured. Returns record_batch so you can chain if you want.
         
     | 
| 
       426 
417 
     | 
    
         
             
                def enhance_batch!(conn, record_batch)
         
     | 
| 
         @@ -528,9 +519,6 @@ module Traject 
     | 
|
| 
       528 
519 
     | 
    
         
             
                  # It might be higher performance to refactor to re-use the same prepared statement
         
     | 
| 
       529 
520 
     | 
    
         
             
                  # for each item/copy fetch... but appears to be no great way to do that in JDBC3
         
     | 
| 
       530 
521 
     | 
    
         
             
                  # where you need to parameterize "IN" values. JDBC4 has got it, but jTDS is just JDBC3.
         
     | 
| 
       531 
     | 
    
         
            -
                  #
         
     | 
| 
       532 
     | 
    
         
            -
                  # Also, we need to be thread-safe now, so maybe better to create one
         
     | 
| 
       533 
     | 
    
         
            -
                  # each time, rather than risk sharing between threads accidentally. 
         
     | 
| 
       534 
522 
     | 
    
         
             
                  pstmt = conn.prepareStatement(sql);
         
     | 
| 
       535 
523 
     | 
    
         
             
                  rs = pstmt.executeQuery
         
     | 
| 
       536 
524 
     | 
    
         | 
| 
         @@ -613,7 +601,6 @@ module Traject 
     | 
|
| 
       613 
601 
     | 
    
         
             
                def self.default_settings
         
     | 
| 
       614 
602 
     | 
    
         
             
                  {
         
     | 
| 
       615 
603 
     | 
    
         
             
                    "horizon.batch_size" => 400,
         
     | 
| 
       616 
     | 
    
         
            -
                    "horizon.thread_pool" => 2,
         
     | 
| 
       617 
604 
     | 
    
         | 
| 
       618 
605 
     | 
    
         
             
                    "horizon.public_only" => true,
         
     | 
| 
       619 
606 
     | 
    
         | 
    
        metadata
    CHANGED
    
    | 
         @@ -2,14 +2,14 @@ 
     | 
|
| 
       2 
2 
     | 
    
         
             
            name: traject_horizon
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
4 
     | 
    
         
             
              prerelease:
         
     | 
| 
       5 
     | 
    
         
            -
              version: 0.10.2
     | 
| 
      
 5 
     | 
    
         
            +
              version: 0.11.0
         
     | 
| 
       6 
6 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       7 
7 
     | 
    
         
             
            authors:
         
     | 
| 
       8 
8 
     | 
    
         
             
            - Jonathan Rochkind
         
     | 
| 
       9 
9 
     | 
    
         
             
            autorequire:
         
     | 
| 
       10 
10 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       11 
11 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       12 
     | 
    
         
            -
            date: 2013-09- 
     | 
| 
      
 12 
     | 
    
         
            +
            date: 2013-09-12 00:00:00.000000000 Z
         
     | 
| 
       13 
13 
     | 
    
         
             
            dependencies:
         
     | 
| 
       14 
14 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       15 
15 
     | 
    
         
             
              name: traject
         
     |