traject_horizon 0.10.2 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -109,14 +109,6 @@ module Traject
109
109
  # == Misc
110
110
  #
111
111
  # [horizon.batch_size] Batch size to use for fetching item/copy info on each bib. Default 400.
112
- #
113
- # [horizon.thread_pool] Default 2. HorizonReader uses threads to add some concurrency. If
114
- # set to >0, the work of secondary DB queries for item/copy info and the subsequent
115
- # yielding to caller will be done in threads. Playing with this number
116
- # may be able to increase performance -- recommend not setting it to 0
117
- # except maybe for debugging, having it at least 1 significantly improves
118
- # throughput.
119
- #
120
112
  # [debug_ascii_progress] if true, will output a "<" and a ">" to stderr around every copy/item
121
113
  # subsidiary fetch. See description of this setting in docs/settings.md
122
114
  #
@@ -135,12 +127,6 @@ module Traject
135
127
  @settings = Traject::Indexer::Settings.new( self.class.default_settings).merge(settings)
136
128
 
137
129
  require_jars!
138
-
139
- pool_size = settings["horizon.thread_pool"].to_i
140
-
141
- logger.debug("HorizonReader with thread pool size #{pool_size}")
142
-
143
- @thread_pool = Traject::ThreadPool.new( pool_size )
144
130
  end
145
131
 
146
132
  # Requires marc4j and jtds, and java_import's some classes.
@@ -293,20 +279,15 @@ module Traject
293
279
  # new record! Put old one on batch queue.
294
280
  record_batch << record if record
295
281
 
296
- # Any exceptions from background threads?
297
- @thread_pool.raise_collected_exception!
298
-
299
282
  # prepare and yield batch?
300
283
  if (record_count % batch_size == 0)
301
- batch = record_batch
302
- record_batch = []
303
-
304
- @thread_pool.maybe_in_thread_pool(batch) do |batch|
305
- enhance_batch!(extra_connection, batch)
306
- batch.each do |r|
307
- yield r
308
- end
284
+ enhance_batch!(extra_connection, record_batch)
285
+ record_batch.each do |r|
286
+ # set current_bib_id for error logging
287
+ current_bib_id = r['001'].value
288
+ yield r
309
289
  end
290
+ record_batch.clear
310
291
  end
311
292
 
312
293
  # And start new record we've encountered.
@@ -316,7 +297,6 @@ module Traject
316
297
  record.append MARC::ControlField.new("001", bib_id.to_s)
317
298
  end
318
299
 
319
-
320
300
  tagord = rs.getInt("tagord");
321
301
  tag = rs.getString("tag")
322
302
 
@@ -324,8 +304,6 @@ module Traject
324
304
  # plus any of our exclude_tags.
325
305
  next if tag.nil? || tag == "" || exclude_tags.include?(tag)
326
306
 
327
- numeric_tag = tag.to_i if tag =~ /\A\d+\Z/
328
-
329
307
  indicators = rs.getString("indicators")
330
308
 
331
309
  # a packed byte array could be in various columns, in order of preference...
@@ -333,61 +311,20 @@ module Traject
333
311
  # Have to get it as bytes and then convert it to String to avoid JDBC messing
334
312
  # up the encoding marc8 grr
335
313
  authtext = rs.getBytes("xref_longtext") || rs.getBytes("xref_text")
336
- if authtext
337
- authtext = String.from_java_bytes(authtext)
338
- authtext.force_encoding("binary")
339
- end
340
-
341
314
  text = rs.getBytes("longtext") || rs.getBytes("text")
342
- if text
343
- text = String.from_java_bytes(text)
344
- text.force_encoding("binary")
345
- end
346
-
347
- text = Traject::HorizonBibAuthMerge.new(tag, text, authtext).merge!
348
315
 
349
- next if text.nil? # sometimes there's nothing there, skip it.
350
-
351
- # convert from MARC8 to UTF8 if needed
352
- text = convert_text!(text, error_handler)
353
-
354
- if numeric_tag && numeric_tag == 0
355
- record.leader = text
316
+ if tag == "000"
317
+ record.leader = String.from_java_bytes text
356
318
  fix_leader!(record.leader)
357
- elsif numeric_tag && numeric_tag == 1
358
- # nothing, we add the 001 ourselves first
359
- elsif numeric_tag && numeric_tag < 10
360
- # control field
361
- record.append MARC::ControlField.new(tag, text )
362
- else
363
- # data field
364
- indicator1 = indicators.slice(0)
365
- indicator2 = indicators.slice(1)
366
-
367
- data_field = MARC::DataField.new( tag, indicator1, indicator2 )
368
- record.append data_field
369
-
370
- subfields = text.split("\x1F")
371
-
372
- subfields.each do |subfield|
373
- next if subfield.empty?
374
-
375
- subfield_code = subfield.slice(0)
376
- subfield_text = subfield.slice(1, subfield.length)
377
-
378
- data_field.append MARC::Subfield.new(subfield_code, subfield_text)
379
- end
319
+ elsif tag != "001"
320
+ # we add an 001 ourselves with bib id in another part of code.
321
+ record.append build_marc_field!(error_handler, tag, indicators, text, authtext)
380
322
  end
381
323
  end
324
+
382
325
  # last one
383
326
  record_batch << record if record
384
327
 
385
- logger.debug "HorizonReader: Waiting for threadpool work complete..."
386
- @thread_pool.shutdown_and_wait
387
- logger.debug "HorizonReader: threadpool work complete."
388
- @thread_pool.raise_collected_exception!
389
-
390
-
391
328
  # yield last batch
392
329
  enhance_batch!(extra_connection, record_batch)
393
330
  record_batch.each do |r|
@@ -395,10 +332,9 @@ module Traject
395
332
  end
396
333
  record_batch.clear
397
334
 
398
-
399
-
400
335
  rescue Exception => e
401
336
  logger.fatal "HorizonReader, unexpected exception at bib id:#{current_bib_id}: #{Traject::Util.exception_to_log_message(e)}"
337
+ logger.fatal e.backtrace.join("/n")
402
338
  raise e
403
339
  ensure
404
340
  logger.info("HorizonReader: Closing all JDBC objects...")
@@ -421,6 +357,61 @@ module Traject
421
357
  logger.info("HorizonReader: Closed JDBC objects")
422
358
  end
423
359
 
360
+ # Returns a DataField or ControlField, can return
361
+ # nil if determined no field can/should be created.
362
+ #
363
+ # Do not call for field '0' (leader) or field 001,
364
+ # this doesn't handle those, will just return nil.
365
+ #
366
+ # First arg is a Marc4J ErrorHandler object, kind of a weird implementation
367
+ # detail.
368
+ #
369
+ # Other args are objects fetched from Horizon db via JDBC --
370
+ # text and authtext must be byte arrays.
371
+ def build_marc_field!(error_handler, tag, indicators, text, authtext)
372
+ # convert text and authtext from java bytes to a ruby
373
+ # binary string.
374
+ if text
375
+ text = String.from_java_bytes(text)
376
+ text.force_encoding("binary")
377
+ end
378
+ if authtext
379
+ authtext = String.from_java_bytes(authtext)
380
+ authtext.force_encoding("binary")
381
+ end
382
+
383
+ text = Traject::HorizonBibAuthMerge.new(tag, text, authtext).merge!
384
+
385
+ return nil if text.nil? # sometimes there's nothing there, skip it.
386
+
387
+ # convert from MARC8 to UTF8 if needed
388
+ text = convert_text!(text, error_handler)
389
+
390
+ if MARC::ControlField.control_tag?(tag)
391
+ # control field
392
+ return MARC::ControlField.new(tag, text )
393
+ else
394
+ # data field
395
+ indicator1 = indicators.slice(0)
396
+ indicator2 = indicators.slice(1)
397
+
398
+ data_field = MARC::DataField.new( tag, indicator1, indicator2 )
399
+
400
+ subfields = text.split("\x1F")
401
+
402
+ subfields.each do |subfield|
403
+ next if subfield.empty?
404
+
405
+ subfield_code = subfield.slice(0)
406
+ subfield_text = subfield.slice(1, subfield.length)
407
+
408
+ data_field.append MARC::Subfield.new(subfield_code, subfield_text)
409
+ end
410
+ return data_field
411
+ end
412
+
413
+ end
414
+
424
415
  # Pass in an array of MARC::Records', adds fields for copy and item
425
416
  # info if so configured. Returns record_batch so you can chain if you want.
426
417
  def enhance_batch!(conn, record_batch)
@@ -528,9 +519,6 @@ module Traject
528
519
  # It might be higher performance to refactor to re-use the same prepared statement
529
520
  # for each item/copy fetch... but appears to be no great way to do that in JDBC3
530
521
  # where you need to parameterize "IN" values. JDBC4 has got it, but jTDS is just JDBC3.
531
- #
532
- # Also, we need to be thread-safe now, so maybe better to create one
533
- # each time, rather than risk sharing between threads accidentally.
534
522
  pstmt = conn.prepareStatement(sql);
535
523
  rs = pstmt.executeQuery
536
524
 
@@ -613,7 +601,6 @@ module Traject
613
601
  def self.default_settings
614
602
  {
615
603
  "horizon.batch_size" => 400,
616
- "horizon.thread_pool" => 2,
617
604
 
618
605
  "horizon.public_only" => true,
619
606
 
@@ -1,3 +1,3 @@
1
1
  module TrajectHorizon
2
- VERSION = "0.10.2"
2
+ VERSION = "0.11.0"
3
3
  end
metadata CHANGED
@@ -2,14 +2,14 @@
2
2
  name: traject_horizon
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.10.2
5
+ version: 0.11.0
6
6
  platform: ruby
7
7
  authors:
8
8
  - Jonathan Rochkind
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-09-10 00:00:00.000000000 Z
12
+ date: 2013-09-12 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: traject