traject_horizon 0.10.2 → 0.11.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -109,14 +109,6 @@ module Traject
109
109
  # == Misc
110
110
  #
111
111
  # [horizon.batch_size] Batch size to use for fetching item/copy info on each bib. Default 400.
112
- #
113
- # [horizon.thread_pool] Default 2. HorizonReader uses threads to add some concurrency. If
114
- # set to >0, the work of secondary DB queries for item/copy info and the subsequent
115
- # yielding to caller will be done in threads. Playing with this number
116
- # may be able to increase performance -- recommend not setting it to 0
117
- # except maybe for debugging, having it at least 1 significantly improves
118
- # throughput.
119
- #
120
112
  # [debug_ascii_progress] if true, will output a "<" and a ">" to stderr around every copy/item
121
113
  # subsidiary fetch. See description of this setting in docs/settings.md
122
114
  #
@@ -135,12 +127,6 @@ module Traject
135
127
  @settings = Traject::Indexer::Settings.new( self.class.default_settings).merge(settings)
136
128
 
137
129
  require_jars!
138
-
139
- pool_size = settings["horizon.thread_pool"].to_i
140
-
141
- logger.debug("HorizonReader with thread pool size #{pool_size}")
142
-
143
- @thread_pool = Traject::ThreadPool.new( pool_size )
144
130
  end
145
131
 
146
132
  # Requires marc4j and jtds, and java_import's some classes.
@@ -293,20 +279,15 @@ module Traject
293
279
  # new record! Put old one on batch queue.
294
280
  record_batch << record if record
295
281
 
296
- # Any exceptions from background threads?
297
- @thread_pool.raise_collected_exception!
298
-
299
282
  # prepare and yield batch?
300
283
  if (record_count % batch_size == 0)
301
- batch = record_batch
302
- record_batch = []
303
-
304
- @thread_pool.maybe_in_thread_pool(batch) do |batch|
305
- enhance_batch!(extra_connection, batch)
306
- batch.each do |r|
307
- yield r
308
- end
284
+ enhance_batch!(extra_connection, record_batch)
285
+ record_batch.each do |r|
286
+ # set current_bib_id for error logging
287
+ current_bib_id = r['001'].value
288
+ yield r
309
289
  end
290
+ record_batch.clear
310
291
  end
311
292
 
312
293
  # And start new record we've encountered.
@@ -316,7 +297,6 @@ module Traject
316
297
  record.append MARC::ControlField.new("001", bib_id.to_s)
317
298
  end
318
299
 
319
-
320
300
  tagord = rs.getInt("tagord");
321
301
  tag = rs.getString("tag")
322
302
 
@@ -324,8 +304,6 @@ module Traject
324
304
  # plus any of our exclude_tags.
325
305
  next if tag.nil? || tag == "" || exclude_tags.include?(tag)
326
306
 
327
- numeric_tag = tag.to_i if tag =~ /\A\d+\Z/
328
-
329
307
  indicators = rs.getString("indicators")
330
308
 
331
309
  # a packed byte array could be in various columns, in order of preference...
@@ -333,61 +311,20 @@ module Traject
333
311
  # Have to get it as bytes and then convert it to String to avoid JDBC messing
334
312
  # up the encoding marc8 grr
335
313
  authtext = rs.getBytes("xref_longtext") || rs.getBytes("xref_text")
336
- if authtext
337
- authtext = String.from_java_bytes(authtext)
338
- authtext.force_encoding("binary")
339
- end
340
-
341
314
  text = rs.getBytes("longtext") || rs.getBytes("text")
342
- if text
343
- text = String.from_java_bytes(text)
344
- text.force_encoding("binary")
345
- end
346
-
347
- text = Traject::HorizonBibAuthMerge.new(tag, text, authtext).merge!
348
315
 
349
- next if text.nil? # sometimes there's nothing there, skip it.
350
-
351
- # convert from MARC8 to UTF8 if needed
352
- text = convert_text!(text, error_handler)
353
-
354
- if numeric_tag && numeric_tag == 0
355
- record.leader = text
316
+ if tag == "000"
317
+ record.leader = String.from_java_bytes text
356
318
  fix_leader!(record.leader)
357
- elsif numeric_tag && numeric_tag == 1
358
- # nothing, we add the 001 ourselves first
359
- elsif numeric_tag && numeric_tag < 10
360
- # control field
361
- record.append MARC::ControlField.new(tag, text )
362
- else
363
- # data field
364
- indicator1 = indicators.slice(0)
365
- indicator2 = indicators.slice(1)
366
-
367
- data_field = MARC::DataField.new( tag, indicator1, indicator2 )
368
- record.append data_field
369
-
370
- subfields = text.split("\x1F")
371
-
372
- subfields.each do |subfield|
373
- next if subfield.empty?
374
-
375
- subfield_code = subfield.slice(0)
376
- subfield_text = subfield.slice(1, subfield.length)
377
-
378
- data_field.append MARC::Subfield.new(subfield_code, subfield_text)
379
- end
319
+ elsif tag != "001"
320
+ # we add an 001 ourselves with bib id in another part of code.
321
+ record.append build_marc_field!(error_handler, tag, indicators, text, authtext)
380
322
  end
381
323
  end
324
+
382
325
  # last one
383
326
  record_batch << record if record
384
327
 
385
- logger.debug "HorizonReader: Waiting for threadpool work complete..."
386
- @thread_pool.shutdown_and_wait
387
- logger.debug "HorizonReader: threadpool work complete."
388
- @thread_pool.raise_collected_exception!
389
-
390
-
391
328
  # yield last batch
392
329
  enhance_batch!(extra_connection, record_batch)
393
330
  record_batch.each do |r|
@@ -395,10 +332,9 @@ module Traject
395
332
  end
396
333
  record_batch.clear
397
334
 
398
-
399
-
400
335
  rescue Exception => e
401
336
  logger.fatal "HorizonReader, unexpected exception at bib id:#{current_bib_id}: #{Traject::Util.exception_to_log_message(e)}"
337
+ logger.fatal e.backtrace.join("/n")
402
338
  raise e
403
339
  ensure
404
340
  logger.info("HorizonReader: Closing all JDBC objects...")
@@ -421,6 +357,61 @@ module Traject
421
357
  logger.info("HorizonReader: Closed JDBC objects")
422
358
  end
423
359
 
360
+ # Returns a DataField or ControlField, can return
361
+ # nil if determined no field can/should be created.
362
+ #
363
+ # Do not call for field '0' (leader) or field 001,
364
+ # this doesn't handle those, will just return nil.
365
+ #
366
+ # First arg is a Marc4J ErrorHandler object, kind of a weird implementation
367
+ # detail.
368
+ #
369
+ # Other args are objects fetched from Horizon db via JDBC --
370
+ # text and authtext must be byte arrays.
371
+ def build_marc_field!(error_handler, tag, indicators, text, authtext)
372
+ # convert text and authtext from java bytes to a ruby
373
+ # binary string.
374
+ if text
375
+ text = String.from_java_bytes(text)
376
+ text.force_encoding("binary")
377
+ end
378
+ if authtext
379
+ authtext = String.from_java_bytes(authtext)
380
+ authtext.force_encoding("binary")
381
+ end
382
+
383
+ text = Traject::HorizonBibAuthMerge.new(tag, text, authtext).merge!
384
+
385
+ return nil if text.nil? # sometimes there's nothing there, skip it.
386
+
387
+ # convert from MARC8 to UTF8 if needed
388
+ text = convert_text!(text, error_handler)
389
+
390
+ if MARC::ControlField.control_tag?(tag)
391
+ # control field
392
+ return MARC::ControlField.new(tag, text )
393
+ else
394
+ # data field
395
+ indicator1 = indicators.slice(0)
396
+ indicator2 = indicators.slice(1)
397
+
398
+ data_field = MARC::DataField.new( tag, indicator1, indicator2 )
399
+
400
+ subfields = text.split("\x1F")
401
+
402
+ subfields.each do |subfield|
403
+ next if subfield.empty?
404
+
405
+ subfield_code = subfield.slice(0)
406
+ subfield_text = subfield.slice(1, subfield.length)
407
+
408
+ data_field.append MARC::Subfield.new(subfield_code, subfield_text)
409
+ end
410
+ return data_field
411
+ end
412
+
413
+ end
414
+
424
415
  # Pass in an array of MARC::Records', adds fields for copy and item
425
416
  # info if so configured. Returns record_batch so you can chain if you want.
426
417
  def enhance_batch!(conn, record_batch)
@@ -528,9 +519,6 @@ module Traject
528
519
  # It might be higher performance to refactor to re-use the same prepared statement
529
520
  # for each item/copy fetch... but appears to be no great way to do that in JDBC3
530
521
  # where you need to parameterize "IN" values. JDBC4 has got it, but jTDS is just JDBC3.
531
- #
532
- # Also, we need to be thread-safe now, so maybe better to create one
533
- # each time, rather than risk sharing between threads accidentally.
534
522
  pstmt = conn.prepareStatement(sql);
535
523
  rs = pstmt.executeQuery
536
524
 
@@ -613,7 +601,6 @@ module Traject
613
601
  def self.default_settings
614
602
  {
615
603
  "horizon.batch_size" => 400,
616
- "horizon.thread_pool" => 2,
617
604
 
618
605
  "horizon.public_only" => true,
619
606
 
@@ -1,3 +1,3 @@
1
1
  module TrajectHorizon
2
- VERSION = "0.10.2"
2
+ VERSION = "0.11.0"
3
3
  end
metadata CHANGED
@@ -2,14 +2,14 @@
2
2
  name: traject_horizon
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.10.2
5
+ version: 0.11.0
6
6
  platform: ruby
7
7
  authors:
8
8
  - Jonathan Rochkind
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-09-10 00:00:00.000000000 Z
12
+ date: 2013-09-12 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: traject