traject_horizon 0.10.2 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/traject/horizon_reader.rb +68 -81
- data/lib/traject_horizon/version.rb +1 -1
- metadata +2 -2
@@ -109,14 +109,6 @@ module Traject
|
|
109
109
|
# == Misc
|
110
110
|
#
|
111
111
|
# [horizon.batch_size] Batch size to use for fetching item/copy info on each bib. Default 400.
|
112
|
-
#
|
113
|
-
# [horizon.thread_pool] Default 2. HorizonReader uses threads to add some concurrency. If
|
114
|
-
# set to >0, the work of secondary DB queries for item/copy info and the subsequent
|
115
|
-
# yielding to caller will be done in threads. Playing with this number
|
116
|
-
# may be able to increase performance -- recommend not setting it to 0
|
117
|
-
# except maybe for debugging, having it at least 1 significantly improves
|
118
|
-
# throughput.
|
119
|
-
#
|
120
112
|
# [debug_ascii_progress] if true, will output a "<" and a ">" to stderr around every copy/item
|
121
113
|
# subsidiary fetch. See description of this setting in docs/settings.md
|
122
114
|
#
|
@@ -135,12 +127,6 @@ module Traject
|
|
135
127
|
@settings = Traject::Indexer::Settings.new( self.class.default_settings).merge(settings)
|
136
128
|
|
137
129
|
require_jars!
|
138
|
-
|
139
|
-
pool_size = settings["horizon.thread_pool"].to_i
|
140
|
-
|
141
|
-
logger.debug("HorizonReader with thread pool size #{pool_size}")
|
142
|
-
|
143
|
-
@thread_pool = Traject::ThreadPool.new( pool_size )
|
144
130
|
end
|
145
131
|
|
146
132
|
# Requires marc4j and jtds, and java_import's some classes.
|
@@ -293,20 +279,15 @@ module Traject
|
|
293
279
|
# new record! Put old one on batch queue.
|
294
280
|
record_batch << record if record
|
295
281
|
|
296
|
-
# Any exceptions from background threads?
|
297
|
-
@thread_pool.raise_collected_exception!
|
298
|
-
|
299
282
|
# prepare and yield batch?
|
300
283
|
if (record_count % batch_size == 0)
|
301
|
-
|
302
|
-
record_batch
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
batch.each do |r|
|
307
|
-
yield r
|
308
|
-
end
|
284
|
+
enhance_batch!(extra_connection, record_batch)
|
285
|
+
record_batch.each do |r|
|
286
|
+
# set current_bib_id for error logging
|
287
|
+
current_bib_id = r['001'].value
|
288
|
+
yield r
|
309
289
|
end
|
290
|
+
record_batch.clear
|
310
291
|
end
|
311
292
|
|
312
293
|
# And start new record we've encountered.
|
@@ -316,7 +297,6 @@ module Traject
|
|
316
297
|
record.append MARC::ControlField.new("001", bib_id.to_s)
|
317
298
|
end
|
318
299
|
|
319
|
-
|
320
300
|
tagord = rs.getInt("tagord");
|
321
301
|
tag = rs.getString("tag")
|
322
302
|
|
@@ -324,8 +304,6 @@ module Traject
|
|
324
304
|
# plus any of our exclude_tags.
|
325
305
|
next if tag.nil? || tag == "" || exclude_tags.include?(tag)
|
326
306
|
|
327
|
-
numeric_tag = tag.to_i if tag =~ /\A\d+\Z/
|
328
|
-
|
329
307
|
indicators = rs.getString("indicators")
|
330
308
|
|
331
309
|
# a packed byte array could be in various columns, in order of preference...
|
@@ -333,61 +311,20 @@ module Traject
|
|
333
311
|
# Have to get it as bytes and then convert it to String to avoid JDBC messing
|
334
312
|
# up the encoding marc8 grr
|
335
313
|
authtext = rs.getBytes("xref_longtext") || rs.getBytes("xref_text")
|
336
|
-
if authtext
|
337
|
-
authtext = String.from_java_bytes(authtext)
|
338
|
-
authtext.force_encoding("binary")
|
339
|
-
end
|
340
|
-
|
341
314
|
text = rs.getBytes("longtext") || rs.getBytes("text")
|
342
|
-
if text
|
343
|
-
text = String.from_java_bytes(text)
|
344
|
-
text.force_encoding("binary")
|
345
|
-
end
|
346
|
-
|
347
|
-
text = Traject::HorizonBibAuthMerge.new(tag, text, authtext).merge!
|
348
315
|
|
349
|
-
|
350
|
-
|
351
|
-
# convert from MARC8 to UTF8 if needed
|
352
|
-
text = convert_text!(text, error_handler)
|
353
|
-
|
354
|
-
if numeric_tag && numeric_tag == 0
|
355
|
-
record.leader = text
|
316
|
+
if tag == "000"
|
317
|
+
record.leader = String.from_java_bytes text
|
356
318
|
fix_leader!(record.leader)
|
357
|
-
elsif
|
358
|
-
#
|
359
|
-
|
360
|
-
# control field
|
361
|
-
record.append MARC::ControlField.new(tag, text )
|
362
|
-
else
|
363
|
-
# data field
|
364
|
-
indicator1 = indicators.slice(0)
|
365
|
-
indicator2 = indicators.slice(1)
|
366
|
-
|
367
|
-
data_field = MARC::DataField.new( tag, indicator1, indicator2 )
|
368
|
-
record.append data_field
|
369
|
-
|
370
|
-
subfields = text.split("\x1F")
|
371
|
-
|
372
|
-
subfields.each do |subfield|
|
373
|
-
next if subfield.empty?
|
374
|
-
|
375
|
-
subfield_code = subfield.slice(0)
|
376
|
-
subfield_text = subfield.slice(1, subfield.length)
|
377
|
-
|
378
|
-
data_field.append MARC::Subfield.new(subfield_code, subfield_text)
|
379
|
-
end
|
319
|
+
elsif tag != "001"
|
320
|
+
# we add an 001 ourselves with bib id in another part of code.
|
321
|
+
record.append build_marc_field!(error_handler, tag, indicators, text, authtext)
|
380
322
|
end
|
381
323
|
end
|
324
|
+
|
382
325
|
# last one
|
383
326
|
record_batch << record if record
|
384
327
|
|
385
|
-
logger.debug "HorizonReader: Waiting for threadpool work complete..."
|
386
|
-
@thread_pool.shutdown_and_wait
|
387
|
-
logger.debug "HorizonReader: threadpool work complete."
|
388
|
-
@thread_pool.raise_collected_exception!
|
389
|
-
|
390
|
-
|
391
328
|
# yield last batch
|
392
329
|
enhance_batch!(extra_connection, record_batch)
|
393
330
|
record_batch.each do |r|
|
@@ -395,10 +332,9 @@ module Traject
|
|
395
332
|
end
|
396
333
|
record_batch.clear
|
397
334
|
|
398
|
-
|
399
|
-
|
400
335
|
rescue Exception => e
|
401
336
|
logger.fatal "HorizonReader, unexpected exception at bib id:#{current_bib_id}: #{Traject::Util.exception_to_log_message(e)}"
|
337
|
+
logger.fatal e.backtrace.join("/n")
|
402
338
|
raise e
|
403
339
|
ensure
|
404
340
|
logger.info("HorizonReader: Closing all JDBC objects...")
|
@@ -421,6 +357,61 @@ module Traject
|
|
421
357
|
logger.info("HorizonReader: Closed JDBC objects")
|
422
358
|
end
|
423
359
|
|
360
|
+
# Returns a DataField or ControlField, can return
|
361
|
+
# nil if determined no field can/should be created.
|
362
|
+
#
|
363
|
+
# Do not call for field '0' (leader) or field 001,
|
364
|
+
# this doesn't handle those, will just return nil.
|
365
|
+
#
|
366
|
+
# First arg is a Marc4J ErrorHandler object, kind of a weird implementation
|
367
|
+
# detail.
|
368
|
+
#
|
369
|
+
# Other args are objects fetched from Horizon db via JDBC --
|
370
|
+
# text and authtext must be byte arrays.
|
371
|
+
def build_marc_field!(error_handler, tag, indicators, text, authtext)
|
372
|
+
# convert text and authtext from java bytes to a ruby
|
373
|
+
# binary string.
|
374
|
+
if text
|
375
|
+
text = String.from_java_bytes(text)
|
376
|
+
text.force_encoding("binary")
|
377
|
+
end
|
378
|
+
if authtext
|
379
|
+
authtext = String.from_java_bytes(authtext)
|
380
|
+
authtext.force_encoding("binary")
|
381
|
+
end
|
382
|
+
|
383
|
+
text = Traject::HorizonBibAuthMerge.new(tag, text, authtext).merge!
|
384
|
+
|
385
|
+
return nil if text.nil? # sometimes there's nothing there, skip it.
|
386
|
+
|
387
|
+
# convert from MARC8 to UTF8 if needed
|
388
|
+
text = convert_text!(text, error_handler)
|
389
|
+
|
390
|
+
if MARC::ControlField.control_tag?(tag)
|
391
|
+
# control field
|
392
|
+
return MARC::ControlField.new(tag, text )
|
393
|
+
else
|
394
|
+
# data field
|
395
|
+
indicator1 = indicators.slice(0)
|
396
|
+
indicator2 = indicators.slice(1)
|
397
|
+
|
398
|
+
data_field = MARC::DataField.new( tag, indicator1, indicator2 )
|
399
|
+
|
400
|
+
subfields = text.split("\x1F")
|
401
|
+
|
402
|
+
subfields.each do |subfield|
|
403
|
+
next if subfield.empty?
|
404
|
+
|
405
|
+
subfield_code = subfield.slice(0)
|
406
|
+
subfield_text = subfield.slice(1, subfield.length)
|
407
|
+
|
408
|
+
data_field.append MARC::Subfield.new(subfield_code, subfield_text)
|
409
|
+
end
|
410
|
+
return data_field
|
411
|
+
end
|
412
|
+
|
413
|
+
end
|
414
|
+
|
424
415
|
# Pass in an array of MARC::Records', adds fields for copy and item
|
425
416
|
# info if so configured. Returns record_batch so you can chain if you want.
|
426
417
|
def enhance_batch!(conn, record_batch)
|
@@ -528,9 +519,6 @@ module Traject
|
|
528
519
|
# It might be higher performance to refactor to re-use the same prepared statement
|
529
520
|
# for each item/copy fetch... but appears to be no great way to do that in JDBC3
|
530
521
|
# where you need to parameterize "IN" values. JDBC4 has got it, but jTDS is just JDBC3.
|
531
|
-
#
|
532
|
-
# Also, we need to be thread-safe now, so maybe better to create one
|
533
|
-
# each time, rather than risk sharing between threads accidentally.
|
534
522
|
pstmt = conn.prepareStatement(sql);
|
535
523
|
rs = pstmt.executeQuery
|
536
524
|
|
@@ -613,7 +601,6 @@ module Traject
|
|
613
601
|
def self.default_settings
|
614
602
|
{
|
615
603
|
"horizon.batch_size" => 400,
|
616
|
-
"horizon.thread_pool" => 2,
|
617
604
|
|
618
605
|
"horizon.public_only" => true,
|
619
606
|
|
metadata
CHANGED
@@ -2,14 +2,14 @@
|
|
2
2
|
name: traject_horizon
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.10.2
|
5
|
+
version: 0.11.0
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Jonathan Rochkind
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-09-
|
12
|
+
date: 2013-09-12 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: traject
|