traject_horizon 0.10.2 → 0.11.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/traject/horizon_reader.rb +68 -81
- data/lib/traject_horizon/version.rb +1 -1
- metadata +2 -2
@@ -109,14 +109,6 @@ module Traject
|
|
109
109
|
# == Misc
|
110
110
|
#
|
111
111
|
# [horizon.batch_size] Batch size to use for fetching item/copy info on each bib. Default 400.
|
112
|
-
#
|
113
|
-
# [horizon.thread_pool] Default 2. HorizonReader uses threads to add some concurrency. If
|
114
|
-
# set to >0, the work of secondary DB queries for item/copy info and the subsequent
|
115
|
-
# yielding to caller will be done in threads. Playing with this number
|
116
|
-
# may be able to increase performance -- recommend not setting it to 0
|
117
|
-
# except maybe for debugging, having it at least 1 significantly improves
|
118
|
-
# throughput.
|
119
|
-
#
|
120
112
|
# [debug_ascii_progress] if true, will output a "<" and a ">" to stderr around every copy/item
|
121
113
|
# subsidiary fetch. See description of this setting in docs/settings.md
|
122
114
|
#
|
@@ -135,12 +127,6 @@ module Traject
|
|
135
127
|
@settings = Traject::Indexer::Settings.new( self.class.default_settings).merge(settings)
|
136
128
|
|
137
129
|
require_jars!
|
138
|
-
|
139
|
-
pool_size = settings["horizon.thread_pool"].to_i
|
140
|
-
|
141
|
-
logger.debug("HorizonReader with thread pool size #{pool_size}")
|
142
|
-
|
143
|
-
@thread_pool = Traject::ThreadPool.new( pool_size )
|
144
130
|
end
|
145
131
|
|
146
132
|
# Requires marc4j and jtds, and java_import's some classes.
|
@@ -293,20 +279,15 @@ module Traject
|
|
293
279
|
# new record! Put old one on batch queue.
|
294
280
|
record_batch << record if record
|
295
281
|
|
296
|
-
# Any exceptions from background threads?
|
297
|
-
@thread_pool.raise_collected_exception!
|
298
|
-
|
299
282
|
# prepare and yield batch?
|
300
283
|
if (record_count % batch_size == 0)
|
301
|
-
|
302
|
-
record_batch
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
batch.each do |r|
|
307
|
-
yield r
|
308
|
-
end
|
284
|
+
enhance_batch!(extra_connection, record_batch)
|
285
|
+
record_batch.each do |r|
|
286
|
+
# set current_bib_id for error logging
|
287
|
+
current_bib_id = r['001'].value
|
288
|
+
yield r
|
309
289
|
end
|
290
|
+
record_batch.clear
|
310
291
|
end
|
311
292
|
|
312
293
|
# And start new record we've encountered.
|
@@ -316,7 +297,6 @@ module Traject
|
|
316
297
|
record.append MARC::ControlField.new("001", bib_id.to_s)
|
317
298
|
end
|
318
299
|
|
319
|
-
|
320
300
|
tagord = rs.getInt("tagord");
|
321
301
|
tag = rs.getString("tag")
|
322
302
|
|
@@ -324,8 +304,6 @@ module Traject
|
|
324
304
|
# plus any of our exclude_tags.
|
325
305
|
next if tag.nil? || tag == "" || exclude_tags.include?(tag)
|
326
306
|
|
327
|
-
numeric_tag = tag.to_i if tag =~ /\A\d+\Z/
|
328
|
-
|
329
307
|
indicators = rs.getString("indicators")
|
330
308
|
|
331
309
|
# a packed byte array could be in various columns, in order of preference...
|
@@ -333,61 +311,20 @@ module Traject
|
|
333
311
|
# Have to get it as bytes and then convert it to String to avoid JDBC messing
|
334
312
|
# up the encoding marc8 grr
|
335
313
|
authtext = rs.getBytes("xref_longtext") || rs.getBytes("xref_text")
|
336
|
-
if authtext
|
337
|
-
authtext = String.from_java_bytes(authtext)
|
338
|
-
authtext.force_encoding("binary")
|
339
|
-
end
|
340
|
-
|
341
314
|
text = rs.getBytes("longtext") || rs.getBytes("text")
|
342
|
-
if text
|
343
|
-
text = String.from_java_bytes(text)
|
344
|
-
text.force_encoding("binary")
|
345
|
-
end
|
346
|
-
|
347
|
-
text = Traject::HorizonBibAuthMerge.new(tag, text, authtext).merge!
|
348
315
|
|
349
|
-
|
350
|
-
|
351
|
-
# convert from MARC8 to UTF8 if needed
|
352
|
-
text = convert_text!(text, error_handler)
|
353
|
-
|
354
|
-
if numeric_tag && numeric_tag == 0
|
355
|
-
record.leader = text
|
316
|
+
if tag == "000"
|
317
|
+
record.leader = String.from_java_bytes text
|
356
318
|
fix_leader!(record.leader)
|
357
|
-
elsif
|
358
|
-
#
|
359
|
-
|
360
|
-
# control field
|
361
|
-
record.append MARC::ControlField.new(tag, text )
|
362
|
-
else
|
363
|
-
# data field
|
364
|
-
indicator1 = indicators.slice(0)
|
365
|
-
indicator2 = indicators.slice(1)
|
366
|
-
|
367
|
-
data_field = MARC::DataField.new( tag, indicator1, indicator2 )
|
368
|
-
record.append data_field
|
369
|
-
|
370
|
-
subfields = text.split("\x1F")
|
371
|
-
|
372
|
-
subfields.each do |subfield|
|
373
|
-
next if subfield.empty?
|
374
|
-
|
375
|
-
subfield_code = subfield.slice(0)
|
376
|
-
subfield_text = subfield.slice(1, subfield.length)
|
377
|
-
|
378
|
-
data_field.append MARC::Subfield.new(subfield_code, subfield_text)
|
379
|
-
end
|
319
|
+
elsif tag != "001"
|
320
|
+
# we add an 001 ourselves with bib id in another part of code.
|
321
|
+
record.append build_marc_field!(error_handler, tag, indicators, text, authtext)
|
380
322
|
end
|
381
323
|
end
|
324
|
+
|
382
325
|
# last one
|
383
326
|
record_batch << record if record
|
384
327
|
|
385
|
-
logger.debug "HorizonReader: Waiting for threadpool work complete..."
|
386
|
-
@thread_pool.shutdown_and_wait
|
387
|
-
logger.debug "HorizonReader: threadpool work complete."
|
388
|
-
@thread_pool.raise_collected_exception!
|
389
|
-
|
390
|
-
|
391
328
|
# yield last batch
|
392
329
|
enhance_batch!(extra_connection, record_batch)
|
393
330
|
record_batch.each do |r|
|
@@ -395,10 +332,9 @@ module Traject
|
|
395
332
|
end
|
396
333
|
record_batch.clear
|
397
334
|
|
398
|
-
|
399
|
-
|
400
335
|
rescue Exception => e
|
401
336
|
logger.fatal "HorizonReader, unexpected exception at bib id:#{current_bib_id}: #{Traject::Util.exception_to_log_message(e)}"
|
337
|
+
logger.fatal e.backtrace.join("/n")
|
402
338
|
raise e
|
403
339
|
ensure
|
404
340
|
logger.info("HorizonReader: Closing all JDBC objects...")
|
@@ -421,6 +357,61 @@ module Traject
|
|
421
357
|
logger.info("HorizonReader: Closed JDBC objects")
|
422
358
|
end
|
423
359
|
|
360
|
+
# Returns a DataField or ControlField, can return
|
361
|
+
# nil if determined no field can/should be created.
|
362
|
+
#
|
363
|
+
# Do not call for field '0' (leader) or field 001,
|
364
|
+
# this doesn't handle those, will just return nil.
|
365
|
+
#
|
366
|
+
# First arg is a Marc4J ErrorHandler object, kind of a weird implementation
|
367
|
+
# detail.
|
368
|
+
#
|
369
|
+
# Other args are objects fetched from Horizon db via JDBC --
|
370
|
+
# text and authtext must be byte arrays.
|
371
|
+
def build_marc_field!(error_handler, tag, indicators, text, authtext)
|
372
|
+
# convert text and authtext from java bytes to a ruby
|
373
|
+
# binary string.
|
374
|
+
if text
|
375
|
+
text = String.from_java_bytes(text)
|
376
|
+
text.force_encoding("binary")
|
377
|
+
end
|
378
|
+
if authtext
|
379
|
+
authtext = String.from_java_bytes(authtext)
|
380
|
+
authtext.force_encoding("binary")
|
381
|
+
end
|
382
|
+
|
383
|
+
text = Traject::HorizonBibAuthMerge.new(tag, text, authtext).merge!
|
384
|
+
|
385
|
+
return nil if text.nil? # sometimes there's nothing there, skip it.
|
386
|
+
|
387
|
+
# convert from MARC8 to UTF8 if needed
|
388
|
+
text = convert_text!(text, error_handler)
|
389
|
+
|
390
|
+
if MARC::ControlField.control_tag?(tag)
|
391
|
+
# control field
|
392
|
+
return MARC::ControlField.new(tag, text )
|
393
|
+
else
|
394
|
+
# data field
|
395
|
+
indicator1 = indicators.slice(0)
|
396
|
+
indicator2 = indicators.slice(1)
|
397
|
+
|
398
|
+
data_field = MARC::DataField.new( tag, indicator1, indicator2 )
|
399
|
+
|
400
|
+
subfields = text.split("\x1F")
|
401
|
+
|
402
|
+
subfields.each do |subfield|
|
403
|
+
next if subfield.empty?
|
404
|
+
|
405
|
+
subfield_code = subfield.slice(0)
|
406
|
+
subfield_text = subfield.slice(1, subfield.length)
|
407
|
+
|
408
|
+
data_field.append MARC::Subfield.new(subfield_code, subfield_text)
|
409
|
+
end
|
410
|
+
return data_field
|
411
|
+
end
|
412
|
+
|
413
|
+
end
|
414
|
+
|
424
415
|
# Pass in an array of MARC::Records', adds fields for copy and item
|
425
416
|
# info if so configured. Returns record_batch so you can chain if you want.
|
426
417
|
def enhance_batch!(conn, record_batch)
|
@@ -528,9 +519,6 @@ module Traject
|
|
528
519
|
# It might be higher performance to refactor to re-use the same prepared statement
|
529
520
|
# for each item/copy fetch... but appears to be no great way to do that in JDBC3
|
530
521
|
# where you need to parameterize "IN" values. JDBC4 has got it, but jTDS is just JDBC3.
|
531
|
-
#
|
532
|
-
# Also, we need to be thread-safe now, so maybe better to create one
|
533
|
-
# each time, rather than risk sharing between threads accidentally.
|
534
522
|
pstmt = conn.prepareStatement(sql);
|
535
523
|
rs = pstmt.executeQuery
|
536
524
|
|
@@ -613,7 +601,6 @@ module Traject
|
|
613
601
|
def self.default_settings
|
614
602
|
{
|
615
603
|
"horizon.batch_size" => 400,
|
616
|
-
"horizon.thread_pool" => 2,
|
617
604
|
|
618
605
|
"horizon.public_only" => true,
|
619
606
|
|
metadata
CHANGED
@@ -2,14 +2,14 @@
|
|
2
2
|
name: traject_horizon
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.10.2
|
5
|
+
version: 0.11.0
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Jonathan Rochkind
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-09-
|
12
|
+
date: 2013-09-12 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: traject
|