traject_horizon 0.9.1 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/traject/horizon_reader.rb +33 -6
- data/lib/traject_horizon/version.rb +1 -1
- metadata +2 -2
@@ -109,6 +109,14 @@ module Traject
|
|
109
109
|
# == Misc
|
110
110
|
#
|
111
111
|
# [horizon.batch_size] Batch size to use for fetching item/copy info on each bib. Default 400.
|
112
|
+
#
|
113
|
+
# [horizon.thread_pool] Default 2. HorizonReader uses threads to add some concurrency. If
|
114
|
+
# set to >0, the work of secondary DB queries for item/copy info and the subsequent
|
115
|
+
# yielding to caller will be done in threads. Playing with this number
|
116
|
+
# may be able to increase performance -- recommend not setting it to 0
|
117
|
+
# except maybe for debugging, having it at least 1 significantly improves
|
118
|
+
# throughput.
|
119
|
+
#
|
112
120
|
# [debug_ascii_progress] if true, will output a "<" and a ">" to stderr around every copy/item
|
113
121
|
# subsidiary fetch. See description of this setting in docs/settings.md
|
114
122
|
#
|
@@ -136,6 +144,8 @@ module Traject
|
|
136
144
|
@settings = Traject::Indexer::Settings.new( self.class.default_settings).merge(settings)
|
137
145
|
|
138
146
|
require_jars!
|
147
|
+
|
148
|
+
@thread_pool = Traject::ThreadPool.new( settings["horizon.thread_pool"].to_i )
|
139
149
|
end
|
140
150
|
|
141
151
|
# Requires marc4j and jtds, and java_import's some classes.
|
@@ -288,15 +298,22 @@ module Traject
|
|
288
298
|
# new record! Put old one on batch queue.
|
289
299
|
record_batch << record if record
|
290
300
|
|
301
|
+
# Any exceptions from background threads?
|
302
|
+
@thread_pool.raise_collected_exception!
|
303
|
+
|
291
304
|
# prepare and yield batch?
|
292
305
|
if (record_count % batch_size == 0)
|
293
|
-
|
294
|
-
record_batch
|
295
|
-
|
296
|
-
|
297
|
-
|
306
|
+
batch = record_batch
|
307
|
+
record_batch = []
|
308
|
+
|
309
|
+
@thread_pool.maybe_in_thread_pool(batch) do |batch|
|
310
|
+
enhance_batch!(extra_connection, batch)
|
311
|
+
batch.each do |r|
|
312
|
+
# set current_bib_id for error logging
|
313
|
+
current_bib_id = r['001'].value
|
314
|
+
yield r
|
315
|
+
end
|
298
316
|
end
|
299
|
-
record_batch.clear
|
300
317
|
end
|
301
318
|
|
302
319
|
# And start new record we've encountered.
|
@@ -372,6 +389,10 @@ module Traject
|
|
372
389
|
# last one
|
373
390
|
record_batch << record if record
|
374
391
|
|
392
|
+
logger.debug "HorizonReader: Waiting for threadpool work complete..."
|
393
|
+
@thread_pool.shutdown_and_wait
|
394
|
+
logger.debug "HorizonReader: threadpool work complete."
|
395
|
+
|
375
396
|
# yield last batch
|
376
397
|
enhance_batch!(extra_connection, record_batch)
|
377
398
|
record_batch.each do |r|
|
@@ -381,6 +402,8 @@ module Traject
|
|
381
402
|
end
|
382
403
|
record_batch.clear
|
383
404
|
|
405
|
+
|
406
|
+
|
384
407
|
rescue Exception => e
|
385
408
|
logger.fatal "HorizonReader, unexpected exception at bib id:#{current_bib_id}: #{Traject::Util.exception_to_log_message(e)}"
|
386
409
|
raise e
|
@@ -516,6 +539,9 @@ module Traject
|
|
516
539
|
# It might be higher performance to refactor to re-use the same prepared statement
|
517
540
|
# for each item/copy fetch... but appears to be no great way to do that in JDBC3
|
518
541
|
# where you need to parameterize "IN" values. JDBC4 has got it, but jTDS is just JDBC3.
|
542
|
+
#
|
543
|
+
# Also, we need to be thread-safe now, so maybe better to create one
|
544
|
+
# each time, rather than risk sharing between threads accidentally.
|
519
545
|
pstmt = conn.prepareStatement(sql);
|
520
546
|
rs = pstmt.executeQuery
|
521
547
|
|
@@ -598,6 +624,7 @@ module Traject
|
|
598
624
|
def self.default_settings
|
599
625
|
{
|
600
626
|
"horizon.batch_size" => 400,
|
627
|
+
"horizon.thread_pool" => 2,
|
601
628
|
|
602
629
|
"horizon.public_only" => true,
|
603
630
|
|
metadata
CHANGED
@@ -2,14 +2,14 @@
|
|
2
2
|
name: traject_horizon
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.9.1
|
5
|
+
version: 0.10.0
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Jonathan Rochkind
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-09-
|
12
|
+
date: 2013-09-10 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: traject
|