traject_horizon 0.9.1 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/traject/horizon_reader.rb +33 -6
- data/lib/traject_horizon/version.rb +1 -1
- metadata +2 -2
@@ -109,6 +109,14 @@ module Traject
|
|
109
109
|
# == Misc
|
110
110
|
#
|
111
111
|
# [horizon.batch_size] Batch size to use for fetching item/copy info on each bib. Default 400.
|
112
|
+
#
|
113
|
+
# [horizon.thread_pool] Default 2. HorizonReader uses threads to add some concurrency. If
|
114
|
+
# set to >0, the work of secondary DB queries for item/copy info and the subsequent
|
115
|
+
# yielding to caller will be done in threads. Playing with this number
|
116
|
+
# may be able to increase performance -- recommend not setting it to 0
|
117
|
+
# except maybe for debugging, having it at least 1 significantly improves
|
118
|
+
# throughput.
|
119
|
+
#
|
112
120
|
# [debug_ascii_progress] if true, will output a "<" and a ">" to stderr around every copy/item
|
113
121
|
# subsidiary fetch. See description of this setting in docs/settings.md
|
114
122
|
#
|
@@ -136,6 +144,8 @@ module Traject
|
|
136
144
|
@settings = Traject::Indexer::Settings.new( self.class.default_settings).merge(settings)
|
137
145
|
|
138
146
|
require_jars!
|
147
|
+
|
148
|
+
@thread_pool = Traject::ThreadPool.new( settings["horizon.thread_pool"].to_i )
|
139
149
|
end
|
140
150
|
|
141
151
|
# Requires marc4j and jtds, and java_import's some classes.
|
@@ -288,15 +298,22 @@ module Traject
|
|
288
298
|
# new record! Put old one on batch queue.
|
289
299
|
record_batch << record if record
|
290
300
|
|
301
|
+
# Any exceptions from background threads?
|
302
|
+
@thread_pool.raise_collected_exception!
|
303
|
+
|
291
304
|
# prepare and yield batch?
|
292
305
|
if (record_count % batch_size == 0)
|
293
|
-
|
294
|
-
record_batch
|
295
|
-
|
296
|
-
|
297
|
-
|
306
|
+
batch = record_batch
|
307
|
+
record_batch = []
|
308
|
+
|
309
|
+
@thread_pool.maybe_in_thread_pool(batch) do |batch|
|
310
|
+
enhance_batch!(extra_connection, batch)
|
311
|
+
batch.each do |r|
|
312
|
+
# set current_bib_id for error logging
|
313
|
+
current_bib_id = r['001'].value
|
314
|
+
yield r
|
315
|
+
end
|
298
316
|
end
|
299
|
-
record_batch.clear
|
300
317
|
end
|
301
318
|
|
302
319
|
# And start new record we've encountered.
|
@@ -372,6 +389,10 @@ module Traject
|
|
372
389
|
# last one
|
373
390
|
record_batch << record if record
|
374
391
|
|
392
|
+
logger.debug "HorizonReader: Waiting for threadpool work complete..."
|
393
|
+
@thread_pool.shutdown_and_wait
|
394
|
+
logger.debug "HorizonReader: threadpool work complete."
|
395
|
+
|
375
396
|
# yield last batch
|
376
397
|
enhance_batch!(extra_connection, record_batch)
|
377
398
|
record_batch.each do |r|
|
@@ -381,6 +402,8 @@ module Traject
|
|
381
402
|
end
|
382
403
|
record_batch.clear
|
383
404
|
|
405
|
+
|
406
|
+
|
384
407
|
rescue Exception => e
|
385
408
|
logger.fatal "HorizonReader, unexpected exception at bib id:#{current_bib_id}: #{Traject::Util.exception_to_log_message(e)}"
|
386
409
|
raise e
|
@@ -516,6 +539,9 @@ module Traject
|
|
516
539
|
# It might be higher performance to refactor to re-use the same prepared statement
|
517
540
|
# for each item/copy fetch... but appears to be no great way to do that in JDBC3
|
518
541
|
# where you need to parameterize "IN" values. JDBC4 has got it, but jTDS is just JDBC3.
|
542
|
+
#
|
543
|
+
# Also, we need to be thread-safe now, so maybe better to create one
|
544
|
+
# each time, rather than risk sharing between threads accidentally.
|
519
545
|
pstmt = conn.prepareStatement(sql);
|
520
546
|
rs = pstmt.executeQuery
|
521
547
|
|
@@ -598,6 +624,7 @@ module Traject
|
|
598
624
|
def self.default_settings
|
599
625
|
{
|
600
626
|
"horizon.batch_size" => 400,
|
627
|
+
"horizon.thread_pool" => 2,
|
601
628
|
|
602
629
|
"horizon.public_only" => true,
|
603
630
|
|
metadata
CHANGED
@@ -2,14 +2,14 @@
|
|
2
2
|
name: traject_horizon
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.9.1
|
5
|
+
version: 0.10.0
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Jonathan Rochkind
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-09-
|
12
|
+
date: 2013-09-10 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: traject
|