traject 0.0.2 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +4 -0
- data/README.md +85 -61
- data/Rakefile +5 -0
- data/bin/traject +31 -3
- data/doc/settings.md +74 -13
- data/lib/tasks/load_maps.rake +48 -0
- data/lib/traject/indexer/settings.rb +75 -0
- data/lib/traject/indexer.rb +255 -45
- data/lib/traject/json_writer.rb +4 -2
- data/lib/traject/macros/marc21.rb +18 -6
- data/lib/traject/macros/marc21_semantics.rb +405 -0
- data/lib/traject/macros/marc_format_classifier.rb +180 -0
- data/lib/traject/marc4j_reader.rb +160 -0
- data/lib/traject/marc_extractor.rb +33 -17
- data/lib/traject/marc_reader.rb +14 -11
- data/lib/traject/solrj_writer.rb +247 -9
- data/lib/traject/thread_pool.rb +154 -0
- data/lib/traject/translation_map.rb +46 -4
- data/lib/traject/util.rb +30 -0
- data/lib/traject/version.rb +1 -1
- data/lib/translation_maps/lcc_top_level.yaml +26 -0
- data/lib/translation_maps/marc_genre_007.yaml +9 -0
- data/lib/translation_maps/marc_genre_leader.yaml +22 -0
- data/lib/translation_maps/marc_geographic.yaml +589 -0
- data/lib/translation_maps/marc_instruments.yaml +102 -0
- data/lib/translation_maps/marc_languages.yaml +490 -0
- data/test/indexer/each_record_test.rb +34 -0
- data/test/indexer/macros_marc21_semantics_test.rb +206 -0
- data/test/indexer/macros_marc21_test.rb +10 -1
- data/test/indexer/map_record_test.rb +78 -8
- data/test/indexer/read_write_test.rb +43 -10
- data/test/indexer/settings_test.rb +60 -4
- data/test/indexer/to_field_test.rb +39 -0
- data/test/marc4j_reader_test.rb +75 -0
- data/test/marc_extractor_test.rb +62 -0
- data/test/marc_format_classifier_test.rb +91 -0
- data/test/marc_reader_test.rb +12 -0
- data/test/solrj_writer_test.rb +146 -43
- data/test/test_helper.rb +50 -0
- data/test/test_support/245_no_ab.marc +1 -0
- data/test/test_support/880_with_no_6.utf8.marc +1 -0
- data/test/test_support/bad_subfield_code.marc +1 -0
- data/test/test_support/date_resort_to_260.marc +1 -0
- data/test/test_support/date_type_r_missing_date2.marc +1 -0
- data/test/test_support/date_with_u.marc +1 -0
- data/test/test_support/demo_config.rb +153 -0
- data/test/test_support/emptyish_record.marc +1 -0
- data/test/test_support/louis_armstrong.marc +1 -0
- data/test/test_support/manuscript_online_thesis.marc +1 -0
- data/test/test_support/microform_online_conference.marc +1 -0
- data/test/test_support/multi_era.marc +1 -0
- data/test/test_support/multi_geo.marc +1 -0
- data/test/test_support/musical_cage.marc +1 -0
- data/test/test_support/one-marc8.mrc +1 -0
- data/test/test_support/online_only.marc +1 -0
- data/test/test_support/packed_041a_lang.marc +1 -0
- data/test/test_support/the_business_ren.marc +1 -0
- data/test/translation_map_test.rb +8 -0
- data/test/translation_maps/properties_map.properties +5 -0
- data/traject.gemspec +1 -1
- data/vendor/marc4j/README.md +17 -0
- data/vendor/marc4j/lib/marc4j-2.5.1-beta.jar +0 -0
- metadata +81 -2
@@ -0,0 +1,154 @@
|
|
1
|
+
module Traject
|
2
|
+
# An abstraction wrapping a threadpool executor in some configuration choices
|
3
|
+
# and other apparatus.
|
4
|
+
#
|
5
|
+
# 1) Initialize with chosen pool size -- we create fixed size pools, where
|
6
|
+
# core and max sizes are the same.
|
7
|
+
|
8
|
+
# 2) If initialized with nil for threadcount, no thread pool will actually
|
9
|
+
# be created, and all threadpool-related methods become no-ops. We call this
|
10
|
+
# the nil/null threadpool. A non-nil threadpool requires jruby, but you can
|
11
|
+
# create a null Traject::ThreadPool.new(nil) under MRI without anything
|
12
|
+
# complaining.
|
13
|
+
#
|
14
|
+
# 3) Use the #maybe_in_thread_pool method to send blocks to thread pool for
|
15
|
+
# execution -- if no threadpool configured your block will just be
|
16
|
+
# executed in calling thread. Be careful to not refer to any non-local
|
17
|
+
# variables in the block, unless the variable has an object you can
|
18
|
+
# use thread-safely!
|
19
|
+
#
|
20
|
+
# 4) Thread pools are java.util.concurrent.ThreadPoolExecutor, manually created
|
21
|
+
# with a work queue that will buffer up to (pool_size*3) tasks. If queue is full,
|
22
|
+
# the ThreadPoolExecutor is set up to use the ThreadPoolExecutor.CallerRunsPolicy,
|
23
|
+
# meaning the block will end up executing in caller's own thread. With the kind
|
24
|
+
# of work we're doing, where each unit of work is small and there are many of them--
|
25
|
+
# the CallerRunsPolicy serves as an effective 'back pressure' mechanism to keep
|
26
|
+
# the work queue from getting too large and exhausting memory, when producers are
|
27
|
+
# faster than consumers.
|
28
|
+
#
|
29
|
+
# 5) Any exceptions raised by pool-executed work are captured and accumulated in a thread-safe
|
30
|
+
# manner, and can be re-raised in the thread of your choice by calling
|
31
|
+
# #raise_collected_exception!
|
32
|
+
#
|
33
|
+
# 6) When you are done with the threadpool, you can and must call
|
34
|
+
# #shutdown_and_wait, which will wait for all current queued work
|
35
|
+
# to complete, then return. You can not give any more work to the pool
|
36
|
+
# after you do this. By default it'll wait pretty much forever, which should
|
37
|
+
# be fine. If you never call shutdown, the pool will keep running forever
|
38
|
+
# and not allow your program to exit!
|
39
|
+
#
|
40
|
+
# 7) We will keep track of total times a block is run in thread pool, and
|
41
|
+
# total elapsed (wall) time of running all blocks, so an average_execution_ms
|
42
|
+
# time can be given. #average_execution_ms may be inaccurate if called when
|
43
|
+
# threads are still executing, as it's not entirely thread safe (may get
|
44
|
+
# an off by one as to total iterations)
|
45
|
+
class ThreadPool
  # pool_size and queue_capacity are only set for a real (non-null) pool.
  # label is kept for interface compatibility; nothing in this class assigns
  # it, so it reads as nil.
  attr_reader :pool_size, :label, :queue_capacity

  # First arg is pool size. nil or anything whose #to_i is 0 (0, "0") gives
  # a null/no-op pool: no java classes are touched and every pool-related
  # method degrades to a no-op / run-in-caller.
  #
  # A non-zero size requires JRuby; `require 'java'` raises otherwise.
  def initialize(pool_size)
    size = pool_size.to_i
    return if size == 0

    require 'java' # trigger an exception now if we're not jruby

    @pool_size      = size # just for reflection, we don't really need it again
    # Derived from the normalized integer so a String size such as "3"
    # yields 9 rather than "333".
    @queue_capacity = size * 3

    work_queue        = java.util.concurrent.ArrayBlockingQueue.new(@queue_capacity)
    rejection_handler = java.util.concurrent.ThreadPoolExecutor::CallerRunsPolicy.new

    # keepalive times don't matter, we are setting core and max pool to
    # same thing, fixed size pool.
    @thread_pool = java.util.concurrent.ThreadPoolExecutor.new(
      @pool_size, @pool_size, 0, java.util.concurrent.TimeUnit::MILLISECONDS,
      work_queue, rejection_handler)

    # A thread-safe queue to collect exceptions cross-threads.
    # We make it small, we really only need to store the first
    # exception, we don't care too much about others. But we'll
    # keep the first 20, why not.
    @async_exception_queue = java.util.concurrent.ArrayBlockingQueue.new(20)
  end

  # Pass it a block, MAYBE gets executed in the bg in a thread pool. Maybe
  # gets executed in the calling thread.
  #
  # There are actually two 'maybes':
  #
  # * If Traject::ThreadPool was configured with null thread pool, then ALL
  #   work will be executed in calling thread (and any exception the block
  #   raises propagates directly to the caller).
  #
  # * If there is a thread pool, but its work queue is full, then a job
  #   will be executed in calling thread (because we configured our java
  #   thread pool with a limited sized queue, and CallerRunsPolicy rejection strategy)
  #
  # With a real pool, exceptions raised by the block are not propagated;
  # they are handed to #collect_exception for later re-raising.
  def maybe_in_thread_pool
    if @thread_pool
      @thread_pool.execute do
        begin
          yield
        rescue Exception => e
          # Deliberately broad: whatever the work raised is stored so
          # #raise_collected_exception! can surface it in another thread.
          collect_exception(e)
        end
      end
    else
      yield
    end
  end

  # Just for monitoring/debugging purposes, we'll return the work queue
  # used by the threadpool. Don't recommend you do anything with it, as
  # the original java.util.concurrent docs make the same recommendation.
  #
  # Returns nil for a null pool.
  def queue
    @thread_pool && @thread_pool.queue
  end

  # thread-safe way of storing an exception, to raise
  # later in a different thread. We don't guarantee
  # that we can store more than one at a time, only
  # the first one recorded may be stored.
  def collect_exception(e)
    # offer will silently do nothing if the queue is full, that's fine
    # with us.
    @async_exception_queue.offer(e)
  end

  # If there's a stored collected exception, raise it
  # again now. Call this to re-raise exceptions caught in
  # other threads in the thread of your choice.
  #
  # If you call this method on a ThreadPool initialized with nil
  # as a non-functioning threadpool -- then this method is just
  # a no-op.
  def raise_collected_exception!
    return unless @async_exception_queue

    collected = @async_exception_queue.poll
    raise collected if collected
  end

  # shutdown threadpool, and wait for all work to complete.
  # this one is also a no-op if you have a null ThreadPool that
  # doesn't really have a threadpool at all.
  #
  # returns elapsed time in seconds it took to shutdown
  def shutdown_and_wait
    start_t = Time.now

    if @thread_pool
      @thread_pool.shutdown
      # We pretty much want to wait forever, although we need to give
      # a timeout. Okay, one day!
      @thread_pool.awaitTermination(1, java.util.concurrent.TimeUnit::DAYS)
    end

    Time.now - start_t
  end
end
|
154
|
+
end
|
@@ -13,17 +13,18 @@ module Traject
|
|
13
13
|
# or array of strings.
|
14
14
|
#
|
15
15
|
# What makes it more useful than a stunted hash is its ability to load
|
16
|
-
# the hash definitions from configuration files, either pure ruby
|
17
|
-
# yaml.
|
16
|
+
# the hash definitions from configuration files, either pure ruby,
|
17
|
+
# yaml, or java .properties. (Limited basic .properties, don't try any fancy escaping please,
|
18
|
+
# no = or : in key names, no split lines.)
|
18
19
|
#
|
19
20
|
# TranslationMap.new("dir/some_file")
|
20
21
|
#
|
21
22
|
# Will look through the entire ruby $LOAD_PATH, for a translation_maps subdir
|
22
|
-
# that contains either some_file.rb OR some_file.yaml
|
23
|
+
# that contains either some_file.rb OR some_file.yaml OR some_file.properties.
|
23
24
|
# * Looks for "/translation_maps" subdir in load paths, so
|
24
25
|
# for instance you can have a gem that keeps translation maps
|
25
26
|
# in ./lib/translation_maps, and it Just Works.
|
26
|
-
# * Note you do NOT supply the
|
27
|
+
# * Note you do NOT supply the .rb, .yaml, or .properties suffix yourself,
|
27
28
|
# it'll use whichever it finds (allows calling code to not care which is used).
|
28
29
|
#
|
29
30
|
# Ruby files just need to have their last line eval to a hash. They file
|
@@ -55,6 +56,8 @@ module Traject
|
|
55
56
|
# When used with the #translate_array! method, one string can be replaced by multiple values
|
56
57
|
# (array of strings) or removed (nil)
|
57
58
|
#
|
59
|
+
# There's no way to specify multiple return values in a .properties, use .yaml or .rb for that.
|
60
|
+
#
|
58
61
|
# == Caching
|
59
62
|
# Lookup and loading of configuration files will be cached, for efficiency.
|
60
63
|
# You can reset with `TranslationMap.reset_cache!`
|
@@ -92,12 +95,17 @@ module Traject
|
|
92
95
|
$LOAD_PATH.each do |base|
|
93
96
|
rb_file = File.join( base, "translation_maps", "#{path}.rb" )
|
94
97
|
yaml_file = File.join( base, "translation_maps", "#{path}.yaml" )
|
98
|
+
prop_file = File.join(base, "translation_maps", "#{path}.properties" )
|
95
99
|
|
96
100
|
if File.exists? rb_file
|
97
101
|
found = eval( File.open(rb_file).read , binding, rb_file )
|
98
102
|
break
|
99
103
|
elsif File.exists? yaml_file
|
100
104
|
found = YAML.load_file(yaml_file)
|
105
|
+
break
|
106
|
+
elsif File.exists? prop_file
|
107
|
+
found = Traject::TranslationMap.read_properties(prop_file)
|
108
|
+
break
|
101
109
|
end
|
102
110
|
end
|
103
111
|
|
@@ -180,5 +188,39 @@ module Traject
|
|
180
188
|
end
|
181
189
|
end
|
182
190
|
|
191
|
+
protected
|
192
|
+
|
193
|
+
# No built-in way to read java-style .properties, we hack it.
# Inspired by various hacky approaches found by searching for
# "ruby java properties parse". The .properties spec seems to be:
# http://docs.oracle.com/javase/6/docs/api/java/util/Properties.html#load%28java.io.Reader%29
#
# We do NOT handle split lines, don't do that!
#
# Each non-blank, non-comment line must look like `key = value` or
# `key : value`. Comment lines start with `#` or `!`. The key may not
# contain `:` or `=`; everything after the first separator (minus
# surrounding whitespace) is the value, which may be empty.
#
# file_name - path of the .properties file to read.
#
# Returns a Hash of String keys to String values.
# Raises IOError (naming the file and 1-based line number) on a line that
# cannot be parsed. The file is closed even when that happens.
def self.read_properties(file_name)
  hash = {}

  # Block form guarantees the handle is closed if we raise mid-file.
  File.open(file_name) do |f|
    f.each_line.with_index(1) do |raw_line, line_no|
      line = raw_line.strip

      # skip blank lines
      next if line.empty?

      # skip comment lines
      next if line =~ /^\s*[!\#].*$/

      parsed = /\A([^:=]+)[\:\=]\s*(.*)\s*\Z/.match(line)
      unless parsed
        raise IOError.new("Can't parse from #{file_name} line #{line_no}: #{line}")
      end

      hash[parsed[1].strip] = parsed[2]
    end
  end

  hash
end
|
224
|
+
|
183
225
|
end
|
184
226
|
end
|
data/lib/traject/util.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'traject'
|
2
|
+
|
3
|
+
module Traject
  # Just some internal utility methods
  module Util

    # Formats an exception as a short multi-line log message: the class
    # and message, then the first backtrace frame if there is one. When the
    # exception responds to #getRootCause and that returns a different,
    # non-nil object, a "Caused by" section for the root cause is appended
    # in the same shape.
    #
    # Exceptions that were never raised have a nil backtrace; the frame
    # line is simply omitted in that case rather than blowing up. Nil
    # messages are rendered as empty strings.
    #
    # e - the exception to describe.
    #
    # Returns the message String, each line ending in "\n".
    def self.exception_to_log_message(e)
      indent = " "

      msg = "#{indent}Exception: #{e.class.name}: #{e.message}\n"
      msg += first_backtrace_line(e, indent)

      if (e.respond_to?(:getRootCause) && e.getRootCause && e != e.getRootCause )
        caused_by = e.getRootCause
        msg += "#{indent}Caused by\n"
        msg += "#{indent}#{caused_by.class.name}: #{caused_by.message}\n"
        msg += first_backtrace_line(caused_by, indent)
      end

      return msg
    end

    # From ruby #caller method, you get an array. Pass one line
    # of the array here, get just file and line number out.
    #
    # Accepts both the older "file:line:in `meth'" form and the
    # "file:line:in 'Class#meth'" form. A string with no ":in" part is
    # returned unchanged.
    def self.extract_caller_location(str)
      str.split(/:in [`']/).first
    end

    # Indented first backtrace frame of +exception+ followed by a newline,
    # or "" when the backtrace is nil or empty.
    def self.first_backtrace_line(exception, indent)
      frame = Array(exception.backtrace).first
      frame ? "#{indent}#{frame}\n" : ""
    end
    private_class_method :first_backtrace_line

  end
end
|
data/lib/traject/version.rb
CHANGED
@@ -0,0 +1,26 @@
|
|
1
|
+
# Very crude basic mapping from top-level first-letter of LCC call
|
2
|
+
# number to an area name.
|
3
|
+
|
4
|
+
A: General Works
|
5
|
+
B: Philosophy, Psychology, Religion
|
6
|
+
C: Historical Sciences (Archaeology, Genealogy)
|
7
|
+
D: World History
|
8
|
+
E: History of the Americas (General)
|
9
|
+
F: History of the Americas (Local)
|
10
|
+
G: Geography, Anthropology, Recreation
|
11
|
+
H: Social Sciences
|
12
|
+
J: Political Science
|
13
|
+
K: Law
|
14
|
+
L: Education
|
15
|
+
M: Music
|
16
|
+
N: Fine Arts
|
17
|
+
P: Language & Literature
|
18
|
+
Q: Science
|
19
|
+
R: Medicine
|
20
|
+
S: Agriculture
|
21
|
+
T: Technology
|
22
|
+
U: Military Science
|
23
|
+
V: Naval Science
|
24
|
+
Z: Bibliography, Library Science, Information Resources
|
25
|
+
# W is NLM call number
|
26
|
+
W: Medicine
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# First try to map leader bytes 6+7 combined, then if nothing
|
2
|
+
# try to map just leader byte 6.
|
3
|
+
|
4
|
+
# first some mappings from leader bytes 6 and 7 combined:
|
5
|
+
|
6
|
+
'aa': Book
|
7
|
+
'ab': Journal/Newspaper
|
8
|
+
'am': Book
|
9
|
+
'as': Journal/Newspaper
|
10
|
+
'ta': Book
|
11
|
+
'tm': Book
|
12
|
+
|
13
|
+
# Now some from just byte 6
|
14
|
+
'c': Musical Score
|
15
|
+
'd': Musical Score
|
16
|
+
'e': Map/Globe
|
17
|
+
'f': Map/Globe
|
18
|
+
'i': Non-musical Recording
|
19
|
+
'j': Musical Recording
|
20
|
+
'k': Image
|
21
|
+
'm': Software/Data
|
22
|
+
'g': Video/Film
|