traject 0.0.2 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. data/Gemfile +4 -0
  2. data/README.md +85 -61
  3. data/Rakefile +5 -0
  4. data/bin/traject +31 -3
  5. data/doc/settings.md +74 -13
  6. data/lib/tasks/load_maps.rake +48 -0
  7. data/lib/traject/indexer/settings.rb +75 -0
  8. data/lib/traject/indexer.rb +255 -45
  9. data/lib/traject/json_writer.rb +4 -2
  10. data/lib/traject/macros/marc21.rb +18 -6
  11. data/lib/traject/macros/marc21_semantics.rb +405 -0
  12. data/lib/traject/macros/marc_format_classifier.rb +180 -0
  13. data/lib/traject/marc4j_reader.rb +160 -0
  14. data/lib/traject/marc_extractor.rb +33 -17
  15. data/lib/traject/marc_reader.rb +14 -11
  16. data/lib/traject/solrj_writer.rb +247 -9
  17. data/lib/traject/thread_pool.rb +154 -0
  18. data/lib/traject/translation_map.rb +46 -4
  19. data/lib/traject/util.rb +30 -0
  20. data/lib/traject/version.rb +1 -1
  21. data/lib/translation_maps/lcc_top_level.yaml +26 -0
  22. data/lib/translation_maps/marc_genre_007.yaml +9 -0
  23. data/lib/translation_maps/marc_genre_leader.yaml +22 -0
  24. data/lib/translation_maps/marc_geographic.yaml +589 -0
  25. data/lib/translation_maps/marc_instruments.yaml +102 -0
  26. data/lib/translation_maps/marc_languages.yaml +490 -0
  27. data/test/indexer/each_record_test.rb +34 -0
  28. data/test/indexer/macros_marc21_semantics_test.rb +206 -0
  29. data/test/indexer/macros_marc21_test.rb +10 -1
  30. data/test/indexer/map_record_test.rb +78 -8
  31. data/test/indexer/read_write_test.rb +43 -10
  32. data/test/indexer/settings_test.rb +60 -4
  33. data/test/indexer/to_field_test.rb +39 -0
  34. data/test/marc4j_reader_test.rb +75 -0
  35. data/test/marc_extractor_test.rb +62 -0
  36. data/test/marc_format_classifier_test.rb +91 -0
  37. data/test/marc_reader_test.rb +12 -0
  38. data/test/solrj_writer_test.rb +146 -43
  39. data/test/test_helper.rb +50 -0
  40. data/test/test_support/245_no_ab.marc +1 -0
  41. data/test/test_support/880_with_no_6.utf8.marc +1 -0
  42. data/test/test_support/bad_subfield_code.marc +1 -0
  43. data/test/test_support/date_resort_to_260.marc +1 -0
  44. data/test/test_support/date_type_r_missing_date2.marc +1 -0
  45. data/test/test_support/date_with_u.marc +1 -0
  46. data/test/test_support/demo_config.rb +153 -0
  47. data/test/test_support/emptyish_record.marc +1 -0
  48. data/test/test_support/louis_armstrong.marc +1 -0
  49. data/test/test_support/manuscript_online_thesis.marc +1 -0
  50. data/test/test_support/microform_online_conference.marc +1 -0
  51. data/test/test_support/multi_era.marc +1 -0
  52. data/test/test_support/multi_geo.marc +1 -0
  53. data/test/test_support/musical_cage.marc +1 -0
  54. data/test/test_support/one-marc8.mrc +1 -0
  55. data/test/test_support/online_only.marc +1 -0
  56. data/test/test_support/packed_041a_lang.marc +1 -0
  57. data/test/test_support/the_business_ren.marc +1 -0
  58. data/test/translation_map_test.rb +8 -0
  59. data/test/translation_maps/properties_map.properties +5 -0
  60. data/traject.gemspec +1 -1
  61. data/vendor/marc4j/README.md +17 -0
  62. data/vendor/marc4j/lib/marc4j-2.5.1-beta.jar +0 -0
  63. metadata +81 -2
@@ -0,0 +1,154 @@
1
module Traject
  # An abstraction wrapping a threadpool executor in some configuration choices
  # and other apparatus.
  #
  # 1) Initialize with chosen pool size -- we create fixed size pools, where
  #    core and max sizes are the same.
  #
  # 2) If initialized with nil (or 0) for pool size, no thread pool will
  #    actually be created, and all threadpool-related methods become no-ops.
  #    We call this the nil/null threadpool. A non-null threadpool requires
  #    jruby, but you can create a null Traject::ThreadPool.new(nil) under MRI
  #    without anything complaining.
  #
  # 3) Use the #maybe_in_thread_pool method to send blocks to the thread pool
  #    for execution -- if no threadpool is configured your block will just be
  #    executed in the calling thread. Be careful to not refer to any non-local
  #    variables in the block, unless the variable has an object you can
  #    use thread-safely!
  #
  # 4) Thread pools are java.util.concurrent.ThreadPoolExecutor, manually created
  #    with a work queue that will buffer up to (pool_size*3) tasks. If the queue
  #    is full, the ThreadPoolExecutor is set up to use the
  #    ThreadPoolExecutor.CallerRunsPolicy, meaning the block will end up
  #    executing in the caller's own thread. With the kind of work we're doing,
  #    where each unit of work is small and there are many of them -- the
  #    CallerRunsPolicy serves as an effective 'back pressure' mechanism to keep
  #    the work queue from getting too large and exhausting memory, when
  #    producers are faster than consumers.
  #
  # 5) Any exceptions raised by pool-executed work are captured and accumulated
  #    in a thread-safe manner, and can be re-raised in the thread of your
  #    choice by calling #raise_collected_exception!
  #
  # 6) When you are done with the threadpool, you can and must call
  #    #shutdown_and_wait, which will wait for all currently queued work
  #    to complete, then return. You can not give any more work to the pool
  #    after you do this. By default it'll wait pretty much forever, which should
  #    be fine. If you never call shutdown, the pool will keep running forever
  #    and not allow your program to exit!
  class ThreadPool
    attr_reader :pool_size, :label, :queue_capacity

    # pool_size - number of worker threads; nil or 0 (after #to_i) gives a
    #             null/no-op pool, in which case #pool_size and
    #             #queue_capacity stay nil.
    # label     - optional free-form name for this pool, exposed via #label.
    #
    # Raises whatever `require 'java'` raises when a real pool is requested
    # on a non-jruby interpreter.
    def initialize(pool_size, label = nil)
      @label = label

      # Normalise first so a String such as "4" (e.g. from a settings file)
      # sizes the pool and the queue consistently.
      size = pool_size.nil? ? 0 : pool_size.to_i
      return if size == 0

      require 'java' # trigger an exception now if we're not jruby

      @pool_size      = size # just for reflection, we don't really need it again
      @queue_capacity = size * 3

      work_queue       = java.util.concurrent.ArrayBlockingQueue.new(@queue_capacity)
      rejected_handler = java.util.concurrent.ThreadPoolExecutor::CallerRunsPolicy.new

      # keepalive times don't matter, we are setting core and max pool to
      # same thing, fixed size pool.
      @thread_pool = java.util.concurrent.ThreadPoolExecutor.new(
        @pool_size, @pool_size, 0, java.util.concurrent.TimeUnit::MILLISECONDS,
        work_queue, rejected_handler)

      # A thread-safe queue to collect exceptions cross-threads.
      # We make it small, we really only need to store the first
      # exception, we don't care too much about others. But we'll
      # keep the first 20, why not.
      @async_exception_queue = java.util.concurrent.ArrayBlockingQueue.new(20)
    end

    # Pass it a block, MAYBE gets executed in the bg in a thread pool. Maybe
    # gets executed in the calling thread.
    #
    # There are actually two 'maybes':
    #
    # * If Traject::ThreadPool was configured with null thread pool, then ALL
    #   work will be executed in calling thread.
    #
    # * If there is a thread pool, but its work queue is full, then a job
    #   will be executed in calling thread (because we configured our java
    #   thread pool with a limited sized queue, and CallerRunsPolicy rejection
    #   strategy)
    def maybe_in_thread_pool
      return yield unless @thread_pool

      @thread_pool.execute do
        begin
          yield
        rescue Exception => e
          # Deliberately broad: nothing may escape the worker, it is stored
          # so the caller can re-raise it via #raise_collected_exception!
          collect_exception(e)
        end
      end
    end

    # Just for monitoring/debugging purposes, we'll return the work queue
    # used by the threadpool (nil for a null pool). Don't recommend you do
    # anything with it, as the original java.util.concurrent docs make the
    # same recommendation.
    def queue
      @thread_pool && @thread_pool.queue
    end

    # thread-safe way of storing an exception, to raise
    # later in a different thread. We don't guarantee
    # that we can store more than one at a time, only
    # the first one recorded may be stored.
    def collect_exception(e)
      # offer will silently do nothing if the queue is full, that's fine
      # with us.
      @async_exception_queue.offer(e)
    end

    # If there's a stored collected exception, raise it
    # again now. Call this to re-raise exceptions caught in
    # other threads in the thread of your choice.
    #
    # If you call this method on a ThreadPool initialized with nil
    # as a non-functioning threadpool -- then this method is just
    # a no-op.
    def raise_collected_exception!
      return unless @async_exception_queue

      collected = @async_exception_queue.poll
      raise collected if collected
    end

    # shutdown threadpool, and wait for all work to complete.
    # this one is also a no-op if you have a null ThreadPool that
    # doesn't really have a threadpool at all.
    #
    # returns elapsed time in seconds it took to shutdown
    def shutdown_and_wait
      start_t = Time.now

      if @thread_pool
        @thread_pool.shutdown
        # We pretty much want to wait forever, although we need to give
        # a timeout. Okay, one day!
        @thread_pool.awaitTermination(1, java.util.concurrent.TimeUnit::DAYS)
      end

      Time.now - start_t
    end
  end
end
@@ -13,17 +13,18 @@ module Traject
13
13
  # or array of strings.
14
14
  #
15
15
  # What makes it more useful than a stunted hash is its ability to load
16
- # the hash definitions from configuration files, either pure ruby or
17
- # yaml.
16
+ # the hash definitions from configuration files, either pure ruby,
17
+ # yaml, or java .properties. (Limited basic .properties, don't try any fancy escaping please,
18
+ # no = or : in key names, no split lines.)
18
19
  #
19
20
  # TranslationMap.new("dir/some_file")
20
21
  #
21
22
  # Will look through the entire ruby $LOAD_PATH, for a translation_maps subdir
22
- # that contains either some_file.rb OR some_file.yaml
23
+ # that contains either some_file.rb OR some_file.yaml OR some_file.properties.
23
24
  # * Looks for "/translation_maps" subdir in load paths, so
24
25
  # for instance you can have a gem that keeps translation maps
25
26
  # in ./lib/translation_maps, and it Just Works.
26
- # * Note you do NOT supply the ".rb" or ".yaml" suffix yourself,
27
+ # * Note you do NOT supply the .rb, .yaml, or .properties suffix yourself,
27
28
  # it'll use whichever it finds (allows calling code to not care which is used).
28
29
  #
29
30
  # Ruby files just need to have their last line eval to a hash. They file
@@ -55,6 +56,8 @@ module Traject
55
56
  # When used with the #translate_array! method, one string can be replaced by multiple values
56
57
  # (array of strings) or removed (nil)
57
58
  #
59
+ # There's no way to specify multiple return values in a .properties, use .yaml or .rb for that.
60
+ #
58
61
  # == Caching
59
62
  # Lookup and loading of configuration files will be cached, for efficiency.
60
63
  # You can reset with `TranslationMap.reset_cache!`
@@ -92,12 +95,17 @@ module Traject
92
95
  $LOAD_PATH.each do |base|
93
96
  rb_file = File.join( base, "translation_maps", "#{path}.rb" )
94
97
  yaml_file = File.join( base, "translation_maps", "#{path}.yaml" )
98
+ prop_file = File.join(base, "translation_maps", "#{path}.properties" )
95
99
 
96
100
  if File.exists? rb_file
97
101
  found = eval( File.open(rb_file).read , binding, rb_file )
98
102
  break
99
103
  elsif File.exists? yaml_file
100
104
  found = YAML.load_file(yaml_file)
105
+ break
106
+ elsif File.exists? prop_file
107
+ found = Traject::TranslationMap.read_properties(prop_file)
108
+ break
101
109
  end
102
110
  end
103
111
 
@@ -180,5 +188,39 @@ module Traject
180
188
  end
181
189
  end
182
190
 
191
+ protected
192
+
193
# No built-in way to read java-style .properties, so we hack it with a
# small line-based parser. The .properties spec seems to be:
# http://docs.oracle.com/javase/6/docs/api/java/util/Properties.html#load%28java.io.Reader%29
#
# Only a limited subset is supported: one `key = value` or `key: value`
# pair per line, blank lines, and comment lines starting with `#` or `!`.
# We do NOT handle split (continued) lines, escapes, or `=`/`:` inside
# key names -- don't do that!
#
# file_name - path of the .properties file to read.
#
# Returns a Hash of String keys to String values (a key with nothing after
# the separator maps to "").
# Raises IOError, naming the file and 1-based line number, for any
# non-blank, non-comment line that can't be parsed.
def self.read_properties(file_name)
  properties = {}

  # Block form guarantees the handle is closed even when we raise below.
  File.open(file_name) do |f|
    f.each_line.with_index(1) do |raw, line_no|
      line = raw.strip

      # skip blank lines
      next if line.empty?

      # skip comment lines
      next if line =~ /^\s*[!\#].*$/

      if line =~ /\A([^:=]+)[\:\=]\s*(.*)\s*\Z/
        properties[$1.strip] = $2
      else
        raise IOError.new("Can't parse from #{file_name} line #{line_no}: #{line}")
      end
    end
  end

  properties
end
224
+
183
225
  end
184
226
  end
@@ -0,0 +1,30 @@
1
+ require 'traject'
2
+
3
module Traject
  # Just some internal utility methods
  module Util
    # Formats an exception as a short multi-line log message: the exception
    # class and message, then the first backtrace line when one exists.
    #
    # If the exception responds to #getRootCause and that returns a
    # different, non-nil object, a "Caused by" section describing that root
    # cause is appended in the same format.
    #
    # An exception that was never raised has a nil backtrace; its backtrace
    # line is simply left out rather than crashing the logger.
    #
    # e - the exception to describe.
    #
    # Returns the message String, each line newline-terminated.
    def self.exception_to_log_message(e)
      indent = " "

      lines = ["#{indent}Exception: #{e.class.name}: #{e.message}"]
      lines << "#{indent}#{first_backtrace_line(e)}" if first_backtrace_line(e)

      if e.respond_to?(:getRootCause) && e.getRootCause && e != e.getRootCause
        caused_by = e.getRootCause
        lines << "#{indent}Caused by"
        lines << "#{indent}#{caused_by.class.name}: #{caused_by.message}"
        lines << "#{indent}#{first_backtrace_line(caused_by)}" if first_backtrace_line(caused_by)
      end

      lines.map { |line| "#{line}\n" }.join
    end

    # From ruby #caller method, you get an array. Pass one line
    # of the array here, get just file and line number out.
    #
    # Handles both the older "file:12:in `meth'" form and the
    # "file:12:in 'Klass#meth'" form; a string with no ":in" part is
    # returned unchanged.
    def self.extract_caller_location(str)
      str.split(/:in [`']/).first
    end

    # First backtrace line of +exception+, or nil when it has no backtrace.
    def self.first_backtrace_line(exception)
      backtrace = exception.backtrace
      backtrace && backtrace.first
    end
    private_class_method :first_backtrace_line
  end
end
@@ -1,3 +1,3 @@
1
1
  module Traject
2
- VERSION = "0.0.2"
2
+ VERSION = "0.9.1"
3
3
  end
@@ -0,0 +1,26 @@
1
+ # Very crude basic mapping from top-level first-letter of LCC call
2
+ # number to an area name.
3
+
4
+ A: General Works
5
+ B: Philosophy, Psychology, Religion
6
+ C: Historical Sciences (Archaeology, Genealogy)
7
+ D: World History
8
+ E: History of the Americas (General)
9
+ F: History of the Americas (Local)
10
+ G: Geography, Anthropology, Recreation
11
+ H: Social Sciences
12
+ J: Political Science
13
+ K: Law
14
+ L: Education
15
+ M: Music
16
+ N: Fine Arts
17
+ P: Language & Literature
18
+ Q: Science
19
+ R: Medicine
20
+ S: Agriculture
21
+ T: Technology
22
+ U: Military Science
23
+ V: Naval Science
24
+ Z: Bibliography, Library Science, Information Resources
25
+ # W is NLM call number
26
+ W: Medicine
@@ -0,0 +1,9 @@
1
+ # attempted mapping from 007 byte 1 to format/genres
2
+
3
+ 'a': Map/Globe
4
+ 'd': Map/Globe
5
+ 'k': Image
6
+ 'q': Musical Score
7
+ 'r': Image
8
+ 'v': Video/Film
9
+ 'm': Video/Film
@@ -0,0 +1,22 @@
1
+ # First try to map leader bytes 6+7 combined, then if nothing
2
+ # try to map just leader byte 6.
3
+
4
+ # first some mappings from leader bytes 6 and 7 combined:
5
+
6
+ 'aa': Book
7
+ 'ab': Journal/Newspaper
8
+ 'am': Book
9
+ 'as': Journal/Newspaper
10
+ 'ta': Book
11
+ 'tm': Book
12
+
13
+ # Now some from just byte 6
14
+ 'c': Musical Score
15
+ 'd': Musical Score
16
+ 'e': Map/Globe
17
+ 'f': Map/Globe
18
+ 'i': Non-musical Recording
19
+ 'j': Musical Recording
20
+ 'k': Image
21
+ 'm': Software/Data
22
+ 'g': Video/Film