traject 2.0.0-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (104) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.travis.yml +27 -0
  4. data/.yardopts +3 -0
  5. data/Gemfile +12 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +461 -0
  8. data/Rakefile +21 -0
  9. data/bench/bench.rb +30 -0
  10. data/bin/traject +16 -0
  11. data/doc/batch_execution.md +243 -0
  12. data/doc/extending.md +190 -0
  13. data/doc/indexing_rules.md +265 -0
  14. data/doc/other_commands.md +47 -0
  15. data/doc/settings.md +101 -0
  16. data/lib/tasks/load_maps.rake +48 -0
  17. data/lib/traject.rb +11 -0
  18. data/lib/traject/command_line.rb +301 -0
  19. data/lib/traject/csv_writer.rb +34 -0
  20. data/lib/traject/debug_writer.rb +47 -0
  21. data/lib/traject/delimited_writer.rb +110 -0
  22. data/lib/traject/indexer.rb +613 -0
  23. data/lib/traject/indexer/settings.rb +110 -0
  24. data/lib/traject/json_writer.rb +51 -0
  25. data/lib/traject/line_writer.rb +63 -0
  26. data/lib/traject/macros/basic.rb +9 -0
  27. data/lib/traject/macros/marc21.rb +223 -0
  28. data/lib/traject/macros/marc21_semantics.rb +584 -0
  29. data/lib/traject/macros/marc_format_classifier.rb +197 -0
  30. data/lib/traject/marc_extractor.rb +410 -0
  31. data/lib/traject/marc_reader.rb +89 -0
  32. data/lib/traject/mock_reader.rb +97 -0
  33. data/lib/traject/ndj_reader.rb +40 -0
  34. data/lib/traject/null_writer.rb +22 -0
  35. data/lib/traject/qualified_const_get.rb +40 -0
  36. data/lib/traject/solr_json_writer.rb +277 -0
  37. data/lib/traject/thread_pool.rb +161 -0
  38. data/lib/traject/translation_map.rb +267 -0
  39. data/lib/traject/util.rb +52 -0
  40. data/lib/traject/version.rb +3 -0
  41. data/lib/traject/yaml_writer.rb +9 -0
  42. data/lib/translation_maps/lcc_top_level.yaml +26 -0
  43. data/lib/translation_maps/marc_genre_007.yaml +9 -0
  44. data/lib/translation_maps/marc_genre_leader.yaml +22 -0
  45. data/lib/translation_maps/marc_geographic.yaml +589 -0
  46. data/lib/translation_maps/marc_instruments.yaml +102 -0
  47. data/lib/translation_maps/marc_languages.yaml +490 -0
  48. data/test/debug_writer_test.rb +38 -0
  49. data/test/delimited_writer_test.rb +104 -0
  50. data/test/indexer/each_record_test.rb +59 -0
  51. data/test/indexer/macros_marc21_semantics_test.rb +391 -0
  52. data/test/indexer/macros_marc21_test.rb +190 -0
  53. data/test/indexer/macros_test.rb +40 -0
  54. data/test/indexer/map_record_test.rb +209 -0
  55. data/test/indexer/read_write_test.rb +101 -0
  56. data/test/indexer/settings_test.rb +152 -0
  57. data/test/indexer/to_field_test.rb +77 -0
  58. data/test/marc_extractor_test.rb +412 -0
  59. data/test/marc_format_classifier_test.rb +98 -0
  60. data/test/marc_reader_test.rb +110 -0
  61. data/test/solr_json_writer_test.rb +248 -0
  62. data/test/test_helper.rb +90 -0
  63. data/test/test_support/245_no_ab.marc +1 -0
  64. data/test/test_support/880_with_no_6.utf8.marc +1 -0
  65. data/test/test_support/bad_subfield_code.marc +1 -0
  66. data/test/test_support/bad_utf_byte.utf8.marc +1 -0
  67. data/test/test_support/date_resort_to_260.marc +1 -0
  68. data/test/test_support/date_type_r_missing_date2.marc +1 -0
  69. data/test/test_support/date_with_u.marc +1 -0
  70. data/test/test_support/demo_config.rb +155 -0
  71. data/test/test_support/emptyish_record.marc +1 -0
  72. data/test/test_support/escaped_character_reference.marc8.marc +1 -0
  73. data/test/test_support/george_eliot.marc +1 -0
  74. data/test/test_support/hebrew880s.marc +1 -0
  75. data/test/test_support/louis_armstrong.marc +1 -0
  76. data/test/test_support/manufacturing_consent.marc +1 -0
  77. data/test/test_support/manuscript_online_thesis.marc +1 -0
  78. data/test/test_support/microform_online_conference.marc +1 -0
  79. data/test/test_support/multi_era.marc +1 -0
  80. data/test/test_support/multi_geo.marc +1 -0
  81. data/test/test_support/musical_cage.marc +1 -0
  82. data/test/test_support/nature.marc +1 -0
  83. data/test/test_support/one-marc8.mrc +1 -0
  84. data/test/test_support/online_only.marc +1 -0
  85. data/test/test_support/packed_041a_lang.marc +1 -0
  86. data/test/test_support/test_data.utf8.json +30 -0
  87. data/test/test_support/test_data.utf8.marc.xml +2609 -0
  88. data/test/test_support/test_data.utf8.mrc +1 -0
  89. data/test/test_support/test_data.utf8.mrc.gz +0 -0
  90. data/test/test_support/the_business_ren.marc +1 -0
  91. data/test/translation_map_test.rb +225 -0
  92. data/test/translation_maps/bad_ruby.rb +8 -0
  93. data/test/translation_maps/bad_yaml.yaml +1 -0
  94. data/test/translation_maps/both_map.rb +1 -0
  95. data/test/translation_maps/both_map.yaml +1 -0
  96. data/test/translation_maps/default_literal.rb +10 -0
  97. data/test/translation_maps/default_passthrough.rb +10 -0
  98. data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
  99. data/test/translation_maps/properties_map.properties +5 -0
  100. data/test/translation_maps/ruby_map.rb +10 -0
  101. data/test/translation_maps/translate_array_test.yaml +8 -0
  102. data/test/translation_maps/yaml_map.yaml +7 -0
  103. data/traject.gemspec +47 -0
  104. metadata +382 -0
@@ -0,0 +1,161 @@
1
+ require 'concurrent'
2
+ require 'thread' # for Queue
3
+
4
module Traject
  # An abstraction wrapping a Concurrent::ThreadPoolExecutor in some
  # configuration choices and other apparatus.
  #
  # 1) Initialize with chosen pool size -- we create fixed size pools, where
  #    core and max sizes are the same.
  #
  # 2) If initialized with nil or 0 for threadcount, no thread pool will
  #    actually be created, and work sent to the Traject::ThreadPool will just
  #    be executed in the caller thread. We call this a nil threadpool. One
  #    situation where it can be useful is if you are running under MRI, where
  #    multi-core parallelism isn't available, so an actual threadpool may not
  #    be useful. (Although in some cases a thread pool, especially one with
  #    size 1, can be useful in MRI for I/O blocking operations.)
  #
  # 3) Use the #maybe_in_thread_pool method to send blocks to the thread pool
  #    for execution -- if configured with a nil threadcount, your block will
  #    just be executed in the calling thread. Be careful to not refer to any
  #    non-local variables in the block, unless the variable has an object you
  #    can use thread-safely!
  #
  # 4) We configure the underlying executor with a work queue that will buffer
  #    up to (pool_size*3) tasks. If the queue is full, the executor is set up
  #    with the :caller_runs fallback policy, meaning the block will end up
  #    executing in the caller's own thread. With the kind of work we're doing,
  #    where each unit of work is small and there are many of them, the
  #    :caller_runs policy serves as an effective 'back pressure' mechanism to
  #    keep the work queue from getting too large and exhausting memory, when
  #    producers are faster than consumers.
  #
  # 5) Any exceptions raised by pool-executed work are captured and accumulated
  #    in a queue, and can be re-raised in the thread of your choice by calling
  #    #raise_collected_exception!
  #
  # 6) When you are done with the threadpool, you can and must call
  #    #shutdown_and_wait, which will wait for all currently queued work to
  #    complete, then return. You can not give any more work to the pool after
  #    you do this. If you never call shutdown, then queued or in-progress work
  #    may be abandoned when the program ends, which would be bad.
  class ThreadPool
    attr_reader :pool_size, :queue_capacity

    # @param pool_size [Integer, String, nil] number of worker threads. nil, 0
    #   (or anything whose #to_i is 0, such as "0") gives a null/no-op pool
    #   which executes all work in the caller thread; in that case #pool_size
    #   and #queue_capacity stay nil.
    def initialize(pool_size)
      # Convert first, so that string input such as "0" or "4" (e.g. from a
      # settings file) is compared and multiplied as a number.
      size = pool_size.to_i
      return if size == 0

      @pool_size      = size
      @queue_capacity = size * 3

      @thread_pool = Concurrent::ThreadPoolExecutor.new(
        :min_threads     => @pool_size,
        :max_threads     => @pool_size,
        :max_queue       => @queue_capacity,
        :fallback_policy => :caller_runs
      )

      # A queue to collect exceptions cross-threads. We really only need to
      # save the first exception, but a queue is a convenient way to store a
      # value concurrency-safely, and might as well store all of them.
      @exceptions_caught_queue = Queue.new
    end

    # Pass it a block, MAYBE gets executed in the bg in a thread pool. Maybe
    # gets executed in the calling thread.
    #
    # There are actually two 'maybes':
    #
    # * If Traject::ThreadPool was configured with a null thread pool, then ALL
    #   work will be executed in the calling thread.
    #
    # * If there is a thread pool, but its work queue is full, then a job
    #   will be executed in the calling thread (because the executor was
    #   configured with a limited size queue and the :caller_runs policy).
    #
    # You can pass arbitrary arguments to the method, that will then be passed
    # to your block -- similar to how ruby Thread.new works. This is convenient
    # for creating variables unique to the block that won't be shared outside
    # the thread:
    #
    #     thread_pool.maybe_in_thread_pool(x, y) do |x1, y1|
    #       100.times do
    #         something_with(x1)
    #       end
    #     end
    #     x = "something else"
    #     # If we hadn't passed args with block, and had just
    #     # used x in the block, it'd be the SAME x as this one,
    #     # and would be pointing to a different string now!
    #
    # Note, that just makes block-local variables, it doesn't
    # help you with whether a data structure itself is thread safe.
    #
    # Returns whatever the executor's #post returns when there is a pool, or
    # the block's own return value for a null pool.
    def maybe_in_thread_pool(*args)
      if @thread_pool
        @thread_pool.post do
          begin
            yield(*args)
          rescue Exception => e
            # Deliberately broad: whatever the work raised is stored so it can
            # be re-raised later via #raise_collected_exception!.
            collect_exception(e)
          end
        end
      else
        yield(*args)
      end
    end

    # Stores an exception in the internal queue, to raise later in a
    # different thread via #raise_collected_exception!.
    #
    # NOTE(review): on a null pool there is no queue, so calling this
    # directly raises NoMethodError; the pool itself only calls it from
    # pool-executed work.
    def collect_exception(e)
      @exceptions_caught_queue.push(e)
    end

    # If there's a stored collected exception, raise it again now. Call this
    # to re-raise exceptions caught in other threads in the thread of your
    # choice. Each call removes and raises at most one stored exception.
    #
    # If you call this method on a ThreadPool initialized with nil
    # as a non-functioning threadpool -- then this method is just
    # a no-op. Returns nil when there is nothing to raise.
    def raise_collected_exception!
      return unless @exceptions_caught_queue

      begin
        # Non-blocking pop: a separate empty?-then-pop could block forever if
        # another thread took the last exception in between.
        e = @exceptions_caught_queue.pop(true)
      rescue ThreadError
        return nil
      end

      raise e
    end

    # Shutdown threadpool, and wait for all work to complete.
    # This one is also a no-op if you have a null ThreadPool that
    # doesn't really have a threadpool at all.
    #
    # Returns elapsed time in seconds it took to shutdown.
    def shutdown_and_wait
      start_t = Time.now

      if @thread_pool
        @thread_pool.shutdown
        @thread_pool.wait_for_termination
      end

      return (Time.now - start_t)
    end

  end
end
@@ -0,0 +1,267 @@
1
+ require 'traject'
2
+
3
+ require 'yaml'
4
+ require 'dot-properties'
5
+
6
+
7
module Traject
  # A TranslationMap is basically just something that has a hash-like #[]
  # method to map from input strings to output strings:
  #
  #     translation_map["some_input"] #=> some_output
  #
  # Input is assumed to always be string, output is either string
  # or array of strings.
  #
  # What makes it more useful than a stunted hash is its ability to load
  # the hash definitions from configuration files, either pure ruby,
  # yaml, or java .properties file (not all .properties features may
  # be supported, we use dot-properties gem for reading)
  #
  # traject's `extract_marc` macro allows you to specify a :translation_map=>filename argument
  # that will automatically find and use a translation map on the resulting data:
  #
  #     extract_marc("040a", :translation_map => "languages")
  #
  # Or you can always create one yourself and use it how you like:
  #
  #     map = TranslationMap.new("languages")
  #
  # In either case, TranslationMap will look for a file named, in that example,
  # `languages.rb` or `languages.yaml` or `languages.properties`,
  # somewhere in the ruby $LOAD_PATH in a `/translation_maps` subdir.
  #
  # * Also looks for "/translation_maps" subdir in load paths, so
  #   for instance you can have a gem that keeps translation maps
  #   in ./lib/translation_maps, and it Just Works.
  #
  # * Note you do NOT supply the .rb, .yaml, or .properties suffix yourself,
  #   it'll use whichever it finds (allows calling code to not care which is used).
  #
  # Ruby files just need to have their last line eval to a hash. The file
  # will be run through `eval`, don't do it with untrusted content (naturally)
  #
  # You can also pass in a Hash for consistency to TranslationMap.new, although
  # I don't know why you'd want to.
  #
  # ## Special default handling
  #
  # The key "__default__" in the hash is treated specially. If set to a string,
  # that string will be returned by the TranslationMap for any input not otherwise
  # included. If set to the special string "__passthrough__", then for input not
  # mapped, the original input string will be returned.
  #
  # This is most useful for YAML definition files, if you are using an actual ruby
  # hash, you could just set the hash to do what you want using Hash#default_proc
  # etc.
  #
  # Or, when calling TranslationMap.new(), you can pass in options over-riding special
  # key too:
  #
  #     TranslationMap.new("something", :default => "foo")
  #     TranslationMap.new("something", :default => :passthrough)
  #
  # ## Output: String or array of strings
  #
  # The output can be a string or an array of strings, or nil. It should not be anything else.
  # When used with the #translate_array! method, one string can be replaced by multiple values
  # (array of strings) or removed (nil)
  #
  # There's no way to specify multiple return values in a .properties, use .yaml or .rb for that.
  #
  # ## Caching
  #
  # Lookup and loading of configuration files will be cached, for efficiency.
  # You can reset with `TranslationMap.reset_cache!`
  #
  # ## YAML example:
  #
  #     key: value
  #     key2: value2 multiple words fine
  #     key2b: "Although you can use quotes if you want: Or need."
  #     key3:
  #       - array
  #       - of
  #       - values look like this
  #
  # ## Alternatives
  # `Traject::TranslationMap` provides an easy way to deal with the most common translation case:
  # simple key-value stores with optional default values.
  #
  # If you need more complex translation, you can simply use `#map!`
  # or its kin to work on the `accumulator` in a block
  #
  #     # get a lousy language detection of any vernacular title
  #     require 'whatlanguage'
  #     wl = WhatLanguage.new(:all)
  #     to_field 'vernacular_language', extract_marc('245', :alternate_script=>:only) do |rec, acc|
  #       # accumulator is already filled with the values of any 880s that reference a 245 because
  #       # of the call to #extract_marc
  #       acc.map! {|x| wl.language(x) }
  #       acc.uniq!
  #     end
  #
  # Within the block, you may also be interested in using:
  # * a case-insensitive hash, perhaps like [this one](https://github.com/junegunn/insensitive_hash)
  # * a [MatchMap](https://github.com/billdueber/match_map), which implements pattern-matching logic similar to solrmarc's pattern files
  class TranslationMap
    # Finds translation map definition files on the $LOAD_PATH and memoizes
    # the loaded (frozen) hashes by name.
    class Cache
      def initialize
        @cached = Hash.new
      end

      # Returns an actual Hash -- or nil if none found. A nil result is
      # cached too, so a missing map is only searched for once until
      # #reset_cache! is called.
      def lookup(path)
        unless @cached.has_key?(path)
          @cached[path] = _lookup!(path)
        end
        return @cached[path]
      end

      # Force lookup, without using cache. Used by cache.
      # Returns the actual hash (frozen), or nil if none found.
      # May raise on syntax error in file being loaded.
      #
      # For each $LOAD_PATH entry in order, a .rb file wins over .yaml,
      # which wins over .properties; the first entry with any match is used.
      def _lookup!(path)
        found = nil

        $LOAD_PATH.each do |base|
          rb_file   = File.join(base, "translation_maps", "#{path}.rb")
          yaml_file = File.join(base, "translation_maps", "#{path}.yaml")
          prop_file = File.join(base, "translation_maps", "#{path}.properties")

          # File.exist? rather than File.exists?: the latter was removed
          # in Ruby 3.2.
          if File.exist? rb_file
            # SECURITY: the file's contents are eval'd as ruby; only use
            # with trusted translation map files. File.read closes the
            # file handle for us.
            found = eval(File.read(rb_file), binding, rb_file)
            break
          elsif File.exist? yaml_file
            found = YAML.load_file(yaml_file)
            break
          elsif File.exist? prop_file
            found = Traject::TranslationMap.read_properties(prop_file)
            break
          end
        end

        # Cached hash can't be mutated without weird consequences, let's
        # freeze it!
        found.freeze if found

        return found
      end

      # Forget everything loaded so far; later lookups hit the disk again.
      def reset_cache!
        @cached.clear
      end

    end

    attr_reader :hash
    attr_reader :default

    class << self
      attr_accessor :cache
      def reset_cache!
        cache.reset_cache!
      end
    end
    self.cache = Cache.new

    # @param defn [String, Hash, TranslationMap] name of a definition file to
    #   find via the cache (no suffix), or a Hash to use directly, or an
    #   existing TranslationMap to copy.
    # @param options [Hash] :default => value returned for unmapped input;
    #   :passthrough (or "__passthrough__") returns the input itself. When
    #   absent (or nil/false), a "__default__" key in the hash is used if present.
    # @raise [NotFound] if defn is a name and no definition file is found.
    def initialize(defn, options = {})
      if defn.kind_of? Hash
        @hash = defn
      elsif defn.kind_of? self.class
        @hash = defn.to_hash
        @default = defn.default
      else
        @hash = self.class.cache.lookup(defn)
        raise NotFound.new(defn) if @hash.nil?
      end

      if options[:default]
        @default = options[:default]
      elsif @hash.has_key? "__default__"
        @default = @hash["__default__"]
      end
    end

    # Look up one input. Unmapped input gives the configured default, the
    # input itself when the default is passthrough, or otherwise whatever
    # the underlying hash returns for a missing key.
    def [](key)
      if self.default && (! @hash.has_key?(key))
        # Accept both the string form used in definition files and the
        # symbol form documented for the :default option.
        if self.default == "__passthrough__" || self.default == :passthrough
          return key
        else
          return self.default
        end
      end

      @hash[key]
    end
    alias_method :map, :[]

    # Returns a dup of internal hash, dup so you can modify it
    # if you like. The special "__default__" key is left out.
    def to_hash
      dup = @hash.dup
      dup.delete("__default__")
      dup
    end

    # Run every element of an array through this translation map,
    # return the resulting array. If translation map returns nil,
    # original element will be missing from output.
    #
    # If an input maps to an array, each element of the array will be flattened
    # into the output.
    #
    # If an input maps to nil, it will cause the input element to be removed
    # entirely.
    def translate_array(array)
      array.each_with_object([]) do |input_element, output_array|
        output_element = self.map(input_element)
        if output_element.kind_of? Array
          output_array.concat output_element
        elsif ! output_element.nil?
          output_array << output_element
        end
      end
    end

    # Like #translate_array, but replaces the contents of the passed array
    # in place; returns that same array.
    def translate_array!(array)
      array.replace( self.translate_array(array))
    end

    # Return a new TranslationMap that results from merging argument on top of self.
    # Can be useful for taking an existing translation map, but merging a few
    # overrides on top.
    #
    #     merged_map = TranslationMap.new(something).merge TranslationMap.new(else)
    #     #...
    #     merged_map.translate_array(something) # etc
    #
    # If a default is set in the second map, it will merge over the first too.
    #
    # You can also pass in a plain hash as an arg, instead of an existing TranslationMap:
    #
    #     TranslationMap.new(something).merge("overridden_key" => "value", "a" => "")
    def merge(other_map)
      other_default = other_map.default
      # A plain Hash carries its default in the "__default__" key rather than
      # in #default; honor it so it overrides our own default as documented.
      if other_default.nil? && other_map.kind_of?(Hash)
        other_default = other_map["__default__"]
      end

      default = other_default || self.default
      TranslationMap.new(self.to_hash.merge(other_map.to_hash), :default => default)
    end

    # Raised by TranslationMap.new when no definition file can be found.
    #
    # NOTE(review): inherits from Exception rather than StandardError, so a
    # bare `rescue` will not catch it; kept as-is so existing callers see the
    # same class hierarchy.
    class NotFound < Exception
      def initialize(path)
        super("No translation map definition file found at 'translation_maps/#{path}.[rb|yaml|properties]' in load path: #{$LOAD_PATH}")
      end
    end

    # We use dot-properties gem for reading .properties files,
    # return a hash.
    #
    # This is a public class method (it is called from Cache); a bare
    # `protected` keyword would not affect a `def self.` method anyway.
    def self.read_properties(file_name)
      return DotProperties.load(file_name).to_h
    end

  end
end
@@ -0,0 +1,52 @@
1
+ require 'traject'
2
+
3
module Traject
  # Just some internal utility methods
  module Util

    # Formats an exception as a short multi-line log message: class and
    # message, then the first backtrace line when there is one. If the
    # exception responds to #getRootCause and that returns a different
    # object, a "Caused by" section for the root cause is appended too.
    #
    # @param e [Exception]
    # @return [String] each line starts with the indent and ends with "\n"
    def self.exception_to_log_message(e)
      indent = " "

      msg = "#{indent}Exception: #{e.class.name}: #{e.message}\n"
      # backtrace is nil for an exception that was built but never raised.
      backtrace = e.backtrace
      msg += "#{indent}#{backtrace.first}\n" if backtrace && !backtrace.empty?

      if (e.respond_to?(:getRootCause) && e.getRootCause && e != e.getRootCause )
        caused_by = e.getRootCause
        msg += "#{indent}Caused by\n"
        msg += "#{indent}#{caused_by.class.name}: #{caused_by.message}\n"
        cause_backtrace = caused_by.backtrace
        msg += "#{indent}#{cause_backtrace.first}\n" if cause_backtrace && !cause_backtrace.empty?
      end

      return msg
    end

    # From ruby #caller method, you get an array. Pass one line
    # of the array here, get just file and line number out.
    #
    # Handles both the older "file:12:in `meth'" form and the
    # "file:12:in 'Klass#meth'" form used by Ruby 3.4+. A line without an
    # ":in" part is returned unchanged.
    def self.extract_caller_location(str)
      str.split(/:in [`']/).first
    end

    # Ruby stdlib queue lacks a 'drain' function, we write one.
    #
    # Removes everything currently in the ruby stdlib queue, and returns
    # it as an array. Should be concurrent-safe, but queue may still have
    # some things in it after drain, if there are concurrent writers.
    def self.drain_queue(queue)
      result = []

      # Only take as many items as were present when we started, so a busy
      # producer can't keep us here indefinitely.
      queue_size = queue.size
      begin
        queue_size.times do
          # non-blocking dequeue: raises ThreadError if the queue is empty
          result << queue.deq(true)
        end
      rescue ThreadError
        # Need do nothing, queue was concurrently popped, no biggie
      end

      return result
    end

  end
end