traject 2.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.travis.yml +27 -0
  4. data/.yardopts +3 -0
  5. data/Gemfile +12 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +461 -0
  8. data/Rakefile +21 -0
  9. data/bench/bench.rb +30 -0
  10. data/bin/traject +16 -0
  11. data/doc/batch_execution.md +243 -0
  12. data/doc/extending.md +190 -0
  13. data/doc/indexing_rules.md +265 -0
  14. data/doc/other_commands.md +47 -0
  15. data/doc/settings.md +101 -0
  16. data/lib/tasks/load_maps.rake +48 -0
  17. data/lib/traject.rb +11 -0
  18. data/lib/traject/command_line.rb +301 -0
  19. data/lib/traject/csv_writer.rb +34 -0
  20. data/lib/traject/debug_writer.rb +47 -0
  21. data/lib/traject/delimited_writer.rb +110 -0
  22. data/lib/traject/indexer.rb +613 -0
  23. data/lib/traject/indexer/settings.rb +110 -0
  24. data/lib/traject/json_writer.rb +51 -0
  25. data/lib/traject/line_writer.rb +63 -0
  26. data/lib/traject/macros/basic.rb +9 -0
  27. data/lib/traject/macros/marc21.rb +223 -0
  28. data/lib/traject/macros/marc21_semantics.rb +584 -0
  29. data/lib/traject/macros/marc_format_classifier.rb +197 -0
  30. data/lib/traject/marc_extractor.rb +410 -0
  31. data/lib/traject/marc_reader.rb +89 -0
  32. data/lib/traject/mock_reader.rb +97 -0
  33. data/lib/traject/ndj_reader.rb +40 -0
  34. data/lib/traject/null_writer.rb +22 -0
  35. data/lib/traject/qualified_const_get.rb +40 -0
  36. data/lib/traject/solr_json_writer.rb +277 -0
  37. data/lib/traject/thread_pool.rb +161 -0
  38. data/lib/traject/translation_map.rb +267 -0
  39. data/lib/traject/util.rb +52 -0
  40. data/lib/traject/version.rb +3 -0
  41. data/lib/traject/yaml_writer.rb +9 -0
  42. data/lib/translation_maps/lcc_top_level.yaml +26 -0
  43. data/lib/translation_maps/marc_genre_007.yaml +9 -0
  44. data/lib/translation_maps/marc_genre_leader.yaml +22 -0
  45. data/lib/translation_maps/marc_geographic.yaml +589 -0
  46. data/lib/translation_maps/marc_instruments.yaml +102 -0
  47. data/lib/translation_maps/marc_languages.yaml +490 -0
  48. data/test/debug_writer_test.rb +38 -0
  49. data/test/delimited_writer_test.rb +104 -0
  50. data/test/indexer/each_record_test.rb +59 -0
  51. data/test/indexer/macros_marc21_semantics_test.rb +391 -0
  52. data/test/indexer/macros_marc21_test.rb +190 -0
  53. data/test/indexer/macros_test.rb +40 -0
  54. data/test/indexer/map_record_test.rb +209 -0
  55. data/test/indexer/read_write_test.rb +101 -0
  56. data/test/indexer/settings_test.rb +152 -0
  57. data/test/indexer/to_field_test.rb +77 -0
  58. data/test/marc_extractor_test.rb +412 -0
  59. data/test/marc_format_classifier_test.rb +98 -0
  60. data/test/marc_reader_test.rb +110 -0
  61. data/test/solr_json_writer_test.rb +248 -0
  62. data/test/test_helper.rb +90 -0
  63. data/test/test_support/245_no_ab.marc +1 -0
  64. data/test/test_support/880_with_no_6.utf8.marc +1 -0
  65. data/test/test_support/bad_subfield_code.marc +1 -0
  66. data/test/test_support/bad_utf_byte.utf8.marc +1 -0
  67. data/test/test_support/date_resort_to_260.marc +1 -0
  68. data/test/test_support/date_type_r_missing_date2.marc +1 -0
  69. data/test/test_support/date_with_u.marc +1 -0
  70. data/test/test_support/demo_config.rb +155 -0
  71. data/test/test_support/emptyish_record.marc +1 -0
  72. data/test/test_support/escaped_character_reference.marc8.marc +1 -0
  73. data/test/test_support/george_eliot.marc +1 -0
  74. data/test/test_support/hebrew880s.marc +1 -0
  75. data/test/test_support/louis_armstrong.marc +1 -0
  76. data/test/test_support/manufacturing_consent.marc +1 -0
  77. data/test/test_support/manuscript_online_thesis.marc +1 -0
  78. data/test/test_support/microform_online_conference.marc +1 -0
  79. data/test/test_support/multi_era.marc +1 -0
  80. data/test/test_support/multi_geo.marc +1 -0
  81. data/test/test_support/musical_cage.marc +1 -0
  82. data/test/test_support/nature.marc +1 -0
  83. data/test/test_support/one-marc8.mrc +1 -0
  84. data/test/test_support/online_only.marc +1 -0
  85. data/test/test_support/packed_041a_lang.marc +1 -0
  86. data/test/test_support/test_data.utf8.json +30 -0
  87. data/test/test_support/test_data.utf8.marc.xml +2609 -0
  88. data/test/test_support/test_data.utf8.mrc +1 -0
  89. data/test/test_support/test_data.utf8.mrc.gz +0 -0
  90. data/test/test_support/the_business_ren.marc +1 -0
  91. data/test/translation_map_test.rb +225 -0
  92. data/test/translation_maps/bad_ruby.rb +8 -0
  93. data/test/translation_maps/bad_yaml.yaml +1 -0
  94. data/test/translation_maps/both_map.rb +1 -0
  95. data/test/translation_maps/both_map.yaml +1 -0
  96. data/test/translation_maps/default_literal.rb +10 -0
  97. data/test/translation_maps/default_passthrough.rb +10 -0
  98. data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
  99. data/test/translation_maps/properties_map.properties +5 -0
  100. data/test/translation_maps/ruby_map.rb +10 -0
  101. data/test/translation_maps/translate_array_test.yaml +8 -0
  102. data/test/translation_maps/yaml_map.yaml +7 -0
  103. data/traject.gemspec +47 -0
  104. metadata +382 -0
@@ -0,0 +1,161 @@
1
+ require 'concurrent'
2
+ require 'thread' # for Queue
3
+
4
+ module Traject
5
+ # An abstraction wrapping a Concurrent::ThreadPool in some configuration choices
6
+ # and other apparatus. Concurrent::ThreadPool is a Java ThreadPool executor on
7
+ # jruby for performance, and is ruby-concurrent's own ruby implementation otherwise.
8
+ #
9
+ # 1) Initialize with chosen pool size -- we create fixed size pools, where
10
+ # core and max sizes are the same.
11
+ #
12
+ # 2) If initialized with nil or 0 for threadcount, no thread pool will actually
13
+ # be created, and work sent to the Traject::ThreadPool will just be executed
14
+ # in the caller thread. We call this a nil threadpool. One situation it can be useful
15
+ # is if you are running under MRI, where multi-core parallelism isn't available, so
16
+ # an actual threadpool may not be useful. (Although in some cases a thread pool,
17
+ # especially one with size 1, can be useful in MRI for I/O blocking operations)
18
+ #
19
+ # 3) Use the #maybe_in_thread_pool method to send blocks to thread pool for
20
+ # execution -- if configured with a nil threadcount, your block will just be
21
+ # executed in calling thread. Be careful to not refer to any non-local
22
+ # variables in the block, unless the variable has an object you can
23
+ # use thread-safely!
24
+ #
25
+ # 4) We configure our underlying Concurrent::ThreadPool
26
+ # with a work queue that will buffer up to (pool_size*3) tasks. If the queue is full,
27
+ # the underlying Concurrent::ThreadPool is set up to use the :caller_runs policy
28
+ # meaning the block will end up executing in caller's own thread. With the kind
29
+ # of work we're doing, where each unit of work is small and there are many of them--
30
+ # the :caller_runs policy serves as an effective 'back pressure' mechanism to keep
31
+ # the work queue from getting too large and exhausting memory, when producers are
32
+ # faster than consumers.
33
+ #
34
+ # 5) Any exceptions raised by pool-executed work are captured and accumulated in a thread-safe
35
+ # manner, and can be re-raised in the thread of your choice by calling
36
+ # #raise_collected_exception!
37
+ #
38
+ # 6) When you are done with the threadpool, you can and must call
39
+ # #shutdown_and_wait, which will wait for all current queued work
40
+ # to complete, then return. You can not give any more work to the pool
41
+ # after you do this. By default it'll wait pretty much forever, which should
42
+ # be fine. If you never call shutdown, then queued or in-progress work
43
+ # may be abandoned when the program ends, which would be bad.
44
+ #
45
+ # 7) We will keep track of total times a block is run in thread pool, and
46
+ # total elapsed (wall) time of running all blocks, so an average_execution_ms
47
+ # time can be given. #average_execution_ms may be inaccurate if called when
48
+ # threads are still executing, as it's not entirely thread safe (may get
49
+ # an off by one as to total iterations)
50
class ThreadPool
  # pool_size and queue_capacity are both nil for a null (no-op) pool.
  attr_reader :pool_size, :queue_capacity

  # Builds the pool wrapper.
  #
  # pool_size is the number of worker threads. It is normalized with #to_i
  # before use, so a numeric String such as "3" behaves like 3. A value of
  # nil or 0 (or anything whose #to_i is 0, e.g. "0") produces a null/no-op
  # pool: no executor is created and all work runs in the caller thread.
  def initialize(pool_size)
    size = pool_size.to_i # nil.to_i == 0
    return if size == 0

    @pool_size      = size
    # Derived from the normalized Integer, never from the raw argument
    # (a String times 3 would be string repetition, not arithmetic).
    @queue_capacity = size * 3

    # Fixed-size pool: min and max thread counts are the same. When the
    # bounded queue is full, :caller_runs makes the posting thread run the
    # job itself.
    @thread_pool = Concurrent::ThreadPoolExecutor.new(
      :min_threads     => @pool_size,
      :max_threads     => @pool_size,
      :max_queue       => @queue_capacity,
      :fallback_policy => :caller_runs
    )

    # A thread-safe queue to collect exceptions cross-threads.
    # We really only need to save the first exception, but a queue
    # is a convenient way to store a value concurrency-safely, and
    # might as well store all of them.
    @exceptions_caught_queue = Queue.new
  end

  # Pass it a block, MAYBE gets executed in the bg in a thread pool. Maybe
  # gets executed in the calling thread.
  #
  # There are actually two 'maybes':
  #
  # * If Traject::ThreadPool was configured with null thread pool, then ALL
  #   work will be executed in calling thread.
  #
  # * If there is a thread pool, but its work queue is full, then a job
  #   will be executed in calling thread (because the executor is configured
  #   with a limited sized queue and the :caller_runs fallback policy)
  #
  # You can pass arbitrary arguments to the method, that will then be passed
  # to your block -- similar to how ruby Thread.new works. This is convenient
  # for creating variables unique to the block that won't be shared outside
  # the thread:
  #
  #     thread_pool.maybe_in_thread_pool(x, y) do |x1, y1|
  #       100.times do
  #         something_with(x1)
  #       end
  #     end
  #     x = "something else"
  #     # If we hadn't passed args with block, and had just
  #     # used x in the block, it'd be the SAME x as this one,
  #     # and would be pointing to a different string now!
  #
  # Note, that just makes block-local variables, it doesn't
  # help you with whether a data structure itself is thread safe.
  #
  # Returns the block's value for a null pool; otherwise returns whatever
  # the underlying executor's #post returns.
  def maybe_in_thread_pool(*args)
    # Null pool: run synchronously, exceptions propagate to the caller.
    return yield(*args) unless @thread_pool

    @thread_pool.post do
      begin
        yield(*args)
      rescue Exception => e
        # Deliberately rescues Exception (not just StandardError): anything
        # raised by the job is stored so it can be re-raised later via
        # #raise_collected_exception! in a thread of the caller's choosing.
        collect_exception(e)
      end
    end
  end

  # thread-safe way of storing an exception, to raise
  # later in a different thread. We don't guarantee
  # that we can store more than one at a time, only
  # the first one recorded may be stored.
  def collect_exception(e)
    @exceptions_caught_queue.push(e)
  end

  # If there's a stored collected exception, raise it
  # again now. Call this to re-raise exceptions caught in
  # other threads in the thread of your choice.
  #
  # If you call this method on a ThreadPool initialized with nil
  # as a non-functioning threadpool -- then this method is just
  # a no-op.
  def raise_collected_exception!
    return unless @exceptions_caught_queue && !@exceptions_caught_queue.empty?

    raise @exceptions_caught_queue.pop
  end

  # shutdown threadpool, and wait for all work to complete.
  # this one is also a no-op if you have a null ThreadPool that
  # doesn't really have a threadpool at all.
  #
  # returns elapsed time in seconds it took to shutdown
  def shutdown_and_wait
    start_t = Time.now

    if @thread_pool
      @thread_pool.shutdown
      @thread_pool.wait_for_termination
    end

    Time.now - start_t
  end

end
161
+ end
@@ -0,0 +1,267 @@
1
+ require 'traject'
2
+
3
+ require 'yaml'
4
+ require 'dot-properties'
5
+
6
+
7
+ module Traject
8
+ # A TranslationMap is basically just something that has a hash-like #[]
9
+ # method to map from input strings to output strings:
10
+ #
11
+ # translation_map["some_input"] #=> some_output
12
+ #
13
+ # Input is assumed to always be string, output is either string
14
+ # or array of strings.
15
+ #
16
+ # What makes it more useful than a stunted hash is its ability to load
17
+ # the hash definitions from configuration files, either pure ruby,
18
+ # yaml, or java .properties file (not all .properties features may
19
+ # be supported, we use dot-properties gem for reading)
20
+ #
21
+ # traject's `extract_marc` macro allows you to specify a :translation_map=>filename argument
22
+ # that will automatically find and use a translation map on the resulting data:
23
+ #
24
+ # extract_marc("040a", :translation_map => "languages")
25
+ #
26
+ # Or you can always create one yourself and use it how you like:
27
+ #
28
+ # map = TranslationMap.new("languages")
29
+ #
30
+ # In either case, TranslationMap will look for a file named, in that example,
31
+ # `languages.rb` or `languages.yaml` or `languages.properties`,
32
+ # somewhere in the ruby $LOAD_PATH in a `/translation_maps` subdir.
33
+ #
34
+ # * Also looks for "/translation_maps" subdir in load paths, so
35
+ # for instance you can have a gem that keeps translation maps
36
+ # in ./lib/translation_maps, and it Just Works.
37
+ #
38
+ # * Note you do NOT supply the .rb, .yaml, or .properties suffix yourself,
39
+ # it'll use whichever it finds (allows calling code to not care which is used).
40
+ #
41
+ # Ruby files just need to have their last line eval to a hash. The file
42
+ # will be run through `eval`, don't do it with untrusted content (naturally)
43
+ #
44
+ # You can also pass in a Hash for consistency to TranslationMap.new, although
45
+ # I don't know why you'd want to.
46
+ #
47
+ # ## Special default handling
48
+ #
49
+ # The key "__default__" in the hash is treated specially. If set to a string,
50
+ # that string will be returned by the TranslationMap for any input not otherwise
51
+ # included. If set to the special string "__passthrough__", then for input not
52
+ # mapped, the original input string will be returned.
53
+ #
54
+ # This is most useful for YAML definition files, if you are using an actual ruby
55
+ # hash, you could just set the hash to do what you want using Hash#default_proc
56
+ # etc.
57
+ #
58
+ # Or, when calling TranslationMap.new(), you can pass in options over-riding special
59
+ # key too:
60
+ #
61
+ # TranslationMap.new("something", :default => "foo")
62
+ # TranslationMap.new("something", :default => :passthrough)
63
+ #
64
+ # ## Output: String or array of strings
65
+ #
66
+ # The output can be a string or an array of strings, or nil. It should not be anything else.
67
+ # When used with the #translate_array! method, one string can be replaced by multiple values
68
+ # (array of strings) or removed (nil)
69
+ #
70
+ # There's no way to specify multiple return values in a .properties, use .yaml or .rb for that.
71
+ #
72
+ # ## Caching
73
+ #
74
+ # Lookup and loading of configuration files will be cached, for efficiency.
75
+ # You can reset with `TranslationMap.reset_cache!`
76
+ #
77
+ # ## YAML example:
78
+ #
79
+ # key: value
80
+ # key2: value2 multiple words fine
81
+ # key2b: "Although you can use quotes if you want: Or need."
82
+ # key3:
83
+ # - array
84
+ # - of
85
+ # - values look like this
86
+ #
87
+ # ## Alternatives
88
+ # `Traject::TranslationMap` provides an easy way to deal with the most common translation case:
89
+ # simple key-value stores with optional default values.
90
+ #
91
+ # If you need more complex translation, you can simply use `#map!`
92
+ # or its kin to work on the `accumulator` in a block
93
+ #
94
+ #
95
+ #
96
+ # # get a lousy language detection of any vernacular title
97
+ # require 'whatlanguage'
98
+ # wl = WhatLanguage.new(:all)
99
+ # to_field 'vernacular_language', extract_marc('245', :alternate_script=>:only) do |rec, acc|
100
+ # # accumulator is already filled with the values of any 880s that reference a 245 because
101
+ # # of the call to #extract_marc
102
+ # acc.map! {|x| wl.language(x) }
103
+ # acc.uniq!
104
+ # end
105
+ # Within the block, you may also be interested in using:
106
+ # * a case-insensitive hash, perhaps like [this one](https://github.com/junegunn/insensitive_hash)
107
+ # * a [MatchMap](https://github.com/billdueber/match_map), which implements pattern-matching logic similar to solrmarc's pattern files
108
+ class TranslationMap
109
+ class Cache
110
+ def initialize
111
+ @cached = Hash.new
112
+ end
113
+
114
+ # Returns an actual Hash -- or nil if none found.
115
+ def lookup(path)
116
+ unless @cached.has_key?(path)
117
+ @cached[path] = _lookup!(path)
118
+ end
119
+ return @cached[path]
120
+ end
121
+
122
+ # force lookup, without using cache.
123
+ # used by cache. Returns the actual hash.
124
+ # Returns nil if none found.
125
+ # May raise on syntax error in file being loaded.
126
+ def _lookup!(path)
127
+ found = nil
128
+
129
+ $LOAD_PATH.each do |base|
130
+ rb_file = File.join( base, "translation_maps", "#{path}.rb" )
131
+ yaml_file = File.join( base, "translation_maps", "#{path}.yaml" )
132
+ prop_file = File.join(base, "translation_maps", "#{path}.properties" )
133
+
134
+ if File.exists? rb_file
135
+ found = eval( File.open(rb_file).read , binding, rb_file )
136
+ break
137
+ elsif File.exists? yaml_file
138
+ found = YAML.load_file(yaml_file)
139
+ break
140
+ elsif File.exists? prop_file
141
+ found = Traject::TranslationMap.read_properties(prop_file)
142
+ break
143
+ end
144
+ end
145
+
146
+ # Cached hash can't be mutated without weird consequences, let's
147
+ # freeze it!
148
+ found.freeze if found
149
+
150
+ return found
151
+ end
152
+
153
+ def reset_cache!
154
+ @cached.clear
155
+ end
156
+
157
+ end
158
+
159
+ attr_reader :hash
160
+ attr_reader :default
161
+
162
+ class << self
163
+ attr_accessor :cache
164
+ def reset_cache!
165
+ cache.reset_cache!
166
+ end
167
+ end
168
+ self.cache = Cache.new
169
+
170
+
171
+ def initialize(defn, options = {})
172
+ if defn.kind_of? Hash
173
+ @hash = defn
174
+ elsif defn.kind_of? self.class
175
+ @hash = defn.to_hash
176
+ @default = defn.default
177
+ else
178
+ @hash = self.class.cache.lookup(defn)
179
+ raise NotFound.new(defn) if @hash.nil?
180
+ end
181
+
182
+ if options[:default]
183
+ @default = options[:default]
184
+ elsif @hash.has_key? "__default__"
185
+ @default = @hash["__default__"]
186
+ end
187
+ end
188
+
189
+ def [](key)
190
+ if self.default && (! @hash.has_key?(key))
191
+ if self.default == "__passthrough__"
192
+ return key
193
+ else
194
+ return self.default
195
+ end
196
+ end
197
+
198
+ @hash[key]
199
+ end
200
+ alias_method :map, :[]
201
+
202
+ # Returns a dup of internal hash, dup so you can modify it
203
+ # if you like.
204
+ def to_hash
205
+ dup = @hash.dup
206
+ dup.delete("__default__")
207
+ dup
208
+ end
209
+
210
+ # Run every element of an array through this translation map,
211
+ # return the resulting array. If translation map returns nil,
212
+ # original element will be missing from output.
213
+ #
214
+ # If an input maps to an array, each element of the array will be flattened
215
+ # into the output.
216
+ #
217
+ # If an input maps to nil, it will cause the input element to be removed
218
+ # entirely.
219
+ def translate_array(array)
220
+ array.each_with_object([]) do |input_element, output_array|
221
+ output_element = self.map(input_element)
222
+ if output_element.kind_of? Array
223
+ output_array.concat output_element
224
+ elsif ! output_element.nil?
225
+ output_array << output_element
226
+ end
227
+ end
228
+ end
229
+
230
+ def translate_array!(array)
231
+ array.replace( self.translate_array(array))
232
+ end
233
+
234
+ # Return a new TranslationMap that results from merging argument on top of self.
235
+ # Can be useful for taking an existing translation map, but merging a few
236
+ # overrides on top.
237
+ #
238
+ # merged_map = TranslationMap.new(something).merge TranslationMap.new(else)
239
+ # #...
240
+ # merged_map.translate_array(something) # etc
241
+ #
242
+ # If a default is set in the second map, it will merge over the first too.
243
+ #
244
+ # You can also pass in a plain hash as an arg, instead of an existing TranslationMap:
245
+ #
246
+ # TranslationMap.new(something).merge("overridden_key" => "value", "a" => "")
247
+ def merge(other_map)
248
+ default = other_map.default || self.default
249
+ TranslationMap.new(self.to_hash.merge(other_map.to_hash), :default => default)
250
+ end
251
+
252
+ class NotFound < Exception
253
+ def initialize(path)
254
+ super("No translation map definition file found at 'translation_maps/#{path}.[rb|yaml|properties]' in load path: #{$LOAD_PATH}")
255
+ end
256
+ end
257
+
258
+ protected
259
+
260
+ # We use dot-properties gem for reading .properties files,
261
+ # return a hash.
262
+ def self.read_properties(file_name)
263
+ return DotProperties.load(file_name).to_h
264
+ end
265
+
266
+ end
267
+ end
@@ -0,0 +1,52 @@
1
+ require 'traject'
2
+
3
+ module Traject
4
+ # Just some internal utility methods
5
module Util

  # Formats an exception as a short multi-line log message: the class name
  # and message on the first line, then the first backtrace line when one
  # is available. If the exception responds to #getRootCause and has a root
  # cause distinct from itself, a "Caused by" section is appended in the
  # same shape.
  #
  # An exception that was never raised has a nil backtrace; in that case
  # (or when the backtrace is empty) the backtrace line is simply omitted.
  def self.exception_to_log_message(e)
    indent = " "

    msg = indent + "Exception: " + e.class.name.to_s + ": " + e.message.to_s + "\n"
    first_frame = (e.backtrace || []).first
    msg += indent + first_frame + "\n" if first_frame

    if (e.respond_to?(:getRootCause) && e.getRootCause && e != e.getRootCause )
      caused_by = e.getRootCause
      msg += indent + "Caused by\n"
      msg += indent + caused_by.class.name.to_s + ": " + caused_by.message.to_s + "\n"
      cause_frame = (caused_by.backtrace || []).first
      msg += indent + cause_frame + "\n" if cause_frame
    end

    return msg
  end

  # From ruby #caller method, you get an array. Pass one line
  # of the array here, get just file and line number out.
  #
  # Accepts both quoting styles for the method part: ":in `meth'" and the
  # newer ":in 'Klass#meth'". A line with no ":in" part is returned whole.
  def self.extract_caller_location(str)
    str.split(/:in [`']/).first
  end

  # Ruby stdlib queue lacks a 'drain' function, we write one.
  #
  # Removes everything currently in the ruby stdlib queue, and returns
  # it as an array. Should be concurrent-safe, but queue may still have
  # some things in it after drain, if there are concurrent writers.
  def self.drain_queue(queue)
    result = []

    # Only take as many items as were present when we started, so a busy
    # producer cannot keep us draining forever.
    queue_size = queue.size
    begin
      queue_size.times do
        # Truthy argument => non-blocking pop, raises ThreadError when empty.
        result << queue.deq(:raise_if_empty)
      end
    rescue ThreadError
      # Need do nothing, queue was concurrently popped, no biggie
    end

    return result
  end

end
52
+ end