traject 0.0.2 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. data/Gemfile +4 -0
  2. data/README.md +85 -61
  3. data/Rakefile +5 -0
  4. data/bin/traject +31 -3
  5. data/doc/settings.md +74 -13
  6. data/lib/tasks/load_maps.rake +48 -0
  7. data/lib/traject/indexer/settings.rb +75 -0
  8. data/lib/traject/indexer.rb +255 -45
  9. data/lib/traject/json_writer.rb +4 -2
  10. data/lib/traject/macros/marc21.rb +18 -6
  11. data/lib/traject/macros/marc21_semantics.rb +405 -0
  12. data/lib/traject/macros/marc_format_classifier.rb +180 -0
  13. data/lib/traject/marc4j_reader.rb +160 -0
  14. data/lib/traject/marc_extractor.rb +33 -17
  15. data/lib/traject/marc_reader.rb +14 -11
  16. data/lib/traject/solrj_writer.rb +247 -9
  17. data/lib/traject/thread_pool.rb +154 -0
  18. data/lib/traject/translation_map.rb +46 -4
  19. data/lib/traject/util.rb +30 -0
  20. data/lib/traject/version.rb +1 -1
  21. data/lib/translation_maps/lcc_top_level.yaml +26 -0
  22. data/lib/translation_maps/marc_genre_007.yaml +9 -0
  23. data/lib/translation_maps/marc_genre_leader.yaml +22 -0
  24. data/lib/translation_maps/marc_geographic.yaml +589 -0
  25. data/lib/translation_maps/marc_instruments.yaml +102 -0
  26. data/lib/translation_maps/marc_languages.yaml +490 -0
  27. data/test/indexer/each_record_test.rb +34 -0
  28. data/test/indexer/macros_marc21_semantics_test.rb +206 -0
  29. data/test/indexer/macros_marc21_test.rb +10 -1
  30. data/test/indexer/map_record_test.rb +78 -8
  31. data/test/indexer/read_write_test.rb +43 -10
  32. data/test/indexer/settings_test.rb +60 -4
  33. data/test/indexer/to_field_test.rb +39 -0
  34. data/test/marc4j_reader_test.rb +75 -0
  35. data/test/marc_extractor_test.rb +62 -0
  36. data/test/marc_format_classifier_test.rb +91 -0
  37. data/test/marc_reader_test.rb +12 -0
  38. data/test/solrj_writer_test.rb +146 -43
  39. data/test/test_helper.rb +50 -0
  40. data/test/test_support/245_no_ab.marc +1 -0
  41. data/test/test_support/880_with_no_6.utf8.marc +1 -0
  42. data/test/test_support/bad_subfield_code.marc +1 -0
  43. data/test/test_support/date_resort_to_260.marc +1 -0
  44. data/test/test_support/date_type_r_missing_date2.marc +1 -0
  45. data/test/test_support/date_with_u.marc +1 -0
  46. data/test/test_support/demo_config.rb +153 -0
  47. data/test/test_support/emptyish_record.marc +1 -0
  48. data/test/test_support/louis_armstrong.marc +1 -0
  49. data/test/test_support/manuscript_online_thesis.marc +1 -0
  50. data/test/test_support/microform_online_conference.marc +1 -0
  51. data/test/test_support/multi_era.marc +1 -0
  52. data/test/test_support/multi_geo.marc +1 -0
  53. data/test/test_support/musical_cage.marc +1 -0
  54. data/test/test_support/one-marc8.mrc +1 -0
  55. data/test/test_support/online_only.marc +1 -0
  56. data/test/test_support/packed_041a_lang.marc +1 -0
  57. data/test/test_support/the_business_ren.marc +1 -0
  58. data/test/translation_map_test.rb +8 -0
  59. data/test/translation_maps/properties_map.properties +5 -0
  60. data/traject.gemspec +1 -1
  61. data/vendor/marc4j/README.md +17 -0
  62. data/vendor/marc4j/lib/marc4j-2.5.1-beta.jar +0 -0
  63. metadata +81 -2
@@ -1,8 +1,11 @@
1
- require 'hashie'
1
+ require 'yell'
2
2
 
3
3
  require 'traject'
4
4
  require 'traject/qualified_const_get'
5
+
6
+ require 'traject/indexer/settings'
5
7
  require 'traject/marc_reader'
8
+ require 'traject/marc4j_reader'
6
9
  require 'traject/json_writer'
7
10
  require 'traject/solrj_writer'
8
11
 
@@ -28,11 +31,15 @@ require 'traject/macros/basic'
28
31
  #
29
32
  #
30
33
  # A Writer is any class that:
31
- # 1) Has a one-argument initializer taking a Settings hash.
34
+ # 1) Has a one-argument initializer taking a Settings hash. (The logger
35
+ # is provided to the Writer in settings["logger"])
32
36
  # 2) Responds to a one argument #put method, where the argument is
33
- # a hash of mapped keys/values. The writer should write them
37
+ # a Traject::Indexer::Context, containing an #output_hash
38
+ # hash of mapped keys/values. The writer should write them
34
39
  # to the appropriate place.
35
40
  # 3) Responds to a #close method, called when we're done.
41
+ # 4) Optionally implements a #skipped_record_count method, returning int count of records
42
+ # that were skipped due to errors (and presumably logged)
36
43
  #
37
44
  # The default writer (will be) the SolrWriter , which is configured
38
45
  # through additional Settings as well. A JsonWriter is also available,
@@ -55,8 +62,9 @@ class Traject::Indexer
55
62
  include Traject::Macros::Basic
56
63
 
57
64
 
58
- def initialize
59
- @settings = Settings.new(self.class.default_settings)
65
+ # optional hash or Traject::Indexer::Settings object of settings.
66
+ def initialize(arg_settings = {})
67
+ @settings = Settings.new(arg_settings)
60
68
  @index_steps = []
61
69
  end
62
70
 
@@ -88,58 +96,270 @@ class Traject::Indexer
88
96
  return @settings
89
97
  end
90
98
 
99
+ def logger
100
+ @logger ||= create_logger
101
+ end
102
+ attr_writer :logger
103
+
104
+
105
+ # Just calculates the arg that's gonna be given to Yell.new
106
+ # or SomeLogger.new
107
+ def logger_argument
108
+ specified = settings["log.file"] || "STDERR"
109
+
110
+ case specified
111
+ when "STDOUT" then STDOUT
112
+ when "STDERR" then STDERR
113
+ else specified
114
+ end
115
+ end
116
+
117
+ # Second arg to Yell.new, options hash, calculated from
118
+ # settings
119
+ def logger_options
120
+ # formatter, default is fairly basic
121
+ format = settings["log.format"] || "%d %5L %m"
122
+ format = case format
123
+ when "false" then false
124
+ when "" then nil
125
+ else format
126
+ end
127
+
128
+ level = settings["log.level"] || "info"
129
+
130
+ {:format => format, :level => level}
131
+ end
132
+
133
+ # Create logger according to settings
134
+ def create_logger
135
+ # log everything to STDERR or specified logfile
136
+ logger = Yell.new( logger_argument, logger_options )
137
+ # ADDITIONALLY log error and higher to....
138
+ if settings["log.error_file"]
139
+ logger.adapter :file, settings["log.error_file"], :level => 'gte.error'
140
+ end
141
+
142
+ return logger
143
+ end
144
+
145
+
91
146
  # Used to define an indexing mapping.
92
147
  def to_field(field_name, aLambda = nil, &block)
148
+
149
+ if field_name.nil? || field_name.empty?
150
+ raise ArgumentError.new("to_field requires a non-blank first argument, field name")
151
+ end
152
+ [aLambda, block].each do |proc|
153
+ # allow negative arity, meaning variable/optional, trust em on that.
154
+ # but for positive arity, we need 2 or 3 args
155
+ if proc && (proc.arity == 0 || proc.arity == 1 || proc.arity > 3)
156
+ raise ArgumentError.new("block/proc given to to_field needs 2 or 3 arguments: #{proc}")
157
+ end
158
+ end
159
+
160
+
93
161
  @index_steps << {
94
162
  :field_name => field_name.to_s,
95
163
  :lambda => aLambda,
96
- :block => block
164
+ :block => block,
165
+ :type => :to_field,
166
+ :source_location => Traject::Util.extract_caller_location(caller.first)
167
+ }
168
+ end
169
+
170
+ def each_record(aLambda = nil, &block)
171
+ # arity check
172
+ [aLambda, block].each do |proc|
173
+ # allow negative arity, meaning variable/optional, trust em on that.
174
+ # but for positive arity, we need 1 or 2 args
175
+ if proc && (proc.arity == 0 || proc.arity > 2)
176
+ raise ArgumentError.new("block/proc given to to_field needs 1 or 2 arguments: #{proc}")
177
+ end
178
+ end
179
+
180
+ @index_steps << {
181
+ :lambda => aLambda,
182
+ :block => block,
183
+ :type => :each_record,
184
+ :source_location => Traject::Util.extract_caller_location(caller.first)
97
185
  }
98
186
  end
99
187
 
100
- # Processes a single record, according to indexing rules
101
- # set up in this Indexer. Returns a hash whose values are
102
- # Arrays, and keys are strings.
188
+
189
+ # Processes a single record according to indexing rules set up in
190
+ # this indexer. Returns the output hash (a hash whose keys are
191
+ # string fields, and values are arrays of one or more values in that field)
103
192
  #
193
+ # This is a convenience shortcut for #map_to_context! -- use that one
194
+ # if you want to provide additional context
195
+ # like position, and/or get back the full context.
104
196
  def map_record(record)
105
197
  context = Context.new(:source_record => record, :settings => settings)
198
+ map_to_context!(context)
199
+ return context.output_hash
200
+ end
106
201
 
202
+ # Maps a single record INTO the second argument, a Traject::Indexer::Context.
203
+ #
204
+ # Context must be passed with a #source_record and #settings, and optionally
205
+ # a #position.
206
+ #
207
+ # Context will be mutated by this method, most significantly by adding
208
+ # an #output_hash, a hash from fieldname to array of values in that field.
209
+ #
210
+ # Pass in a context with a set #position if you want that to be available
211
+ # to mapping routines.
212
+ #
213
+ # Returns the context passed in as second arg, as a convenience for chaining etc.
214
+ def map_to_context!(context)
107
215
  @index_steps.each do |index_step|
108
- accumulator = []
109
- field_name = index_step[:field_name]
110
- context.field_name = field_name
111
-
112
- # Might have a lambda arg AND a block, we execute in order,
113
- # with same accumulator.
114
- [index_step[:lambda], index_step[:block]].each do |aProc|
115
- if aProc
116
- case aProc.arity
117
- when 1 then aProc.call(record)
118
- when 2 then aProc.call(record, accumulator)
119
- else aProc.call(record, accumulator, context)
216
+ if index_step[:type] == :to_field
217
+
218
+ accumulator = []
219
+ context.field_name = index_step[:field_name]
220
+
221
+ # Might have a lambda arg AND a block, we execute in order,
222
+ # with same accumulator.
223
+
224
+ [index_step[:lambda], index_step[:block]].each do |aProc|
225
+ if aProc
226
+ log_mapping_errors(context, index_step, aProc) do
227
+ if aProc.arity == 2
228
+ aProc.call(context.source_record, accumulator)
229
+ else
230
+ aProc.call(context.source_record, accumulator, context)
231
+ end
232
+ end
120
233
  end
121
234
  end
235
+ (context.output_hash[context.field_name] ||= []).concat accumulator unless accumulator.empty?
236
+ context.field_name = nil
122
237
 
123
- end
238
+ elsif index_step[:type] == :each_record
239
+
240
+ # one or two arg
241
+ [index_step[:lambda], index_step[:block]].each do |aProc|
242
+ if aProc
243
+ log_mapping_errors(context, index_step, aProc) do
244
+ if aProc.arity == 1
245
+ aProc.call(context.source_record)
246
+ else
247
+ aProc.call(context.source_record, context)
248
+ end
249
+ end
250
+ end
251
+ end
124
252
 
125
- (context.output_hash[field_name] ||= []).concat accumulator
126
- context.field_name = nil
253
+ else
254
+ raise ArgumentError.new("An @index_step we don't know how to deal with: #{@index_step}")
255
+ end
127
256
  end
128
257
 
129
- return context.output_hash
258
+ return context
259
+ end
260
+
261
+ # just a wrapper that captures and records any unexpected
262
+ # errors raised in mapping, along with contextual information
263
+ # on record and location in source file of mapping rule.
264
+ #
265
+ # Re-raises error at the moment.
266
+ #
267
+ # log_errors(context, some_lambda) do
268
+ # all_sorts_of_stuff # that will have errors logged
269
+ # end
270
+ def log_mapping_errors(context, index_step, aProc)
271
+ begin
272
+ yield
273
+ rescue Exception => e
274
+ msg = "Unexpected error on record id `#{id_string(context.source_record)}` at file position #{context.position}\n"
275
+
276
+ conf = context.field_name ? "to_field '#{context.field_name}'" : "each_record"
277
+
278
+ msg += " while executing #{conf} defined at #{index_step[:source_location]}\n"
279
+ msg += Traject::Util.exception_to_log_message(e)
280
+
281
+ logger.error msg
282
+ logger.debug "Record: " + context.source_record.to_s
283
+
284
+ raise e
285
+ end
130
286
  end
131
287
 
132
288
  # Processes a stream of records, reading from the configured Reader,
133
289
  # mapping according to configured mapping rules, and then writing
134
290
  # to configured Writer.
291
+ #
292
+ # returns 'false' as a signal to command line to return non-zero exit code
293
+ # for some reason (reason found in logs, presumably). This particular mechanism
294
+ # is open to complexification, starting simple. We do need SOME way to return
295
+ # non-zero to command line.
296
+ #
135
297
  def process(io_stream)
298
+ settings.fill_in_defaults!
299
+
300
+ count = 0
301
+ start_time = batch_start_time = Time.now
302
+ logger.info "beginning Indexer#process with settings: #{settings.inspect}"
303
+
136
304
  reader = self.reader!(io_stream)
137
305
  writer = self.writer!
138
306
 
139
- reader.each do |record|
140
- writer.put map_record(record)
307
+ thread_pool = Traject::ThreadPool.new(settings["processing_thread_pool"].to_i)
308
+
309
+ logger.info " with reader: #{reader.class.name} and writer: #{writer.class.name}"
310
+
311
+ reader.each do |record; position|
312
+ count += 1
313
+
314
+ # have to use a block local var, so the changing `count` one
315
+ # doesn't get caught in the closure. Weird, yeah.
316
+ position = count
317
+
318
+ thread_pool.raise_collected_exception!
319
+
320
+ if settings["debug_ascii_progress"].to_s == "true"
321
+ $stderr.write "." if count % settings["solrj_writer.batch_size"] == 0
322
+ end
323
+
324
+ if settings["log.batch_progress"] && (count % settings["log.batch_progress"].to_i == 0)
325
+ batch_rps = settings["log.batch_progress"].to_i / (Time.now - batch_start_time)
326
+ overall_rps = count / (Time.now - start_time)
327
+ logger.info "Traject::Indexer#process, read #{count} records at id:#{id_string(record)}; #{'%.0f' % batch_rps}/s this batch, #{'%.0f' % overall_rps}/s overall"
328
+ batch_start_time = Time.now
329
+ end
330
+
331
+ # we have to use this weird lambda to properly "capture" the count, instead
332
+ # of having it be bound to the original variable in a non-threadsafe way.
333
+ # This is confusing, I might not be understanding things properly, but that's where i am.
334
+ #thread_pool.maybe_in_thread_pool &make_lambda(count, record, writer)
335
+ thread_pool.maybe_in_thread_pool do
336
+ context = Context.new(:source_record => record, :settings => settings, :position => position)
337
+ map_to_context!(context)
338
+ writer.put context
339
+ end
340
+
141
341
  end
342
+ $stderr.write "\n" if settings["debug_ascii_progress"].to_s == "true"
343
+
344
+ logger.debug "Shutting down #processing mapper threadpool..."
345
+ thread_pool.shutdown_and_wait
346
+ logger.debug "#processing mapper threadpool shutdown complete."
347
+
348
+ thread_pool.raise_collected_exception!
349
+
350
+
142
351
  writer.close if writer.respond_to?(:close)
352
+
353
+ elapsed = Time.now - start_time
354
+ avg_rps = (count / elapsed)
355
+ logger.info "finished Indexer#process: #{count} records in #{'%.3f' % elapsed} seconds; #{'%.1f' % avg_rps} records/second overall."
356
+
357
+ if writer.respond_to?(:skipped_record_count) && writer.skipped_record_count > 0
358
+ logger.error "Indexer#process returning 'false' due to #{writer.skipped_record_count} skipped records."
359
+ return false
360
+ end
361
+
362
+ return true
143
363
  end
144
364
 
145
365
  def reader_class
@@ -159,33 +379,21 @@ class Traject::Indexer
159
379
  # Instantiate a Traject Reader, using class set
160
380
  # in #reader_class, initialized with io_stream passed in
161
381
  def reader!(io_stream)
162
- return reader_class.new(io_stream, settings)
382
+ return reader_class.new(io_stream, settings.merge("logger" => logger))
163
383
  end
164
384
 
165
385
  # Instantiate a Traject Writer, using class set in #writer_class
166
386
  def writer!
167
- return writer_class.new(settings)
387
+ return writer_class.new(settings.merge("logger" => logger))
168
388
  end
169
389
 
170
- def self.default_settings
171
- {
172
- "reader_class_name" => "Traject::MarcReader",
173
- "writer_class_name" => "Traject::SolrJWriter"
174
- }
390
+ # get a printable id from record for error logging.
391
+ # Maybe override this for a future XML version.
392
+ def id_string(record)
393
+ record && record['001'] && record['001'].value.to_s
175
394
  end
176
395
 
177
396
 
178
-
179
- # Enhanced with a few features from Hashie, to make it for
180
- # instance string/symbol indifferent
181
- class Settings < Hash
182
- include Hashie::Extensions::MergeInitializer # can init with hash
183
- include Hashie::Extensions::IndifferentAccess
184
-
185
- # Hashie bug Issue #100 https://github.com/intridea/hashie/pull/100
186
- alias_method :store, :indifferent_writer
187
- end
188
-
189
397
  # Represents the context of a specific record being indexed, passed
190
398
  # to indexing logic blocks
191
399
  #
@@ -203,5 +411,7 @@ class Traject::Indexer
203
411
 
204
412
  attr_accessor :clipboard, :output_hash
205
413
  attr_accessor :field_name, :source_record, :settings
414
+ # 1-based position in stream of processed records.
415
+ attr_accessor :position
206
416
  end
207
417
  end
@@ -20,7 +20,9 @@ class Traject::JsonWriter
20
20
  @settings = argSettings
21
21
  end
22
22
 
23
- def put(hash)
23
+ def put(context)
24
+ hash = context.output_hash
25
+
24
26
  serialized =
25
27
  if settings["json_writer.pretty_print"]
26
28
  JSON.pretty_generate(hash)
@@ -34,7 +36,7 @@ class Traject::JsonWriter
34
36
  unless defined? @output_file
35
37
  @output_file =
36
38
  if settings["output_file"]
37
- File.open(settings["output_file"])
39
+ File.open(settings["output_file"], 'w:UTF-8')
38
40
  elsif settings["output_stream"]
39
41
  settings["output_stream"]
40
42
  else
@@ -18,6 +18,13 @@ module Traject::Macros
18
18
  # Second arg is optional options, including options valid on MarcExtractor.new,
19
19
  # and others. (TODO)
20
20
  #
21
+ # * :first => true: take only first value
22
+ # * :translation_map => String: translate with named translation map looked up in load
23
+ # path, uses Traject::TranslationMap.new(translation_map_arg)
24
+ # * :trim_punctuation => true; trims leading/trailing punctuation using standard algorithms that
25
+ # have shown themselves useful with Marc, using Marc21.trim_punctuation
26
+ # * :default => String: if otherwise empty, add default value
27
+ #
21
28
  # Examples:
22
29
  #
23
30
  # to_field("title"), extract_marc("245abcd", :trim_punctuation => true)
@@ -26,11 +33,12 @@ module Traject::Macros
26
33
  def extract_marc(spec, options = {})
27
34
  only_first = options.delete(:first)
28
35
  trim_punctuation = options.delete(:trim_punctuation)
36
+ default_value = options.delete(:default)
29
37
 
30
38
  # We create the TranslationMap here on load, not inside the closure
31
39
  # where it'll be called for every record. Since TranslationMap is supposed
32
40
  # to cache, prob doesn't matter, but doesn't hurt. Also causes any syntax
33
- # exceptions to raise on load.
41
+ # exceptions to raise on load.
34
42
  if translation_map_arg = options.delete(:translation_map)
35
43
  translation_map = Traject::TranslationMap.new(translation_map_arg)
36
44
  end
@@ -49,6 +57,10 @@ module Traject::Macros
49
57
  if trim_punctuation
50
58
  accumulator.collect! {|s| Marc21.trim_punctuation(s)}
51
59
  end
60
+
61
+ if default_value && accumulator.empty?
62
+ accumulator << default_value
63
+ end
52
64
  end
53
65
  end
54
66
 
@@ -97,7 +109,7 @@ module Traject::Macros
97
109
  # All fields in from-to must be marc DATA (not control fields), or weirdness
98
110
  #
99
111
  # Can always run this thing multiple times on the same field if you need
100
- # non-contiguous ranges of fields.
112
+ # non-contiguous ranges of fields.
101
113
  def extract_all_marc_values(options = {})
102
114
  options = {:from => "100", :to => "899", :seperator => ' '}.merge(options)
103
115
 
@@ -123,15 +135,15 @@ module Traject::Macros
123
135
  # pretty simple.
124
136
  #
125
137
  # Removes
126
- # * trailing: comma, slash, semicolon, colon (possibly followed by whitespace)
127
- # * trailing period if it is preceded by at least three letters (possibly followed by whitespace)
138
+ # * trailing: comma, slash, semicolon, colon (possibly preceded and followed by whitespace)
139
+ # * trailing period if it is preceded by at least three letters (possibly preceded and followed by whitespace)
128
140
  # * single square bracket characters if they are the start and/or end
129
141
  # chars and there are no internal square brackets.
130
142
  #
131
143
  # Returns altered string, doesn't change original arg.
132
144
  def self.trim_punctuation(str)
133
- str = str.sub(/[ ,\/;:] *\Z/, '')
134
- str = str.sub(/(\w\w\w)\. *\Z/, '\1')
145
+ str = str.sub(/ *[ ,\/;:] *\Z/, '')
146
+ str = str.sub(/ *(\w\w\w)\. *\Z/, '\1')
135
147
  str = str.sub(/\A\[?([^\[\]]+)\]?\Z/, '\1')
136
148
  return str
137
149
  end