traject 0.0.2 → 0.9.1

Files changed (63)
  1. data/Gemfile +4 -0
  2. data/README.md +85 -61
  3. data/Rakefile +5 -0
  4. data/bin/traject +31 -3
  5. data/doc/settings.md +74 -13
  6. data/lib/tasks/load_maps.rake +48 -0
  7. data/lib/traject/indexer/settings.rb +75 -0
  8. data/lib/traject/indexer.rb +255 -45
  9. data/lib/traject/json_writer.rb +4 -2
  10. data/lib/traject/macros/marc21.rb +18 -6
  11. data/lib/traject/macros/marc21_semantics.rb +405 -0
  12. data/lib/traject/macros/marc_format_classifier.rb +180 -0
  13. data/lib/traject/marc4j_reader.rb +160 -0
  14. data/lib/traject/marc_extractor.rb +33 -17
  15. data/lib/traject/marc_reader.rb +14 -11
  16. data/lib/traject/solrj_writer.rb +247 -9
  17. data/lib/traject/thread_pool.rb +154 -0
  18. data/lib/traject/translation_map.rb +46 -4
  19. data/lib/traject/util.rb +30 -0
  20. data/lib/traject/version.rb +1 -1
  21. data/lib/translation_maps/lcc_top_level.yaml +26 -0
  22. data/lib/translation_maps/marc_genre_007.yaml +9 -0
  23. data/lib/translation_maps/marc_genre_leader.yaml +22 -0
  24. data/lib/translation_maps/marc_geographic.yaml +589 -0
  25. data/lib/translation_maps/marc_instruments.yaml +102 -0
  26. data/lib/translation_maps/marc_languages.yaml +490 -0
  27. data/test/indexer/each_record_test.rb +34 -0
  28. data/test/indexer/macros_marc21_semantics_test.rb +206 -0
  29. data/test/indexer/macros_marc21_test.rb +10 -1
  30. data/test/indexer/map_record_test.rb +78 -8
  31. data/test/indexer/read_write_test.rb +43 -10
  32. data/test/indexer/settings_test.rb +60 -4
  33. data/test/indexer/to_field_test.rb +39 -0
  34. data/test/marc4j_reader_test.rb +75 -0
  35. data/test/marc_extractor_test.rb +62 -0
  36. data/test/marc_format_classifier_test.rb +91 -0
  37. data/test/marc_reader_test.rb +12 -0
  38. data/test/solrj_writer_test.rb +146 -43
  39. data/test/test_helper.rb +50 -0
  40. data/test/test_support/245_no_ab.marc +1 -0
  41. data/test/test_support/880_with_no_6.utf8.marc +1 -0
  42. data/test/test_support/bad_subfield_code.marc +1 -0
  43. data/test/test_support/date_resort_to_260.marc +1 -0
  44. data/test/test_support/date_type_r_missing_date2.marc +1 -0
  45. data/test/test_support/date_with_u.marc +1 -0
  46. data/test/test_support/demo_config.rb +153 -0
  47. data/test/test_support/emptyish_record.marc +1 -0
  48. data/test/test_support/louis_armstrong.marc +1 -0
  49. data/test/test_support/manuscript_online_thesis.marc +1 -0
  50. data/test/test_support/microform_online_conference.marc +1 -0
  51. data/test/test_support/multi_era.marc +1 -0
  52. data/test/test_support/multi_geo.marc +1 -0
  53. data/test/test_support/musical_cage.marc +1 -0
  54. data/test/test_support/one-marc8.mrc +1 -0
  55. data/test/test_support/online_only.marc +1 -0
  56. data/test/test_support/packed_041a_lang.marc +1 -0
  57. data/test/test_support/the_business_ren.marc +1 -0
  58. data/test/translation_map_test.rb +8 -0
  59. data/test/translation_maps/properties_map.properties +5 -0
  60. data/traject.gemspec +1 -1
  61. data/vendor/marc4j/README.md +17 -0
  62. data/vendor/marc4j/lib/marc4j-2.5.1-beta.jar +0 -0
  63. metadata +81 -2
@@ -1,8 +1,11 @@
-require 'hashie'
+require 'yell'
 
 require 'traject'
 require 'traject/qualified_const_get'
+
+require 'traject/indexer/settings'
 require 'traject/marc_reader'
+require 'traject/marc4j_reader'
 require 'traject/json_writer'
 require 'traject/solrj_writer'
 
@@ -28,11 +31,15 @@ require 'traject/macros/basic'
 #
 #
 # A Writer is any class that:
-#  1) Has a one-argument initializer taking a Settings hash.
+#  1) Has a one-argument initializer taking a Settings hash. (The logger
+#     is provided to the Writer in settings["logger"])
 #  2) Responds to a one argument #put method, where the argument is
-#     a hash of mapped keys/values. The writer should write them
+#     a Traject::Indexer::Context, containing an #output_hash
+#     hash of mapped keys/values. The writer should write them
 #     to the appropriate place.
 #  3) Responds to a #close method, called when we're done.
+#  4) Optionally implements a #skipped_record_count method, returning int count of records
+#     that were skipped due to errors (and presumably logged)
 #
 # The default writer (will be) the SolrWriter , which is configured
 # through additional Settings as well. A JsonWriter is also available,
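
For orientation, a minimal Writer satisfying the contract above might look like the sketch below. The class name, output format, and file handling are illustrative, not part of traject:

    require 'json'

    # Hypothetical writer: one-arg initializer taking settings, #put receiving
    # a Traject::Indexer::Context, #close, and an optional #skipped_record_count.
    class TinyLineWriter
      def initialize(settings)
        @settings = settings
        @logger   = settings["logger"]  # handed in by the Indexer
        @file     = File.open(settings["output_file"] || "out.jsonl", "w:UTF-8")
        @skipped  = 0
      end

      def put(context)
        @file.puts JSON.generate(context.output_hash)
      rescue StandardError => e
        @skipped += 1
        @logger.error "skipping record at position #{context.position}: #{e}" if @logger
      end

      def close
        @file.close
      end

      def skipped_record_count
        @skipped
      end
    end
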
@@ -55,8 +62,9 @@ class Traject::Indexer
   include Traject::Macros::Basic
 
 
-  def initialize
-    @settings = Settings.new(self.class.default_settings)
+  # optional hash or Traject::Indexer::Settings object of settings.
+  def initialize(arg_settings = {})
+    @settings = Settings.new(arg_settings)
     @index_steps = []
   end
 
@@ -88,58 +96,270 @@ class Traject::Indexer
     return @settings
   end
 
+  def logger
+    @logger ||= create_logger
+  end
+  attr_writer :logger
+
+
+  # Just calculates the arg that's gonna be given to Yell.new
+  # or SomeLogger.new
+  def logger_argument
+    specified = settings["log.file"] || "STDERR"
+
+    case specified
+    when "STDOUT" then STDOUT
+    when "STDERR" then STDERR
+    else specified
+    end
+  end
+
+  # Second arg to Yell.new, options hash, calculated from
+  # settings
+  def logger_options
+    # formatter, default is fairly basic
+    format = settings["log.format"] || "%d %5L %m"
+    format = case format
+      when "false" then false
+      when "" then nil
+      else format
+    end
+
+    level = settings["log.level"] || "info"
+
+    {:format => format, :level => level}
+  end
+
+  # Create logger according to settings
+  def create_logger
+    # log everything to STDERR or specified logfile
+    logger = Yell.new( logger_argument, logger_options )
+    # ADDITIONALLY log error and higher to....
+    if settings["log.error_file"]
+      logger.adapter :file, settings["log.error_file"], :level => 'gte.error'
+    end
+
+    return logger
+  end
+
+
   # Used to define an indexing mapping.
   def to_field(field_name, aLambda = nil, &block)
+
+    if field_name.nil? || field_name.empty?
+      raise ArgumentError.new("to_field requires a non-blank first argument, field name")
+    end
+    [aLambda, block].each do |proc|
+      # allow negative arity, meaning variable/optional, trust em on that.
+      # but for positive arrity, we need 2 or 3 args
+      if proc && (proc.arity == 0 || proc.arity == 1 || proc.arity > 3)
+        raise ArgumentError.new("block/proc given to to_field needs 2 or 3 arguments: #{proc}")
+      end
+    end
+
+
     @index_steps << {
       :field_name => field_name.to_s,
       :lambda => aLambda,
-      :block => block
+      :block => block,
+      :type => :to_field,
+      :source_location => Traject::Util.extract_caller_location(caller.first)
+    }
+  end
+
+  def each_record(aLambda = nil, &block)
+    # arity check
+    [aLambda, block].each do |proc|
+      # allow negative arity, meaning variable/optional, trust em on that.
+      # but for positive arrity, we need 1 or 2 args
+      if proc && (proc.arity == 0 || proc.arity > 2)
+        raise ArgumentError.new("block/proc given to to_field needs 1 or 2 arguments: #{proc}")
+      end
+    end
+
+    @index_steps << {
+      :lambda => aLambda,
+      :block => block,
+      :type => :each_record,
+      :source_location => Traject::Util.extract_caller_location(caller.first)
     }
   end
 
-  # Processes a single record, according to indexing rules
-  # set up in this Indexer. Returns a hash whose values are
-  # Arrays, and keys are strings.
+
+  # Processes a single record according to indexing rules set up in
+  # this indexer. Returns the output hash (a hash whose keys are
+  # string fields, and values are arrays of one or more values in that field)
   #
+  # This is a convenience shortcut for #map_to_context! -- use that one
+  # if you want to provide addtional context
+  # like position, and/or get back the full context.
   def map_record(record)
     context = Context.new(:source_record => record, :settings => settings)
+    map_to_context!(context)
+    return context.output_hash
+  end
 
+  # Maps a single record INTO the second argument, a Traject::Indexer::Context.
+  #
+  # Context must be passed with a #source_record and #settings, and optionally
+  # a #position.
+  #
+  # Context will be mutated by this method, most significantly by adding
+  # an #output_hash, a hash from fieldname to array of values in that field.
+  #
+  # Pass in a context with a set #position if you want that to be available
+  # to mapping routines.
+  #
+  # Returns the context passed in as second arg, as a convenience for chaining etc.
+  def map_to_context!(context)
     @index_steps.each do |index_step|
-      accumulator = []
-      field_name = index_step[:field_name]
-      context.field_name = field_name
-
-      # Might have a lambda arg AND a block, we execute in order,
-      # with same accumulator.
-      [index_step[:lambda], index_step[:block]].each do |aProc|
-        if aProc
-          case aProc.arity
-          when 1 then aProc.call(record)
-          when 2 then aProc.call(record, accumulator)
-          else aProc.call(record, accumulator, context)
+      if index_step[:type] == :to_field
+
+        accumulator = []
+        context.field_name = index_step[:field_name]
+
+        # Might have a lambda arg AND a block, we execute in order,
+        # with same accumulator.
+
+        [index_step[:lambda], index_step[:block]].each do |aProc|
+          if aProc
+            log_mapping_errors(context, index_step, aProc) do
+              if aProc.arity == 2
+                aProc.call(context.source_record, accumulator)
+              else
+                aProc.call(context.source_record, accumulator, context)
+              end
+            end
           end
         end
+        (context.output_hash[context.field_name] ||= []).concat accumulator unless accumulator.empty?
+        context.field_name = nil
 
-      end
+      elsif index_step[:type] == :each_record
+
+        # one or two arg
+        [index_step[:lambda], index_step[:block]].each do |aProc|
+          if aProc
+            log_mapping_errors(context, index_step, aProc) do
+              if aProc.arity == 1
+                aProc.call(context.source_record)
+              else
+                aProc.call(context.source_record, context)
+              end
+            end
+          end
+        end
 
-      (context.output_hash[field_name] ||= []).concat accumulator
-      context.field_name = nil
+      else
+        raise ArgumentError.new("An @index_step we don't know how to deal with: #{@index_step}")
+      end
     end
 
-    return context.output_hash
+    return context
+  end
+
+  # just a wrapper that captures and records any unexpected
+  # errors raised in mapping, along with contextual information
+  # on record and location in source file of mapping rule.
+  #
+  # Re-raises error at the moment.
+  #
+  #     log_errors(context, some_lambda) do
+  #       all_sorts_of_stuff # that will have errors logged
+  #     end
+  def log_mapping_errors(context, index_step, aProc)
+    begin
+      yield
+    rescue Exception => e
+      msg = "Unexpected error on record id `#{id_string(context.source_record)}` at file position #{context.position}\n"
+
+      conf = context.field_name ? "to_field '#{context.field_name}'" : "each_record"
+
+      msg += " while executing #{conf} defined at #{index_step[:source_location]}\n"
+      msg += Traject::Util.exception_to_log_message(e)
+
+      logger.error msg
+      logger.debug "Record: " + context.source_record.to_s
+
+      raise e
+    end
   end
 
   # Processes a stream of records, reading from the configured Reader,
   # mapping according to configured mapping rules, and then writing
   # to configured Writer.
+  #
+  # returns 'false' as a signal to command line to return non-zero exit code
+  # for some reason (reason found in logs, presumably). This particular mechanism
+  # is open to complexification, starting simple. We do need SOME way to return
+  # non-zero to command line.
+  #
   def process(io_stream)
+    settings.fill_in_defaults!
+
+    count = 0
+    start_time = batch_start_time = Time.now
+    logger.info "beginning Indexer#process with settings: #{settings.inspect}"
+
     reader = self.reader!(io_stream)
     writer = self.writer!
 
-    reader.each do |record|
-      writer.put map_record(record)
+    thread_pool = Traject::ThreadPool.new(settings["processing_thread_pool"].to_i)
+
+    logger.info "   with reader: #{reader.class.name} and writer: #{writer.class.name}"
+
+    reader.each do |record; position|
+      count += 1
+
+      # have to use a block local var, so the changing `count` one
+      # doesn't get caught in the closure. Weird, yeah.
+      position = count
+
+      thread_pool.raise_collected_exception!
+
+      if settings["debug_ascii_progress"].to_s == "true"
+        $stderr.write "." if count % settings["solrj_writer.batch_size"] == 0
+      end
+
+      if settings["log.batch_progress"] && (count % settings["log.batch_progress"].to_i == 0)
+        batch_rps = settings["log.batch_progress"].to_i / (Time.now - batch_start_time)
+        overall_rps = count / (Time.now - start_time)
+        logger.info "Traject::Indexer#process, read #{count} records at id:#{id_string(record)}; #{'%.0f' % batch_rps}/s this batch, #{'%.0f' % overall_rps}/s overall"
+        batch_start_time = Time.now
+      end
+
+      # we have to use this weird lambda to properly "capture" the count, instead
+      # of having it be bound to the original variable in a non-threadsafe way.
+      # This is confusing, I might not be understanding things properly, but that's where i am.
+      #thread_pool.maybe_in_thread_pool &make_lambda(count, record, writer)
+      thread_pool.maybe_in_thread_pool do
+        context = Context.new(:source_record => record, :settings => settings, :position => position)
+        map_to_context!(context)
+        writer.put context
+      end
+
     end
+    $stderr.write "\n" if settings["debug_ascii_progress"].to_s == "true"
+
+    logger.debug "Shutting down #processing mapper threadpool..."
+    thread_pool.shutdown_and_wait
+    logger.debug "#processing mapper threadpool shutdown complete."
+
+    thread_pool.raise_collected_exception!
+
+
     writer.close if writer.respond_to?(:close)
+
+    elapsed = Time.now - start_time
+    avg_rps = (count / elapsed)
+    logger.info "finished Indexer#process: #{count} records in #{'%.3f' % elapsed} seconds; #{'%.1f' % avg_rps} records/second overall."
+
+    if writer.respond_to?(:skipped_record_count) && writer.skipped_record_count > 0
+      logger.error "Indexer#process returning 'false' due to #{writer.skipped_record_count} skipped records."
+      return false
+    end
+
+    return true
   end
 
   def reader_class
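
Pulling the pieces above together, a rough usage sketch. The field names, settings values, and the marc_record variable are illustrative; the block arities follow the checks shown in to_field and each_record:

    require 'traject'

    # Constructor now takes a settings hash (or Traject::Indexer::Settings).
    indexer = Traject::Indexer.new(
      "log.level"      => "debug",
      "log.error_file" => "./traject_errors.log"
    )

    # to_field logic must accept 2 or 3 args: (record, accumulator[, context])
    indexer.to_field "id" do |record, accumulator|
      accumulator << record['001'].value if record['001']
    end

    indexer.to_field "title" do |record, accumulator, context|
      accumulator << record['245']['a'] if record['245']
    end

    # each_record logic accepts 1 or 2 args: (record[, context])
    indexer.each_record do |record, context|
      context.clipboard[:tags] = record.fields.collect(&:tag)
    end

    # map_record returns just the output hash for one record
    output = indexer.map_record(marc_record)   # marc_record: a MARC::Record (assumed)
    output["title"]                            # => array of mapped values
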
@@ -159,33 +379,21 @@ class Traject::Indexer
   # Instantiate a Traject Reader, using class set
   # in #reader_class, initialized with io_stream passed in
   def reader!(io_stream)
-    return reader_class.new(io_stream, settings)
+    return reader_class.new(io_stream, settings.merge("logger" => logger))
   end
 
   # Instantiate a Traject Writer, suing class set in #writer_class
   def writer!
-    return writer_class.new(settings)
+    return writer_class.new(settings.merge("logger" => logger))
   end
 
-  def self.default_settings
-    {
-      "reader_class_name" => "Traject::MarcReader",
-      "writer_class_name" => "Traject::SolrJWriter"
-    }
+  # get a printable id from record for error logging.
+  # Maybe override this for a future XML version.
+  def id_string(record)
+    record && record['001'] && record['001'].value.to_s
   end
 
 
-
-  # Enhanced with a few features from Hashie, to make it for
-  # instance string/symbol indifferent
-  class Settings < Hash
-    include Hashie::Extensions::MergeInitializer # can init with hash
-    include Hashie::Extensions::IndifferentAccess
-
-    # Hashie bug Issue #100 https://github.com/intridea/hashie/pull/100
-    alias_method :store, :indifferent_writer
-  end
-
   # Represents the context of a specific record being indexed, passed
   # to indexing logic blocks
   #
@@ -203,5 +411,7 @@ class Traject::Indexer
 
     attr_accessor :clipboard, :output_hash
     attr_accessor :field_name, :source_record, :settings
+    # 1-based position in stream of processed records.
+    attr_accessor :position
   end
 end
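
Continuing the sketch above, the lower-level path can be driven directly so mapping logic sees the 1-based position (marc_record and the position value are made up):

    context = Traject::Indexer::Context.new(
      :source_record => marc_record,      # a MARC::Record (assumed)
      :settings      => indexer.settings,
      :position      => 42                # 1-based position in the stream
    )

    indexer.map_to_context!(context)      # mutates context, adding #output_hash

    context.output_hash                   # field name => array of values
    context.position                      # => 42, also used in error log messages
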
@@ -20,7 +20,9 @@ class Traject::JsonWriter
     @settings = argSettings
   end
 
-  def put(hash)
+  def put(context)
+    hash = context.output_hash
+
     serialized =
       if settings["json_writer.pretty_print"]
         JSON.pretty_generate(hash)
@@ -34,7 +36,7 @@ class Traject::JsonWriter
     unless defined? @output_file
       @output_file =
         if settings["output_file"]
-          File.open(settings["output_file"])
+          File.open(settings["output_file"], 'w:UTF-8')
        elsif settings["output_stream"]
          settings["output_stream"]
        else
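
The new #put signature does not change how the JsonWriter is selected; it is still configured through settings, for example (output file name illustrative):

    indexer = Traject::Indexer.new(
      "writer_class_name"        => "Traject::JsonWriter",
      "output_file"              => "out.json",
      "json_writer.pretty_print" => true
    )
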
@@ -18,6 +18,13 @@ module Traject::Macros
     # Second arg is optional options, including options valid on MarcExtractor.new,
     # and others. (TODO)
     #
+    # * :first => true: take only first value
+    # * :translation_map => String: translate with named translation map looked up in load
+    #   path, uses Tranject::TranslationMap.new(translation_map_arg)
+    # * :trim_punctuation => true; trims leading/trailing punctuation using standard algorithms that
+    #   have shown themselves useful with Marc, using Marc21.trim_punctuation
+    # * :default => String: if otherwise empty, add default value
+    #
     # Examples:
     #
     #    to_field("title"), extract_marc("245abcd", :trim_punctuation => true)
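
For instance, the new options compose with the existing spec strings (field choices below are illustrative; marc_languages ships with this release):

    to_field "language_facet", extract_marc("041a", :translation_map => "marc_languages")

    to_field "title_sorted",   extract_marc("245abk", :trim_punctuation => true, :first => true)

    to_field "publisher",      extract_marc("260b", :default => "Unknown")
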
@@ -26,11 +33,12 @@ module Traject::Macros
     def extract_marc(spec, options = {})
       only_first = options.delete(:first)
       trim_punctuation = options.delete(:trim_punctuation)
+      default_value = options.delete(:default)
 
       # We create the TranslationMap here on load, not inside the closure
       # where it'll be called for every record. Since TranslationMap is supposed
       # to cache, prob doesn't matter, but doens't hurt. Also causes any syntax
-      # exceptions to raise on load.
+      # exceptions to raise on load.
       if translation_map_arg = options.delete(:translation_map)
         translation_map = Traject::TranslationMap.new(translation_map_arg)
       end
@@ -49,6 +57,10 @@ module Traject::Macros
         if trim_punctuation
           accumulator.collect! {|s| Marc21.trim_punctuation(s)}
         end
+
+        if default_value && accumulator.empty?
+          accumulator << default_value
+        end
       end
     end
 
@@ -97,7 +109,7 @@ module Traject::Macros
     # All fields in from-to must be marc DATA (not control fields), or weirdness
     #
     # Can always run this thing multiple times on the same field if you need
-    # non-contiguous ranges of fields.
+    # non-contiguous ranges of fields.
     def extract_all_marc_values(options = {})
       options = {:from => "100", :to => "899", :seperator => ' '}.merge(options)
 
@@ -123,15 +135,15 @@ module Traject::Macros
     # pretty simple.
     #
     # Removes
-    #  * trailing: comma, slash, semicolon, colon (possibly followed by whitespace)
-    #  * trailing period if it is preceded by at least three letters (possibly followed by whitespace)
+    #  * trailing: comma, slash, semicolon, colon (possibly preceded and followed by whitespace)
+    #  * trailing period if it is preceded by at least three letters (possibly preceded and followed by whitespace)
     #  * single square bracket characters if they are the start and/or end
     #    chars and there are no internal square brackets.
     #
     # Returns altered string, doesn't change original arg.
     def self.trim_punctuation(str)
-      str = str.sub(/[ ,\/;:] *\Z/, '')
-      str = str.sub(/(\w\w\w)\. *\Z/, '\1')
+      str = str.sub(/ *[ ,\/;:] *\Z/, '')
+      str = str.sub(/ *(\w\w\w)\. *\Z/, '\1')
       str = str.sub(/\A\[?([^\[\]]+)\]?\Z/, '\1')
       return str
     end
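
With the widened regexes, whitespace sitting before the trailing punctuation is now stripped as well; a few illustrative inputs and results (example strings are made up):

    Traject::Macros::Marc21.trim_punctuation("Chicago :")
    # => "Chicago"     (colon and the space before it are dropped)

    Traject::Macros::Marc21.trim_punctuation("Smith, John.")
    # => "Smith, John" (period preceded by at least three letters)

    Traject::Macros::Marc21.trim_punctuation("[four hundred years of Dutch art]")
    # => "four hundred years of Dutch art"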