traject 0.16.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. checksums.yaml +7 -0
  2. data/.yardopts +1 -0
  3. data/README.md +183 -191
  4. data/bench/bench.rb +1 -1
  5. data/doc/batch_execution.md +14 -0
  6. data/doc/extending.md +14 -12
  7. data/doc/indexing_rules.md +265 -0
  8. data/lib/traject/command_line.rb +12 -41
  9. data/lib/traject/debug_writer.rb +32 -13
  10. data/lib/traject/indexer.rb +101 -24
  11. data/lib/traject/indexer/settings.rb +18 -17
  12. data/lib/traject/json_writer.rb +32 -11
  13. data/lib/traject/line_writer.rb +6 -6
  14. data/lib/traject/macros/basic.rb +1 -1
  15. data/lib/traject/macros/marc21.rb +17 -13
  16. data/lib/traject/macros/marc21_semantics.rb +27 -25
  17. data/lib/traject/macros/marc_format_classifier.rb +39 -25
  18. data/lib/traject/marc4j_reader.rb +36 -22
  19. data/lib/traject/marc_extractor.rb +79 -75
  20. data/lib/traject/marc_reader.rb +33 -25
  21. data/lib/traject/mock_reader.rb +9 -10
  22. data/lib/traject/ndj_reader.rb +7 -7
  23. data/lib/traject/null_writer.rb +1 -1
  24. data/lib/traject/qualified_const_get.rb +12 -2
  25. data/lib/traject/solrj_writer.rb +61 -52
  26. data/lib/traject/thread_pool.rb +45 -45
  27. data/lib/traject/translation_map.rb +59 -27
  28. data/lib/traject/util.rb +3 -3
  29. data/lib/traject/version.rb +1 -1
  30. data/lib/traject/yaml_writer.rb +1 -1
  31. data/test/debug_writer_test.rb +7 -7
  32. data/test/indexer/each_record_test.rb +4 -4
  33. data/test/indexer/macros_marc21_semantics_test.rb +12 -12
  34. data/test/indexer/macros_marc21_test.rb +10 -10
  35. data/test/indexer/macros_test.rb +1 -1
  36. data/test/indexer/map_record_test.rb +6 -6
  37. data/test/indexer/read_write_test.rb +43 -4
  38. data/test/indexer/settings_test.rb +2 -2
  39. data/test/indexer/to_field_test.rb +8 -8
  40. data/test/marc4j_reader_test.rb +4 -4
  41. data/test/marc_extractor_test.rb +33 -25
  42. data/test/marc_format_classifier_test.rb +3 -3
  43. data/test/marc_reader_test.rb +2 -2
  44. data/test/test_helper.rb +3 -3
  45. data/test/test_support/demo_config.rb +52 -48
  46. data/test/translation_map_test.rb +22 -4
  47. data/test/translation_maps/bad_ruby.rb +2 -2
  48. data/test/translation_maps/both_map.rb +1 -1
  49. data/test/translation_maps/default_literal.rb +1 -1
  50. data/test/translation_maps/default_passthrough.rb +1 -1
  51. data/test/translation_maps/ruby_map.rb +1 -1
  52. metadata +7 -31
  53. data/doc/macros.md +0 -103
@@ -13,19 +13,28 @@ module Traject
13
13
  # or array of strings.
14
14
  #
15
15
  # What makes it more useful than a stunted hash is its ability to load
16
- # the hash definitions from configuration files, either pure ruby,
17
- # yaml, or java .properties. (Limited basic .properties, don't try any fancy escaping please,
18
- # no = or : in key names, no split lines.)
16
+ # the hash definitions from configuration files, either pure ruby,
17
+ # yaml, or (limited subset of) java .properties file.
19
18
  #
20
- # TranslationMap.new("dir/some_file")
19
+ # traject's `extract_marc` macro allows you to specify a :translation_map=>filename argument
20
+ # that will automatically find and use a translation map on the resulting data:
21
21
  #
22
- # Will look for a file named `some_file.rb` or `some_file.yaml` or `some_file.properties`,
23
- # somewhere in the ruby $LOAD_PATH in a `/translation_maps` subdir.
24
- # * Looks for "/translation_maps" subdir in load paths, so
22
+ # extract_marc("040a", :translation_map => "languages")
23
+ #
24
+ # Or you can always create one yourself and use it how you like:
25
+ #
26
+ # map = TranslationMap.new("languages")
27
+ #
28
+ # In either case, TranslationMap will look for a file named, in that example,
29
+ # `languages.rb` or `languages.yaml` or `languages.properties`,
30
+ # somewhere in the ruby $LOAD_PATH in a `/translation_maps` subdir.
31
+ #
32
+ # * Also looks for "/translation_maps" subdir in load paths, so
25
33
  # for instance you can have a gem that keeps translation maps
26
- # in ./lib/translation_maps, and it Just Works.
34
+ # in ./lib/translation_maps, and it Just Works.
35
+ #
27
36
  # * Note you do NOT supply the .rb, .yaml, or .properties suffix yourself,
28
- # it'll use whichever it finds (allows calling code to not care which is used).
37
+ # it'll use whichever it finds (allows calling code to not care which is used).
29
38
  #
30
39
  # Ruby files just need to have their last line eval to a hash. The file
31
40
  # will be run through `eval`, don't do it with untrusted content (naturally)
@@ -33,7 +42,7 @@ module Traject
33
42
  # You can also pass in a Hash for consistency to TranslationMap.new, although
34
43
  # I don't know why you'd want to.
35
44
  #
36
- # == Special default handling
45
+ # ## Special default handling
37
46
  #
38
47
  # The key "__default__" in the hash is treated specially. If set to a string,
39
48
  # that string will be returned by the TranslationMap for any input not otherwise
@@ -50,19 +59,20 @@ module Traject
50
59
  # TranslationMap.new("something", :default => "foo")
51
60
  # TranslationMap.new("something", :default => :passthrough)
52
61
  #
53
- # == Output: String or array of strings
62
+ # ## Output: String or array of strings
54
63
  #
55
64
  # The output can be a string or an array of strings, or nil. It should not be anything else.
56
65
  # When used with the #translate_array! method, one string can be replaced by multiple values
57
66
  # (array of strings) or removed (nil)
58
67
  #
59
- # There's no way to specify multiple return values in a .properties, use .yaml or .rb for that.
68
+ # There's no way to specify multiple return values in a .properties, use .yaml or .rb for that.
69
+ #
70
+ # ## Caching
60
71
  #
61
- # == Caching
62
72
  # Lookup and loading of configuration files will be cached, for efficiency.
63
73
  # You can reset with `TranslationMap.reset_cache!`
64
74
  #
65
- # == YAML example:
75
+ # ## YAML example:
66
76
  #
67
77
  # key: value
68
78
  # key2: value2 multiple words fine
@@ -71,13 +81,35 @@ module Traject
71
81
  # - array
72
82
  # - of
73
83
  # - values look like this
84
+ #
85
+ # ## Alternatives
86
+ # `Traject::TranslationMap` provides an easy way to deal with the most common translation case:
87
+ # simple key-value stores with optional default values.
88
+ #
89
+ # If you need more complex translation, you can simply use `#map!`
90
+ # or its kin to work on the `accumulator` in a block
91
+ #
92
+ #
93
+ #
94
+ # # get a lousy language detection of any vernacular title
95
+ # require 'whatlanguage'
96
+ # wl = WhatLanguage.new(:all)
97
+ # to_field 'vernacular_language', extract_marc('245', :alternate_script=>:only) do |rec, acc|
98
+ # # accumulator is already filled with the values of any 880s that reference a 245 because
99
+ # # of the call to #extract_marc
100
+ # acc.map! {|x| wl.language(x) }
101
+ # acc.uniq!
102
+ # end
103
+ # Within the block, you may also be interested in using:
104
+ # * a case-insensitive hash, perhaps like [this one](https://github.com/junegunn/insensitive_hash)
105
+ # * a [MatchMap](https://github.com/billdueber/match_map), which implements pattern-matching logic similar to solrmarc's pattern files
74
106
  class TranslationMap
75
107
  class Cache
76
108
  def initialize
77
- @cached = Hash.new
109
+ @cached = Hash.new
78
110
  end
79
111
 
80
- # Returns an actual Hash -- or nil if none found.
112
+ # Returns an actual Hash -- or nil if none found.
81
113
  def lookup(path)
82
114
  unless @cached.has_key?(path)
83
115
  @cached[path] = _lookup!(path)
@@ -86,9 +118,9 @@ module Traject
86
118
  end
87
119
 
88
120
  # force lookup, without using cache.
89
- # used by cache. Returns the actual hash.
90
- # Returns nil if none found.
91
- # May raise on syntax error in file being loaded.
121
+ # used by cache. Returns the actual hash.
122
+ # Returns nil if none found.
123
+ # May raise on syntax error in file being loaded.
92
124
  def _lookup!(path)
93
125
  found = nil
94
126
 
@@ -110,7 +142,7 @@ module Traject
110
142
  end
111
143
 
112
144
  # Cached hash can't be mutated without weird consequences, let's
113
- # freeze it!
145
+ # freeze it!
114
146
  found.freeze if found
115
147
 
116
148
  return found
@@ -163,20 +195,20 @@ module Traject
163
195
  alias_method :map, :[]
164
196
 
165
197
  # Returns a dup of internal hash, dup so you can modify it
166
- # if you like.
198
+ # if you like.
167
199
  def to_hash
168
200
  @hash.dup
169
201
  end
170
202
 
171
203
  # Run every element of an array through this translation map,
172
204
  # return the resulting array. If translation map returns nil,
173
- # original element will be missing from output.
205
+ # original element will be missing from output.
174
206
  #
175
207
  # If an input maps to an array, each element of the array will be flattened
176
- # into the output.
208
+ # into the output.
177
209
  #
178
210
  # If an input maps to nil, it will cause the input element to be removed
179
- # entirely.
211
+ # entirely.
180
212
  def translate_array(array)
181
213
  array.each_with_object([]) do |input_element, output_array|
182
214
  output_element = self.map(input_element)
@@ -200,7 +232,7 @@ module Traject
200
232
 
201
233
  protected
202
234
 
203
- # No built-in way to read java-style .properties, we hack it.
235
+ # No built-in way to read java-style .properties, we hack it.
204
236
  # inspired by various hacky things found by googling "ruby java properties parse"
205
237
  # .properties spec seems to be:
206
238
  # http://docs.oracle.com/javase/6/docs/api/java/util/Properties.html#load%28java.io.Reader%29
@@ -213,10 +245,10 @@ module Traject
213
245
  f.each_line do |line|
214
246
  i += 1
215
247
 
216
- line.strip!
248
+ line.strip!
217
249
 
218
250
  # skip blank lines
219
- next if line.empty?
251
+ next if line.empty?
220
252
 
221
253
  # skip comment lines
222
254
  next if line =~ /^\s*[!\#].*$/
@@ -73,9 +73,9 @@ module Traject
73
73
  end
74
74
 
75
75
  # just does a `require 'java'` but rescues the exception if we
76
- # aren't jruby, and raises a better error message.
76
+ # aren't jruby, and raises a better error message.
77
77
  # Pass in a developer-presentable name of a feature to include in the error
78
- # message if you want.
78
+ # message if you want.
79
79
  def self.jruby_ensure_init!(feature = nil)
80
80
  begin
81
81
  require 'java'
@@ -89,4 +89,4 @@ module Traject
89
89
  end
90
90
 
91
91
  end
92
- end
92
+ end
@@ -1,3 +1,3 @@
1
1
  module Traject
2
- VERSION = "0.16.0"
2
+ VERSION = "0.17.0"
3
3
  end
@@ -6,4 +6,4 @@ class Traject::YamlWriter < Traject::LineWriter
6
6
  context.output_hash.to_yaml(:indentation=>3, :line_width => 78) + "\n\n"
7
7
  end
8
8
  end
9
-
9
+
@@ -15,11 +15,11 @@ describe 'Simple output' do
15
15
  end
16
16
  @io = StringIO.new
17
17
  @writer = Traject::DebugWriter.new("output_stream" => @io)
18
-
18
+
19
19
  @id = "2710183"
20
20
  @title = "Manufacturing consent : the political economy of the mass media /"
21
21
  end
22
-
22
+
23
23
  it "does a simple output" do
24
24
  @writer.put Traject::Indexer::Context.new(:output_hash => @indexer.map_record(@record))
25
25
  expected = [
@@ -29,10 +29,10 @@ describe 'Simple output' do
29
29
  ]
30
30
  assert_equal expected.join("\n").gsub(/\s/, ''), @io.string.gsub(/\s/, '')
31
31
  @writer.close
32
-
32
+
33
33
  end
34
-
34
+
35
35
  end
36
-
37
-
38
-
36
+
37
+
38
+
@@ -42,18 +42,18 @@ describe "Traject::Indexer#each_record" do
42
42
  flunk("Should only fail with a ArityError")
43
43
  end
44
44
  end
45
-
45
+
46
46
  it "rejects each_record with a name (e.g., using a to_field syntax)" do
47
47
  assert_raises(Traject::Indexer::NamingError) do
48
48
  @indexer.each_record('bad_name') {|one, two| }
49
49
  end
50
50
  end
51
-
51
+
52
52
  it "reject each_record with no arguments/blocks at all" do
53
53
  assert_raises(ArgumentError) do
54
54
  @indexer.each_record()
55
55
  end
56
- end
56
+ end
57
57
 
58
58
  end
59
- end
59
+ end
@@ -198,7 +198,7 @@ describe "Traject::Macros::Marc21Semantics" do
198
198
  @record = MARC::Reader.new(support_file_path "multi_geo.marc").to_a.first
199
199
  output = @indexer.map_record(@record)
200
200
 
201
- assert_equal ["Europe", "Middle East", "Africa, North", "Agora (Athens, Greece)", "Rome (Italy)", "Italy"],
201
+ assert_equal ["Europe", "Middle East", "Africa, North", "Agora (Athens, Greece)", "Rome (Italy)", "Italy"],
202
202
  output["geo_facet"]
203
203
  end
204
204
  it "maps nothing on a record with no geo" do
@@ -221,12 +221,12 @@ describe "Traject::Macros::Marc21Semantics" do
221
221
  end
222
222
 
223
223
  end
224
-
224
+
225
225
  describe "extract_marc_filing_version" do
226
226
  before do
227
227
  @record = MARC::Reader.new(support_file_path "the_business_ren.marc").to_a.first
228
228
  end
229
-
229
+
230
230
  it "works as expected" do
231
231
  @indexer.instance_eval do
232
232
  to_field 'title_phrase', extract_marc_filing_version('245ab')
@@ -234,7 +234,7 @@ describe "Traject::Macros::Marc21Semantics" do
234
234
  output = @indexer.map_record(@record)
235
235
  assert_equal ['Business renaissance quarterly'], output['title_phrase']
236
236
  end
237
-
237
+
238
238
  it "works with :include_original" do
239
239
  @indexer.instance_eval do
240
240
  to_field 'title_phrase', extract_marc_filing_version('245ab', :include_original=>true)
@@ -242,7 +242,7 @@ describe "Traject::Macros::Marc21Semantics" do
242
242
  output = @indexer.map_record(@record)
243
243
  assert_equal ['The Business renaissance quarterly', 'Business renaissance quarterly'], output['title_phrase']
244
244
  end
245
-
245
+
246
246
  it "doesn't do anything if you don't include the first subfield" do
247
247
  @indexer.instance_eval do
248
248
  to_field 'title_phrase', extract_marc_filing_version('245h')
@@ -250,8 +250,8 @@ describe "Traject::Macros::Marc21Semantics" do
250
250
  output = @indexer.map_record(@record)
251
251
  assert_equal ['[electronic resource].'], output['title_phrase']
252
252
  end
253
-
254
-
253
+
254
+
255
255
  it "dies if you pass it something else" do
256
256
  assert_raises(RuntimeError) do
257
257
  @indexer.instance_eval do
@@ -259,10 +259,10 @@ describe "Traject::Macros::Marc21Semantics" do
259
259
  end
260
260
  end
261
261
  end
262
-
263
-
262
+
263
+
264
264
  end
265
-
266
-
267
265
 
268
- end
266
+
267
+
268
+ end
@@ -56,22 +56,22 @@ describe "Traject::Macros::Marc21" do
56
56
 
57
57
  assert_equal ["DEFAULT VALUE"], output["only_default"]
58
58
  end
59
-
59
+
60
60
  it "de-duplicates by default, respects :allow_duplicates" do
61
61
  # Add a second 008
62
62
  f = @record.fields('008').first
63
63
  @record.append(f)
64
-
64
+
65
65
  @indexer.instance_eval do
66
66
  to_field "lang1", extract_marc('008[35-37]')
67
- to_field "lang2", extract_marc('008[35-37]', :allow_duplicates=>true)
67
+ to_field "lang2", extract_marc('008[35-37]', :allow_duplicates=>true)
68
68
  end
69
-
69
+
70
70
  output = @indexer.map_record(@record)
71
71
  assert_equal ["eng"], output['lang1']
72
- assert_equal ["eng", "eng"], output['lang2']
72
+ assert_equal ["eng", "eng"], output['lang2']
73
73
  end
74
-
74
+
75
75
  it "fails on an extra/misspelled argument to extract_marc" do
76
76
  assert_raises(RuntimeError) do
77
77
  @indexer.instance_eval do
@@ -79,9 +79,9 @@ describe "Traject::Macros::Marc21" do
79
79
  end
80
80
  end
81
81
  end
82
-
83
-
84
-
82
+
83
+
84
+
85
85
 
86
86
  it "Marc21::trim_punctuation class method" do
87
87
  assert_equal "one two three", Marc21.trim_punctuation("one two three")
@@ -177,4 +177,4 @@ describe "Traject::Macros::Marc21" do
177
177
  end
178
178
 
179
179
 
180
- end
180
+ end
@@ -37,4 +37,4 @@ describe "Indexer Macros:" do
37
37
 
38
38
 
39
39
 
40
- end
40
+ end
@@ -184,21 +184,21 @@ describe "Traject::Indexer#map_record" do
184
184
 
185
185
  assert called, "Called mapping routine"
186
186
  end
187
-
187
+
188
188
  it "skips records" do
189
-
189
+
190
190
  @indexer.to_field("beforeSkip") do |rec, acc|
191
191
  acc << "Before"
192
192
  end
193
-
193
+
194
194
  @indexer.to_field('radical') do |rec, acc, context|
195
195
  context.skip!("Chomsky!") if rec['245'].to_s =~ /Chomsky/
196
196
  end
197
-
197
+
198
198
  @indexer.to_field('afterSkip') do |rec, acc|
199
199
  acc << "After. Should never happen"
200
200
  end
201
-
201
+
202
202
  output = @indexer.map_record(@record)
203
203
  assert_equal ['Before'], output['beforeSkip']
204
204
  assert_nil output['afterSkip']
@@ -206,4 +206,4 @@ describe "Traject::Indexer#map_record" do
206
206
 
207
207
  end
208
208
 
209
- end
209
+ end
@@ -27,9 +27,9 @@ describe "Traject::Indexer#process" do
27
27
  end
28
28
 
29
29
  it "works" do
30
- # oops, this times_called counter isn't thread-safe under multi-threading
30
+ # oops, this times_called counter isn't thread-safe under multi-threading
31
31
  # is why this fails sometimes.
32
- # fixed to be single-threaded for these tests.
32
+ # fixed to be single-threaded for these tests.
33
33
  times_called = 0
34
34
  @indexer.to_field("title") do |record, accumulator, context|
35
35
  times_called += 1
@@ -68,7 +68,7 @@ describe "Traject::Indexer#process" do
68
68
  "solr.url" => "http://example.org",
69
69
  "writer_class_name" => "Traject::SolrJWriter"
70
70
  )
71
- @file = File.open(support_file_path "manufacturing_consent.marc")
71
+ @file = File.open(support_file_path "manufacturing_consent.marc")
72
72
 
73
73
 
74
74
  @indexer.to_field("id") do |record, accumulator|
@@ -80,5 +80,44 @@ describe "Traject::Indexer#process" do
80
80
  assert ! return_value, "returns false on skipped record errors"
81
81
  end
82
82
 
83
+ require 'traject/null_writer'
84
+ it "calls after_processing after processing" do
85
+ @indexer = Traject::Indexer.new(
86
+ "solrj_writer.server_class_name" => "MockSolrServer",
87
+ "solr.url" => "http://example.org",
88
+ "writer_class_name" => "Traject::NullWriter"
89
+ )
90
+ @file = File.open(support_file_path "test_data.utf8.mrc")
91
+
92
+ called = []
93
+
94
+ @indexer.after_processing do
95
+ called << :one
96
+ end
97
+ @indexer.after_processing do
98
+ called << :two
99
+ end
100
+
101
+ @indexer.process(@file)
102
+
103
+ assert_equal [:one, :two], called, "Both after_processing hooks called, in order"
104
+ end
105
+
106
+ describe "demo_config.rb" do
107
+ before do
108
+ @indexer = Traject::Indexer.new(
109
+ "solrj_writer.server_class_name" => "MockSolrServer",
110
+ "solr.url" => "http://example.org",
111
+ "writer_class_name" => "Traject::NullWriter"
112
+ )
113
+ end
114
+
115
+ it "parses and loads" do
116
+ conf_path = support_file_path "demo_config.rb"
117
+ File.open(conf_path) do |file_io|
118
+ @indexer.instance_eval(file_io.read, conf_path)
119
+ end
120
+ end
121
+ end
83
122
 
84
- end
123
+ end