traject 0.16.0 → 0.17.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +1 -0
  3. data/README.md +183 -191
  4. data/bench/bench.rb +1 -1
  5. data/doc/batch_execution.md +14 -0
  6. data/doc/extending.md +14 -12
  7. data/doc/indexing_rules.md +265 -0
  8. data/lib/traject/command_line.rb +12 -41
  9. data/lib/traject/debug_writer.rb +32 -13
  10. data/lib/traject/indexer.rb +101 -24
  11. data/lib/traject/indexer/settings.rb +18 -17
  12. data/lib/traject/json_writer.rb +32 -11
  13. data/lib/traject/line_writer.rb +6 -6
  14. data/lib/traject/macros/basic.rb +1 -1
  15. data/lib/traject/macros/marc21.rb +17 -13
  16. data/lib/traject/macros/marc21_semantics.rb +27 -25
  17. data/lib/traject/macros/marc_format_classifier.rb +39 -25
  18. data/lib/traject/marc4j_reader.rb +36 -22
  19. data/lib/traject/marc_extractor.rb +79 -75
  20. data/lib/traject/marc_reader.rb +33 -25
  21. data/lib/traject/mock_reader.rb +9 -10
  22. data/lib/traject/ndj_reader.rb +7 -7
  23. data/lib/traject/null_writer.rb +1 -1
  24. data/lib/traject/qualified_const_get.rb +12 -2
  25. data/lib/traject/solrj_writer.rb +61 -52
  26. data/lib/traject/thread_pool.rb +45 -45
  27. data/lib/traject/translation_map.rb +59 -27
  28. data/lib/traject/util.rb +3 -3
  29. data/lib/traject/version.rb +1 -1
  30. data/lib/traject/yaml_writer.rb +1 -1
  31. data/test/debug_writer_test.rb +7 -7
  32. data/test/indexer/each_record_test.rb +4 -4
  33. data/test/indexer/macros_marc21_semantics_test.rb +12 -12
  34. data/test/indexer/macros_marc21_test.rb +10 -10
  35. data/test/indexer/macros_test.rb +1 -1
  36. data/test/indexer/map_record_test.rb +6 -6
  37. data/test/indexer/read_write_test.rb +43 -4
  38. data/test/indexer/settings_test.rb +2 -2
  39. data/test/indexer/to_field_test.rb +8 -8
  40. data/test/marc4j_reader_test.rb +4 -4
  41. data/test/marc_extractor_test.rb +33 -25
  42. data/test/marc_format_classifier_test.rb +3 -3
  43. data/test/marc_reader_test.rb +2 -2
  44. data/test/test_helper.rb +3 -3
  45. data/test/test_support/demo_config.rb +52 -48
  46. data/test/translation_map_test.rb +22 -4
  47. data/test/translation_maps/bad_ruby.rb +2 -2
  48. data/test/translation_maps/both_map.rb +1 -1
  49. data/test/translation_maps/default_literal.rb +1 -1
  50. data/test/translation_maps/default_passthrough.rb +1 -1
  51. data/test/translation_maps/ruby_map.rb +1 -1
  52. metadata +7 -31
  53. data/doc/macros.md +0 -103
@@ -13,19 +13,28 @@ module Traject
13
13
  # or array of strings.
14
14
  #
15
15
  # What makes it more useful than a stunted hash is its ability to load
16
- # the hash definitions from configuration files, either pure ruby,
17
- # yaml, or java .properties. (Limited basic .properties, don't try any fancy escaping please,
18
- # no = or : in key names, no split lines.)
16
+ # the hash definitions from configuration files, either pure ruby,
17
+ # yaml, or (limited subset of) java .properties file.
19
18
  #
20
- # TranslationMap.new("dir/some_file")
19
+ # traject's `extract_marc` macro allows you to specify a :translation_map=>filename argument
20
+ # that will automatically find and use a translation map on the resulting data:
21
21
  #
22
- # Will look for a file named `some_file.rb` or `some_file.yaml` or `some_file.properties`,
23
- # somewhere in the ruby $LOAD_PATH in a `/translation_maps` subdir.
24
- # * Looks for "/translation_maps" subdir in load paths, so
22
+ # extract_marc("040a", :translation_map => "languages")
23
+ #
24
+ # Or you can always create one yourself and use it how you like:
25
+ #
26
+ # map = TranslationMap.new("languages")
27
+ #
28
+ # In either case, TranslationMap will look for a file named, in that example,
29
+ # `languages.rb` or `languages.yaml` or `languages.properties`,
30
+ # somewhere in the ruby $LOAD_PATH in a `/translation_maps` subdir.
31
+ #
32
+ # * Also looks for "/translation_maps" subdir in load paths, so
25
33
  # for instance you can have a gem that keeps translation maps
26
- # in ./lib/translation_maps, and it Just Works.
34
+ # in ./lib/translation_maps, and it Just Works.
35
+ #
27
36
  # * Note you do NOT supply the .rb, .yaml, or .properties suffix yourself,
28
- # it'll use whichever it finds (allows calling code to not care which is used).
37
+ # it'll use whichever it finds (allows calling code to not care which is used).
29
38
  #
30
39
  # Ruby files just need to have their last line eval to a hash. The file
31
40
  # will be run through `eval`, don't do it with untrusted content (naturally)
@@ -33,7 +42,7 @@ module Traject
33
42
  # You can also pass in a Hash for consistency to TranslationMap.new, although
34
43
  # I don't know why you'd want to.
35
44
  #
36
- # == Special default handling
45
+ # ## Special default handling
37
46
  #
38
47
  # The key "__default__" in the hash is treated specially. If set to a string,
39
48
  # that string will be returned by the TranslationMap for any input not otherwise
@@ -50,19 +59,20 @@ module Traject
50
59
  # TranslationMap.new("something", :default => "foo")
51
60
  # TranslationMap.new("something", :default => :passthrough)
52
61
  #
53
- # == Output: String or array of strings
62
+ # ## Output: String or array of strings
54
63
  #
55
64
  # The output can be a string or an array of strings, or nil. It should not be anything else.
56
65
  # When used with the #translate_array! method, one string can be replaced by multiple values
57
66
  # (array of strings) or removed (nil)
58
67
  #
59
- # There's no way to specify multiple return values in a .properties, use .yaml or .rb for that.
68
+ # There's no way to specify multiple return values in a .properties, use .yaml or .rb for that.
69
+ #
70
+ # ## Caching
60
71
  #
61
- # == Caching
62
72
  # Lookup and loading of configuration files will be cached, for efficiency.
63
73
  # You can reset with `TranslationMap.reset_cache!`
64
74
  #
65
- # == YAML example:
75
+ # ## YAML example:
66
76
  #
67
77
  # key: value
68
78
  # key2: value2 multiple words fine
@@ -71,13 +81,35 @@ module Traject
71
81
  # - array
72
82
  # - of
73
83
  # - values look like this
84
+ #
85
+ # ## Alternatives
86
+ # `Traject::TranslationMap` provides an easy way to deal with the most common translation case:
87
+ # simple key-value stores with optional default values.
88
+ #
89
+ # If you need more complex translation, you can simply use `#map!`
90
+ # or its kin to work on the `accumulator` in a block
91
+ #
92
+ #
93
+ #
94
+ # # get a lousy language detection of any vernacular title
95
+ # require 'whatlanguage'
96
+ # wl = WhatLanguage.new(:all)
97
+ # to_field 'vernacular_language', extract_marc('245', :alternate_script=>:only) do |rec, acc|
98
+ # # accumulator is already filled with the values of any 880s that reference a 245 because
99
+ # # of the call to #extract_marc
100
+ # acc.map! {|x| wl.language(x) }
101
+ # acc.uniq!
102
+ # end
103
+ # Within the block, you may also be interested in using:
104
+ # * a case-insensitive hash, perhaps like [this one](https://github.com/junegunn/insensitive_hash)
105
+ # * a [MatchMap](https://github.com/billdueber/match_map), which implements pattern-matching logic similar to solrmarc's pattern files
74
106
  class TranslationMap
75
107
  class Cache
76
108
  def initialize
77
- @cached = Hash.new
109
+ @cached = Hash.new
78
110
  end
79
111
 
80
- # Returns an actual Hash -- or nil if none found.
112
+ # Returns an actual Hash -- or nil if none found.
81
113
  def lookup(path)
82
114
  unless @cached.has_key?(path)
83
115
  @cached[path] = _lookup!(path)
@@ -86,9 +118,9 @@ module Traject
86
118
  end
87
119
 
88
120
  # force lookup, without using cache.
89
- # used by cache. Returns the actual hash.
90
- # Returns nil if none found.
91
- # May raise on syntax error in file being loaded.
121
+ # used by cache. Returns the actual hash.
122
+ # Returns nil if none found.
123
+ # May raise on syntax error in file being loaded.
92
124
  def _lookup!(path)
93
125
  found = nil
94
126
 
@@ -110,7 +142,7 @@ module Traject
110
142
  end
111
143
 
112
144
  # Cached hash can't be mutated without weird consequences, let's
113
- # freeze it!
145
+ # freeze it!
114
146
  found.freeze if found
115
147
 
116
148
  return found
@@ -163,20 +195,20 @@ module Traject
163
195
  alias_method :map, :[]
164
196
 
165
197
  # Returns a dup of internal hash, dup so you can modify it
166
- # if you like.
198
+ # if you like.
167
199
  def to_hash
168
200
  @hash.dup
169
201
  end
170
202
 
171
203
  # Run every element of an array through this translation map,
172
204
  # return the resulting array. If translation map returns nil,
173
- # original element will be missing from output.
205
+ # original element will be missing from output.
174
206
  #
175
207
  # If an input maps to an array, each element of the array will be flattened
176
- # into the output.
208
+ # into the output.
177
209
  #
178
210
  # If an input maps to nil, it will cause the input element to be removed
179
- # entirely.
211
+ # entirely.
180
212
  def translate_array(array)
181
213
  array.each_with_object([]) do |input_element, output_array|
182
214
  output_element = self.map(input_element)
@@ -200,7 +232,7 @@ module Traject
200
232
 
201
233
  protected
202
234
 
203
- # No built-in way to read java-style .properties, we hack it.
235
+ # No built-in way to read java-style .properties, we hack it.
204
236
  # inspired by various hacky things found by googling "ruby java properties parse"
205
237
  # .properties spec seems to be:
206
238
  # http://docs.oracle.com/javase/6/docs/api/java/util/Properties.html#load%28java.io.Reader%29
@@ -213,10 +245,10 @@ module Traject
213
245
  f.each_line do |line|
214
246
  i += 1
215
247
 
216
- line.strip!
248
+ line.strip!
217
249
 
218
250
  # skip blank lines
219
- next if line.empty?
251
+ next if line.empty?
220
252
 
221
253
  # skip comment lines
222
254
  next if line =~ /^\s*[!\#].*$/
@@ -73,9 +73,9 @@ module Traject
73
73
  end
74
74
 
75
75
  # just does a `require 'java'` but rescues the exception if we
76
- # aren't jruby, and raises a better error message.
76
+ # aren't jruby, and raises a better error message.
77
77
  # Pass in a developer-presentable name of a feature to include in the error
78
- # message if you want.
78
+ # message if you want.
79
79
  def self.jruby_ensure_init!(feature = nil)
80
80
  begin
81
81
  require 'java'
@@ -89,4 +89,4 @@ module Traject
89
89
  end
90
90
 
91
91
  end
92
- end
92
+ end
@@ -1,3 +1,3 @@
1
1
  module Traject
2
- VERSION = "0.16.0"
2
+ VERSION = "0.17.0"
3
3
  end
@@ -6,4 +6,4 @@ class Traject::YamlWriter < Traject::LineWriter
6
6
  context.output_hash.to_yaml(:indentation=>3, :line_width => 78) + "\n\n"
7
7
  end
8
8
  end
9
-
9
+
@@ -15,11 +15,11 @@ describe 'Simple output' do
15
15
  end
16
16
  @io = StringIO.new
17
17
  @writer = Traject::DebugWriter.new("output_stream" => @io)
18
-
18
+
19
19
  @id = "2710183"
20
20
  @title = "Manufacturing consent : the political economy of the mass media /"
21
21
  end
22
-
22
+
23
23
  it "does a simple output" do
24
24
  @writer.put Traject::Indexer::Context.new(:output_hash => @indexer.map_record(@record))
25
25
  expected = [
@@ -29,10 +29,10 @@ describe 'Simple output' do
29
29
  ]
30
30
  assert_equal expected.join("\n").gsub(/\s/, ''), @io.string.gsub(/\s/, '')
31
31
  @writer.close
32
-
32
+
33
33
  end
34
-
34
+
35
35
  end
36
-
37
-
38
-
36
+
37
+
38
+
@@ -42,18 +42,18 @@ describe "Traject::Indexer#each_record" do
42
42
  flunk("Should only fail with a ArityError")
43
43
  end
44
44
  end
45
-
45
+
46
46
  it "rejects each_record with a name (e.g., using a to_field syntax)" do
47
47
  assert_raises(Traject::Indexer::NamingError) do
48
48
  @indexer.each_record('bad_name') {|one, two| }
49
49
  end
50
50
  end
51
-
51
+
52
52
  it "reject each_record with no arguments/blocks at all" do
53
53
  assert_raises(ArgumentError) do
54
54
  @indexer.each_record()
55
55
  end
56
- end
56
+ end
57
57
 
58
58
  end
59
- end
59
+ end
@@ -198,7 +198,7 @@ describe "Traject::Macros::Marc21Semantics" do
198
198
  @record = MARC::Reader.new(support_file_path "multi_geo.marc").to_a.first
199
199
  output = @indexer.map_record(@record)
200
200
 
201
- assert_equal ["Europe", "Middle East", "Africa, North", "Agora (Athens, Greece)", "Rome (Italy)", "Italy"],
201
+ assert_equal ["Europe", "Middle East", "Africa, North", "Agora (Athens, Greece)", "Rome (Italy)", "Italy"],
202
202
  output["geo_facet"]
203
203
  end
204
204
  it "maps nothing on a record with no geo" do
@@ -221,12 +221,12 @@ describe "Traject::Macros::Marc21Semantics" do
221
221
  end
222
222
 
223
223
  end
224
-
224
+
225
225
  describe "extract_marc_filing_version" do
226
226
  before do
227
227
  @record = MARC::Reader.new(support_file_path "the_business_ren.marc").to_a.first
228
228
  end
229
-
229
+
230
230
  it "works as expected" do
231
231
  @indexer.instance_eval do
232
232
  to_field 'title_phrase', extract_marc_filing_version('245ab')
@@ -234,7 +234,7 @@ describe "Traject::Macros::Marc21Semantics" do
234
234
  output = @indexer.map_record(@record)
235
235
  assert_equal ['Business renaissance quarterly'], output['title_phrase']
236
236
  end
237
-
237
+
238
238
  it "works with :include_original" do
239
239
  @indexer.instance_eval do
240
240
  to_field 'title_phrase', extract_marc_filing_version('245ab', :include_original=>true)
@@ -242,7 +242,7 @@ describe "Traject::Macros::Marc21Semantics" do
242
242
  output = @indexer.map_record(@record)
243
243
  assert_equal ['The Business renaissance quarterly', 'Business renaissance quarterly'], output['title_phrase']
244
244
  end
245
-
245
+
246
246
  it "doesn't do anything if you don't include the first subfield" do
247
247
  @indexer.instance_eval do
248
248
  to_field 'title_phrase', extract_marc_filing_version('245h')
@@ -250,8 +250,8 @@ describe "Traject::Macros::Marc21Semantics" do
250
250
  output = @indexer.map_record(@record)
251
251
  assert_equal ['[electronic resource].'], output['title_phrase']
252
252
  end
253
-
254
-
253
+
254
+
255
255
  it "dies if you pass it something else" do
256
256
  assert_raises(RuntimeError) do
257
257
  @indexer.instance_eval do
@@ -259,10 +259,10 @@ describe "Traject::Macros::Marc21Semantics" do
259
259
  end
260
260
  end
261
261
  end
262
-
263
-
262
+
263
+
264
264
  end
265
-
266
-
267
265
 
268
- end
266
+
267
+
268
+ end
@@ -56,22 +56,22 @@ describe "Traject::Macros::Marc21" do
56
56
 
57
57
  assert_equal ["DEFAULT VALUE"], output["only_default"]
58
58
  end
59
-
59
+
60
60
  it "de-duplicates by default, respects :allow_duplicates" do
61
61
  # Add a second 008
62
62
  f = @record.fields('008').first
63
63
  @record.append(f)
64
-
64
+
65
65
  @indexer.instance_eval do
66
66
  to_field "lang1", extract_marc('008[35-37]')
67
- to_field "lang2", extract_marc('008[35-37]', :allow_duplicates=>true)
67
+ to_field "lang2", extract_marc('008[35-37]', :allow_duplicates=>true)
68
68
  end
69
-
69
+
70
70
  output = @indexer.map_record(@record)
71
71
  assert_equal ["eng"], output['lang1']
72
- assert_equal ["eng", "eng"], output['lang2']
72
+ assert_equal ["eng", "eng"], output['lang2']
73
73
  end
74
-
74
+
75
75
  it "fails on an extra/misspelled argument to extract_marc" do
76
76
  assert_raises(RuntimeError) do
77
77
  @indexer.instance_eval do
@@ -79,9 +79,9 @@ describe "Traject::Macros::Marc21" do
79
79
  end
80
80
  end
81
81
  end
82
-
83
-
84
-
82
+
83
+
84
+
85
85
 
86
86
  it "Marc21::trim_punctuation class method" do
87
87
  assert_equal "one two three", Marc21.trim_punctuation("one two three")
@@ -177,4 +177,4 @@ describe "Traject::Macros::Marc21" do
177
177
  end
178
178
 
179
179
 
180
- end
180
+ end
@@ -37,4 +37,4 @@ describe "Indexer Macros:" do
37
37
 
38
38
 
39
39
 
40
- end
40
+ end
@@ -184,21 +184,21 @@ describe "Traject::Indexer#map_record" do
184
184
 
185
185
  assert called, "Called mapping routine"
186
186
  end
187
-
187
+
188
188
  it "skips records" do
189
-
189
+
190
190
  @indexer.to_field("beforeSkip") do |rec, acc|
191
191
  acc << "Before"
192
192
  end
193
-
193
+
194
194
  @indexer.to_field('radical') do |rec, acc, context|
195
195
  context.skip!("Chomsky!") if rec['245'].to_s =~ /Chomsky/
196
196
  end
197
-
197
+
198
198
  @indexer.to_field('afterSkip') do |rec, acc|
199
199
  acc << "After. Should never happen"
200
200
  end
201
-
201
+
202
202
  output = @indexer.map_record(@record)
203
203
  assert_equal ['Before'], output['beforeSkip']
204
204
  assert_nil output['afterSkip']
@@ -206,4 +206,4 @@ describe "Traject::Indexer#map_record" do
206
206
 
207
207
  end
208
208
 
209
- end
209
+ end
@@ -27,9 +27,9 @@ describe "Traject::Indexer#process" do
27
27
  end
28
28
 
29
29
  it "works" do
30
- # oops, this times_called counter isn't thread-safe under multi-threading
30
+ # oops, this times_called counter isn't thread-safe under multi-threading
31
31
  # is why this fails sometimes.
32
- # fixed to be single-threaded for these tests.
32
+ # fixed to be single-threaded for these tests.
33
33
  times_called = 0
34
34
  @indexer.to_field("title") do |record, accumulator, context|
35
35
  times_called += 1
@@ -68,7 +68,7 @@ describe "Traject::Indexer#process" do
68
68
  "solr.url" => "http://example.org",
69
69
  "writer_class_name" => "Traject::SolrJWriter"
70
70
  )
71
- @file = File.open(support_file_path "manufacturing_consent.marc")
71
+ @file = File.open(support_file_path "manufacturing_consent.marc")
72
72
 
73
73
 
74
74
  @indexer.to_field("id") do |record, accumulator|
@@ -80,5 +80,44 @@ describe "Traject::Indexer#process" do
80
80
  assert ! return_value, "returns false on skipped record errors"
81
81
  end
82
82
 
83
+ require 'traject/null_writer'
84
+ it "calls after_processing after processing" do
85
+ @indexer = Traject::Indexer.new(
86
+ "solrj_writer.server_class_name" => "MockSolrServer",
87
+ "solr.url" => "http://example.org",
88
+ "writer_class_name" => "Traject::NullWriter"
89
+ )
90
+ @file = File.open(support_file_path "test_data.utf8.mrc")
91
+
92
+ called = []
93
+
94
+ @indexer.after_processing do
95
+ called << :one
96
+ end
97
+ @indexer.after_processing do
98
+ called << :two
99
+ end
100
+
101
+ @indexer.process(@file)
102
+
103
+ assert_equal [:one, :two], called, "Both after_processing hooks called, in order"
104
+ end
105
+
106
+ describe "demo_config.rb" do
107
+ before do
108
+ @indexer = Traject::Indexer.new(
109
+ "solrj_writer.server_class_name" => "MockSolrServer",
110
+ "solr.url" => "http://example.org",
111
+ "writer_class_name" => "Traject::NullWriter"
112
+ )
113
+ end
114
+
115
+ it "parses and loads" do
116
+ conf_path = support_file_path "demo_config.rb"
117
+ File.open(conf_path) do |file_io|
118
+ @indexer.instance_eval(file_io.read, conf_path)
119
+ end
120
+ end
121
+ end
83
122
 
84
- end
123
+ end