traject 0.16.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +1 -0
- data/README.md +183 -191
- data/bench/bench.rb +1 -1
- data/doc/batch_execution.md +14 -0
- data/doc/extending.md +14 -12
- data/doc/indexing_rules.md +265 -0
- data/lib/traject/command_line.rb +12 -41
- data/lib/traject/debug_writer.rb +32 -13
- data/lib/traject/indexer.rb +101 -24
- data/lib/traject/indexer/settings.rb +18 -17
- data/lib/traject/json_writer.rb +32 -11
- data/lib/traject/line_writer.rb +6 -6
- data/lib/traject/macros/basic.rb +1 -1
- data/lib/traject/macros/marc21.rb +17 -13
- data/lib/traject/macros/marc21_semantics.rb +27 -25
- data/lib/traject/macros/marc_format_classifier.rb +39 -25
- data/lib/traject/marc4j_reader.rb +36 -22
- data/lib/traject/marc_extractor.rb +79 -75
- data/lib/traject/marc_reader.rb +33 -25
- data/lib/traject/mock_reader.rb +9 -10
- data/lib/traject/ndj_reader.rb +7 -7
- data/lib/traject/null_writer.rb +1 -1
- data/lib/traject/qualified_const_get.rb +12 -2
- data/lib/traject/solrj_writer.rb +61 -52
- data/lib/traject/thread_pool.rb +45 -45
- data/lib/traject/translation_map.rb +59 -27
- data/lib/traject/util.rb +3 -3
- data/lib/traject/version.rb +1 -1
- data/lib/traject/yaml_writer.rb +1 -1
- data/test/debug_writer_test.rb +7 -7
- data/test/indexer/each_record_test.rb +4 -4
- data/test/indexer/macros_marc21_semantics_test.rb +12 -12
- data/test/indexer/macros_marc21_test.rb +10 -10
- data/test/indexer/macros_test.rb +1 -1
- data/test/indexer/map_record_test.rb +6 -6
- data/test/indexer/read_write_test.rb +43 -4
- data/test/indexer/settings_test.rb +2 -2
- data/test/indexer/to_field_test.rb +8 -8
- data/test/marc4j_reader_test.rb +4 -4
- data/test/marc_extractor_test.rb +33 -25
- data/test/marc_format_classifier_test.rb +3 -3
- data/test/marc_reader_test.rb +2 -2
- data/test/test_helper.rb +3 -3
- data/test/test_support/demo_config.rb +52 -48
- data/test/translation_map_test.rb +22 -4
- data/test/translation_maps/bad_ruby.rb +2 -2
- data/test/translation_maps/both_map.rb +1 -1
- data/test/translation_maps/default_literal.rb +1 -1
- data/test/translation_maps/default_passthrough.rb +1 -1
- data/test/translation_maps/ruby_map.rb +1 -1
- metadata +7 -31
- data/doc/macros.md +0 -103
@@ -13,19 +13,28 @@ module Traject
|
|
13
13
|
# or array of strings.
|
14
14
|
#
|
15
15
|
# What makes it more useful than a stunted hash is its ability to load
|
16
|
-
# the hash definitions from configuration files, either pure ruby,
|
17
|
-
# yaml, or java .properties
|
18
|
-
# no = or : in key names, no split lines.)
|
16
|
+
# the hash definitions from configuration files, either pure ruby,
|
17
|
+
# yaml, or (limited subset of) java .properties file.
|
19
18
|
#
|
20
|
-
#
|
19
|
+
# traject's `extract_marc` macro allows you to specify a :translation_map=>filename argument
|
20
|
+
# that will automatically find and use a translation map on the resulting data:
|
21
21
|
#
|
22
|
-
#
|
23
|
-
#
|
24
|
-
#
|
22
|
+
# extract_marc("040a", :translation_map => "languages")
|
23
|
+
#
|
24
|
+
# Or you can always create one yourself and use it how you like:
|
25
|
+
#
|
26
|
+
# map = TranslationMap.new("languages")
|
27
|
+
#
|
28
|
+
# In either case, TranslationMap will look for a file named, in that example,
|
29
|
+
# `languages.rb` or `languages.yaml` or `languages.properties`,
|
30
|
+
# somewhere in the ruby $LOAD_PATH in a `/translation_maps` subdir.
|
31
|
+
#
|
32
|
+
# * Also looks for "/translation_maps" subdir in load paths, so
|
25
33
|
# for instance you can have a gem that keeps translation maps
|
26
|
-
# in ./lib/translation_maps, and it Just Works.
|
34
|
+
# in ./lib/translation_maps, and it Just Works.
|
35
|
+
#
|
27
36
|
# * Note you do NOT supply the .rb, .yaml, or .properties suffix yourself,
|
28
|
-
#
|
37
|
+
# it'll use whichever it finds (allows calling code to not care which is used).
|
29
38
|
#
|
30
39
|
# Ruby files just need to have their last line eval to a hash. The file
|
31
40
|
# will be run through `eval`, don't do it with untrusted content (naturally)
|
@@ -33,7 +42,7 @@ module Traject
|
|
33
42
|
# You can also pass in a Hash for consistency to TranslationMap.new, although
|
34
43
|
# I don't know why you'd want to.
|
35
44
|
#
|
36
|
-
#
|
45
|
+
# ## Special default handling
|
37
46
|
#
|
38
47
|
# The key "__default__" in the hash is treated specially. If set to a string,
|
39
48
|
# that string will be returned by the TranslationMap for any input not otherwise
|
@@ -50,19 +59,20 @@ module Traject
|
|
50
59
|
# TranslationMap.new("something", :default => "foo")
|
51
60
|
# TranslationMap.new("something", :default => :passthrough)
|
52
61
|
#
|
53
|
-
#
|
62
|
+
# ## Output: String or array of strings
|
54
63
|
#
|
55
64
|
# The output can be a string or an array of strings, or nil. It should not be anything else.
|
56
65
|
# When used with the #translate_array! method, one string can be replaced by multiple values
|
57
66
|
# (array of strings) or removed (nil)
|
58
67
|
#
|
59
|
-
# There's no way to specify multiple return values in a .properties, use .yaml or .rb for that.
|
68
|
+
# There's no way to specify multiple return values in a .properties file; use .yaml or .rb for that.
|
69
|
+
#
|
70
|
+
# ## Caching
|
60
71
|
#
|
61
|
-
# == Caching
|
62
72
|
# Lookup and loading of configuration files will be cached, for efficiency.
|
63
73
|
# You can reset with `TranslationMap.reset_cache!`
|
64
74
|
#
|
65
|
-
#
|
75
|
+
# ## YAML example:
|
66
76
|
#
|
67
77
|
# key: value
|
68
78
|
# key2: value2 multiple words fine
|
@@ -71,13 +81,35 @@ module Traject
|
|
71
81
|
# - array
|
72
82
|
# - of
|
73
83
|
# - values look like this
|
84
|
+
#
|
85
|
+
# ## Alternatives
|
86
|
+
# `Traject::TranslationMap` provides an easy way to deal with the most common translation case:
|
87
|
+
# simple key-value stores with optional default values.
|
88
|
+
#
|
89
|
+
# If you need more complex translation, you can simply use `#map!`
|
90
|
+
# or its kin to work on the `accumulator` in a block
|
91
|
+
#
|
92
|
+
#
|
93
|
+
#
|
94
|
+
# # get a lousy language detection of any vernacular title
|
95
|
+
# require 'whatlanguage'
|
96
|
+
# wl = WhatLanguage.new(:all)
|
97
|
+
# to_field 'vernacular_language', extract_marc('245', :alternate_script=>:only) do |rec, acc|
|
98
|
+
# # accumulator is already filled with the values of any 880s that reference a 245 because
|
99
|
+
# # of the call to #extract_marc
|
100
|
+
# acc.map! {|x| wl.language(x) }
|
101
|
+
# acc.uniq!
|
102
|
+
# end
|
103
|
+
# Within the block, you may also be interested in using:
|
104
|
+
# * a case-insensitive hash, perhaps like [this one](https://github.com/junegunn/insensitive_hash)
|
105
|
+
# * a [MatchMap](https://github.com/billdueber/match_map), which implements pattern-matching logic similar to solrmarc's pattern files
|
74
106
|
class TranslationMap
|
75
107
|
class Cache
|
76
108
|
def initialize
|
77
|
-
@cached = Hash.new
|
109
|
+
@cached = Hash.new
|
78
110
|
end
|
79
111
|
|
80
|
-
# Returns an actual Hash -- or nil if none found.
|
112
|
+
# Returns an actual Hash -- or nil if none found.
|
81
113
|
def lookup(path)
|
82
114
|
unless @cached.has_key?(path)
|
83
115
|
@cached[path] = _lookup!(path)
|
@@ -86,9 +118,9 @@ module Traject
|
|
86
118
|
end
|
87
119
|
|
88
120
|
# force lookup, without using cache.
|
89
|
-
# used by cache. Returns the actual hash.
|
90
|
-
# Returns nil if none found.
|
91
|
-
# May raise on syntax error in file being loaded.
|
121
|
+
# used by cache. Returns the actual hash.
|
122
|
+
# Returns nil if none found.
|
123
|
+
# May raise on syntax error in file being loaded.
|
92
124
|
def _lookup!(path)
|
93
125
|
found = nil
|
94
126
|
|
@@ -110,7 +142,7 @@ module Traject
|
|
110
142
|
end
|
111
143
|
|
112
144
|
# Cached hash can't be mutated without weird consequences, let's
|
113
|
-
# freeze it!
|
145
|
+
# freeze it!
|
114
146
|
found.freeze if found
|
115
147
|
|
116
148
|
return found
|
@@ -163,20 +195,20 @@ module Traject
|
|
163
195
|
alias_method :map, :[]
|
164
196
|
|
165
197
|
# Returns a dup of internal hash, dup so you can modify it
|
166
|
-
# if you like.
|
198
|
+
# if you like.
|
167
199
|
def to_hash
|
168
200
|
@hash.dup
|
169
201
|
end
|
170
202
|
|
171
203
|
# Run every element of an array through this translation map,
|
172
204
|
# return the resulting array. If translation map returns nil,
|
173
|
-
# original element will be missing from output.
|
205
|
+
# original element will be missing from output.
|
174
206
|
#
|
175
207
|
# If an input maps to an array, each element of the array will be flattened
|
176
|
-
# into the output.
|
208
|
+
# into the output.
|
177
209
|
#
|
178
210
|
# If an input maps to nil, it will cause the input element to be removed
|
179
|
-
# entirely.
|
211
|
+
# entirely.
|
180
212
|
def translate_array(array)
|
181
213
|
array.each_with_object([]) do |input_element, output_array|
|
182
214
|
output_element = self.map(input_element)
|
@@ -200,7 +232,7 @@ module Traject
|
|
200
232
|
|
201
233
|
protected
|
202
234
|
|
203
|
-
# No built-in way to read java-style .properties, we hack it.
|
235
|
+
# No built-in way to read java-style .properties, we hack it.
|
204
236
|
# inspired by various hacky things found google ruby java properties parse
|
205
237
|
# .properties spec seems to be:
|
206
238
|
# http://docs.oracle.com/javase/6/docs/api/java/util/Properties.html#load%28java.io.Reader%29
|
@@ -213,10 +245,10 @@ module Traject
|
|
213
245
|
f.each_line do |line|
|
214
246
|
i += 1
|
215
247
|
|
216
|
-
line.strip!
|
248
|
+
line.strip!
|
217
249
|
|
218
250
|
# skip blank lines
|
219
|
-
next if line.empty?
|
251
|
+
next if line.empty?
|
220
252
|
|
221
253
|
# skip comment lines
|
222
254
|
next if line =~ /^\s*[!\#].*$/
|
data/lib/traject/util.rb
CHANGED
@@ -73,9 +73,9 @@ module Traject
|
|
73
73
|
end
|
74
74
|
|
75
75
|
# just does a `require 'java'` but rescues the exception if we
|
76
|
-
# aren't jruby, and raises a better error message.
|
76
|
+
# aren't jruby, and raises a better error message.
|
77
77
|
# Pass in a developer-presentable name of a feature to include in the error
|
78
|
-
# message if you want.
|
78
|
+
# message if you want.
|
79
79
|
def self.jruby_ensure_init!(feature = nil)
|
80
80
|
begin
|
81
81
|
require 'java'
|
@@ -89,4 +89,4 @@ module Traject
|
|
89
89
|
end
|
90
90
|
|
91
91
|
end
|
92
|
-
end
|
92
|
+
end
|
data/lib/traject/version.rb
CHANGED
data/lib/traject/yaml_writer.rb
CHANGED
data/test/debug_writer_test.rb
CHANGED
@@ -15,11 +15,11 @@ describe 'Simple output' do
|
|
15
15
|
end
|
16
16
|
@io = StringIO.new
|
17
17
|
@writer = Traject::DebugWriter.new("output_stream" => @io)
|
18
|
-
|
18
|
+
|
19
19
|
@id = "2710183"
|
20
20
|
@title = "Manufacturing consent : the political economy of the mass media /"
|
21
21
|
end
|
22
|
-
|
22
|
+
|
23
23
|
it "does a simple output" do
|
24
24
|
@writer.put Traject::Indexer::Context.new(:output_hash => @indexer.map_record(@record))
|
25
25
|
expected = [
|
@@ -29,10 +29,10 @@ describe 'Simple output' do
|
|
29
29
|
]
|
30
30
|
assert_equal expected.join("\n").gsub(/\s/, ''), @io.string.gsub(/\s/, '')
|
31
31
|
@writer.close
|
32
|
-
|
32
|
+
|
33
33
|
end
|
34
|
-
|
34
|
+
|
35
35
|
end
|
36
|
-
|
37
|
-
|
38
|
-
|
36
|
+
|
37
|
+
|
38
|
+
|
@@ -42,18 +42,18 @@ describe "Traject::Indexer#each_record" do
|
|
42
42
|
flunk("Should only fail with a ArityError")
|
43
43
|
end
|
44
44
|
end
|
45
|
-
|
45
|
+
|
46
46
|
it "rejects each_record with a name (e.g., using a to_field syntax)" do
|
47
47
|
assert_raises(Traject::Indexer::NamingError) do
|
48
48
|
@indexer.each_record('bad_name') {|one, two| }
|
49
49
|
end
|
50
50
|
end
|
51
|
-
|
51
|
+
|
52
52
|
it "reject each_record with no arguments/blocks at all" do
|
53
53
|
assert_raises(ArgumentError) do
|
54
54
|
@indexer.each_record()
|
55
55
|
end
|
56
|
-
end
|
56
|
+
end
|
57
57
|
|
58
58
|
end
|
59
|
-
end
|
59
|
+
end
|
@@ -198,7 +198,7 @@ describe "Traject::Macros::Marc21Semantics" do
|
|
198
198
|
@record = MARC::Reader.new(support_file_path "multi_geo.marc").to_a.first
|
199
199
|
output = @indexer.map_record(@record)
|
200
200
|
|
201
|
-
assert_equal ["Europe", "Middle East", "Africa, North", "Agora (Athens, Greece)", "Rome (Italy)", "Italy"],
|
201
|
+
assert_equal ["Europe", "Middle East", "Africa, North", "Agora (Athens, Greece)", "Rome (Italy)", "Italy"],
|
202
202
|
output["geo_facet"]
|
203
203
|
end
|
204
204
|
it "maps nothing on a record with no geo" do
|
@@ -221,12 +221,12 @@ describe "Traject::Macros::Marc21Semantics" do
|
|
221
221
|
end
|
222
222
|
|
223
223
|
end
|
224
|
-
|
224
|
+
|
225
225
|
describe "extract_marc_filing_version" do
|
226
226
|
before do
|
227
227
|
@record = MARC::Reader.new(support_file_path "the_business_ren.marc").to_a.first
|
228
228
|
end
|
229
|
-
|
229
|
+
|
230
230
|
it "works as expected" do
|
231
231
|
@indexer.instance_eval do
|
232
232
|
to_field 'title_phrase', extract_marc_filing_version('245ab')
|
@@ -234,7 +234,7 @@ describe "Traject::Macros::Marc21Semantics" do
|
|
234
234
|
output = @indexer.map_record(@record)
|
235
235
|
assert_equal ['Business renaissance quarterly'], output['title_phrase']
|
236
236
|
end
|
237
|
-
|
237
|
+
|
238
238
|
it "works with :include_original" do
|
239
239
|
@indexer.instance_eval do
|
240
240
|
to_field 'title_phrase', extract_marc_filing_version('245ab', :include_original=>true)
|
@@ -242,7 +242,7 @@ describe "Traject::Macros::Marc21Semantics" do
|
|
242
242
|
output = @indexer.map_record(@record)
|
243
243
|
assert_equal ['The Business renaissance quarterly', 'Business renaissance quarterly'], output['title_phrase']
|
244
244
|
end
|
245
|
-
|
245
|
+
|
246
246
|
it "doesn't do anything if you don't include the first subfield" do
|
247
247
|
@indexer.instance_eval do
|
248
248
|
to_field 'title_phrase', extract_marc_filing_version('245h')
|
@@ -250,8 +250,8 @@ describe "Traject::Macros::Marc21Semantics" do
|
|
250
250
|
output = @indexer.map_record(@record)
|
251
251
|
assert_equal ['[electronic resource].'], output['title_phrase']
|
252
252
|
end
|
253
|
-
|
254
|
-
|
253
|
+
|
254
|
+
|
255
255
|
it "dies if you pass it something else" do
|
256
256
|
assert_raises(RuntimeError) do
|
257
257
|
@indexer.instance_eval do
|
@@ -259,10 +259,10 @@ describe "Traject::Macros::Marc21Semantics" do
|
|
259
259
|
end
|
260
260
|
end
|
261
261
|
end
|
262
|
-
|
263
|
-
|
262
|
+
|
263
|
+
|
264
264
|
end
|
265
|
-
|
266
|
-
|
267
265
|
|
268
|
-
|
266
|
+
|
267
|
+
|
268
|
+
end
|
@@ -56,22 +56,22 @@ describe "Traject::Macros::Marc21" do
|
|
56
56
|
|
57
57
|
assert_equal ["DEFAULT VALUE"], output["only_default"]
|
58
58
|
end
|
59
|
-
|
59
|
+
|
60
60
|
it "de-duplicates by default, respects :allow_duplicates" do
|
61
61
|
# Add a second 008
|
62
62
|
f = @record.fields('008').first
|
63
63
|
@record.append(f)
|
64
|
-
|
64
|
+
|
65
65
|
@indexer.instance_eval do
|
66
66
|
to_field "lang1", extract_marc('008[35-37]')
|
67
|
-
to_field "lang2", extract_marc('008[35-37]', :allow_duplicates=>true)
|
67
|
+
to_field "lang2", extract_marc('008[35-37]', :allow_duplicates=>true)
|
68
68
|
end
|
69
|
-
|
69
|
+
|
70
70
|
output = @indexer.map_record(@record)
|
71
71
|
assert_equal ["eng"], output['lang1']
|
72
|
-
assert_equal ["eng", "eng"], output['lang2']
|
72
|
+
assert_equal ["eng", "eng"], output['lang2']
|
73
73
|
end
|
74
|
-
|
74
|
+
|
75
75
|
it "fails on an extra/misspelled argument to extract_marc" do
|
76
76
|
assert_raises(RuntimeError) do
|
77
77
|
@indexer.instance_eval do
|
@@ -79,9 +79,9 @@ describe "Traject::Macros::Marc21" do
|
|
79
79
|
end
|
80
80
|
end
|
81
81
|
end
|
82
|
-
|
83
|
-
|
84
|
-
|
82
|
+
|
83
|
+
|
84
|
+
|
85
85
|
|
86
86
|
it "Marc21::trim_punctuation class method" do
|
87
87
|
assert_equal "one two three", Marc21.trim_punctuation("one two three")
|
@@ -177,4 +177,4 @@ describe "Traject::Macros::Marc21" do
|
|
177
177
|
end
|
178
178
|
|
179
179
|
|
180
|
-
end
|
180
|
+
end
|
data/test/indexer/macros_test.rb
CHANGED
@@ -184,21 +184,21 @@ describe "Traject::Indexer#map_record" do
|
|
184
184
|
|
185
185
|
assert called, "Called mapping routine"
|
186
186
|
end
|
187
|
-
|
187
|
+
|
188
188
|
it "skips records" do
|
189
|
-
|
189
|
+
|
190
190
|
@indexer.to_field("beforeSkip") do |rec, acc|
|
191
191
|
acc << "Before"
|
192
192
|
end
|
193
|
-
|
193
|
+
|
194
194
|
@indexer.to_field('radical') do |rec, acc, context|
|
195
195
|
context.skip!("Chomsky!") if rec['245'].to_s =~ /Chomsky/
|
196
196
|
end
|
197
|
-
|
197
|
+
|
198
198
|
@indexer.to_field('afterSkip') do |rec, acc|
|
199
199
|
acc << "After. Should never happen"
|
200
200
|
end
|
201
|
-
|
201
|
+
|
202
202
|
output = @indexer.map_record(@record)
|
203
203
|
assert_equal ['Before'], output['beforeSkip']
|
204
204
|
assert_nil output['afterSkip']
|
@@ -206,4 +206,4 @@ describe "Traject::Indexer#map_record" do
|
|
206
206
|
|
207
207
|
end
|
208
208
|
|
209
|
-
end
|
209
|
+
end
|
@@ -27,9 +27,9 @@ describe "Traject::Indexer#process" do
|
|
27
27
|
end
|
28
28
|
|
29
29
|
it "works" do
|
30
|
-
# oops, this times_called counter isn't thread-safe under multi-threading
|
30
|
+
# oops, this times_called counter isn't thread-safe under multi-threading
|
31
31
|
# is why this fails sometimes.
|
32
|
-
# fixed to be single-threaded for these tests.
|
32
|
+
# fixed to be single-threaded for these tests.
|
33
33
|
times_called = 0
|
34
34
|
@indexer.to_field("title") do |record, accumulator, context|
|
35
35
|
times_called += 1
|
@@ -68,7 +68,7 @@ describe "Traject::Indexer#process" do
|
|
68
68
|
"solr.url" => "http://example.org",
|
69
69
|
"writer_class_name" => "Traject::SolrJWriter"
|
70
70
|
)
|
71
|
-
@file = File.open(support_file_path "manufacturing_consent.marc")
|
71
|
+
@file = File.open(support_file_path "manufacturing_consent.marc")
|
72
72
|
|
73
73
|
|
74
74
|
@indexer.to_field("id") do |record, accumulator|
|
@@ -80,5 +80,44 @@ describe "Traject::Indexer#process" do
|
|
80
80
|
assert ! return_value, "returns false on skipped record errors"
|
81
81
|
end
|
82
82
|
|
83
|
+
require 'traject/null_writer'
|
84
|
+
it "calls after_processing after processing" do
|
85
|
+
@indexer = Traject::Indexer.new(
|
86
|
+
"solrj_writer.server_class_name" => "MockSolrServer",
|
87
|
+
"solr.url" => "http://example.org",
|
88
|
+
"writer_class_name" => "Traject::NullWriter"
|
89
|
+
)
|
90
|
+
@file = File.open(support_file_path "test_data.utf8.mrc")
|
91
|
+
|
92
|
+
called = []
|
93
|
+
|
94
|
+
@indexer.after_processing do
|
95
|
+
called << :one
|
96
|
+
end
|
97
|
+
@indexer.after_processing do
|
98
|
+
called << :two
|
99
|
+
end
|
100
|
+
|
101
|
+
@indexer.process(@file)
|
102
|
+
|
103
|
+
assert_equal [:one, :two], called, "Both after_processing hooks called, in order"
|
104
|
+
end
|
105
|
+
|
106
|
+
describe "demo_config.rb" do
|
107
|
+
before do
|
108
|
+
@indexer = Traject::Indexer.new(
|
109
|
+
"solrj_writer.server_class_name" => "MockSolrServer",
|
110
|
+
"solr.url" => "http://example.org",
|
111
|
+
"writer_class_name" => "Traject::NullWriter"
|
112
|
+
)
|
113
|
+
end
|
114
|
+
|
115
|
+
it "parses and loads" do
|
116
|
+
conf_path = support_file_path "demo_config.rb"
|
117
|
+
File.open(conf_path) do |file_io|
|
118
|
+
@indexer.instance_eval(file_io.read, conf_path)
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
83
122
|
|
84
|
-
end
|
123
|
+
end
|