traject 0.16.0 → 0.17.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +1 -0
- data/README.md +183 -191
- data/bench/bench.rb +1 -1
- data/doc/batch_execution.md +14 -0
- data/doc/extending.md +14 -12
- data/doc/indexing_rules.md +265 -0
- data/lib/traject/command_line.rb +12 -41
- data/lib/traject/debug_writer.rb +32 -13
- data/lib/traject/indexer.rb +101 -24
- data/lib/traject/indexer/settings.rb +18 -17
- data/lib/traject/json_writer.rb +32 -11
- data/lib/traject/line_writer.rb +6 -6
- data/lib/traject/macros/basic.rb +1 -1
- data/lib/traject/macros/marc21.rb +17 -13
- data/lib/traject/macros/marc21_semantics.rb +27 -25
- data/lib/traject/macros/marc_format_classifier.rb +39 -25
- data/lib/traject/marc4j_reader.rb +36 -22
- data/lib/traject/marc_extractor.rb +79 -75
- data/lib/traject/marc_reader.rb +33 -25
- data/lib/traject/mock_reader.rb +9 -10
- data/lib/traject/ndj_reader.rb +7 -7
- data/lib/traject/null_writer.rb +1 -1
- data/lib/traject/qualified_const_get.rb +12 -2
- data/lib/traject/solrj_writer.rb +61 -52
- data/lib/traject/thread_pool.rb +45 -45
- data/lib/traject/translation_map.rb +59 -27
- data/lib/traject/util.rb +3 -3
- data/lib/traject/version.rb +1 -1
- data/lib/traject/yaml_writer.rb +1 -1
- data/test/debug_writer_test.rb +7 -7
- data/test/indexer/each_record_test.rb +4 -4
- data/test/indexer/macros_marc21_semantics_test.rb +12 -12
- data/test/indexer/macros_marc21_test.rb +10 -10
- data/test/indexer/macros_test.rb +1 -1
- data/test/indexer/map_record_test.rb +6 -6
- data/test/indexer/read_write_test.rb +43 -4
- data/test/indexer/settings_test.rb +2 -2
- data/test/indexer/to_field_test.rb +8 -8
- data/test/marc4j_reader_test.rb +4 -4
- data/test/marc_extractor_test.rb +33 -25
- data/test/marc_format_classifier_test.rb +3 -3
- data/test/marc_reader_test.rb +2 -2
- data/test/test_helper.rb +3 -3
- data/test/test_support/demo_config.rb +52 -48
- data/test/translation_map_test.rb +22 -4
- data/test/translation_maps/bad_ruby.rb +2 -2
- data/test/translation_maps/both_map.rb +1 -1
- data/test/translation_maps/default_literal.rb +1 -1
- data/test/translation_maps/default_passthrough.rb +1 -1
- data/test/translation_maps/ruby_map.rb +1 -1
- metadata +7 -31
- data/doc/macros.md +0 -103
@@ -13,19 +13,28 @@ module Traject
|
|
13
13
|
# or array of strings.
|
14
14
|
#
|
15
15
|
# What makes it more useful than a stunted hash is its ability to load
|
16
|
-
# the hash definitions from configuration files, either pure ruby,
|
17
|
-
# yaml, or java .properties
|
18
|
-
# no = or : in key names, no split lines.)
|
16
|
+
# the hash definitions from configuration files, either pure ruby,
|
17
|
+
# yaml, or (limited subset of) java .properties file.
|
19
18
|
#
|
20
|
-
#
|
19
|
+
# traject's `extract_marc` macro allows you to specify a :translation_map=>filename argument
|
20
|
+
# that will automatically find and use a translation map on the resulting data:
|
21
21
|
#
|
22
|
-
#
|
23
|
-
#
|
24
|
-
#
|
22
|
+
# extract_marc("040a", :translation_map => "languages")
|
23
|
+
#
|
24
|
+
# Or you can always create one yourself and use it how you like:
|
25
|
+
#
|
26
|
+
# map = TranslationMap.new("languages")
|
27
|
+
#
|
28
|
+
# In either case, TranslationMap will look for a file named, in that example,
|
29
|
+
# `languages.rb` or `languages.yaml` or `languages.properties`,
|
30
|
+
# somewhere in the ruby $LOAD_PATH in a `/translation_maps` subdir.
|
31
|
+
#
|
32
|
+
# * Also looks for "/translation_maps" subdir in load paths, so
|
25
33
|
# for instance you can have a gem that keeps translation maps
|
26
|
-
# in ./lib/translation_maps, and it Just Works.
|
34
|
+
# in ./lib/translation_maps, and it Just Works.
|
35
|
+
#
|
27
36
|
# * Note you do NOT supply the .rb, .yaml, or .properties suffix yourself,
|
28
|
-
#
|
37
|
+
# it'll use whichever it finds (allows calling code to not care which is used).
|
29
38
|
#
|
30
39
|
# Ruby files just need to have their last line eval to a hash. The file
|
31
40
|
# will be run through `eval`, don't do it with untrusted content (naturally)
|
@@ -33,7 +42,7 @@ module Traject
|
|
33
42
|
# You can also pass in a Hash for consistency to TranslationMap.new, although
|
34
43
|
# I don't know why you'd want to.
|
35
44
|
#
|
36
|
-
#
|
45
|
+
# ## Special default handling
|
37
46
|
#
|
38
47
|
# The key "__default__" in the hash is treated specially. If set to a string,
|
39
48
|
# that string will be returned by the TranslationMap for any input not otherwise
|
@@ -50,19 +59,20 @@ module Traject
|
|
50
59
|
# TranslationMap.new("something", :default => "foo")
|
51
60
|
# TranslationMap.new("something", :default => :passthrough)
|
52
61
|
#
|
53
|
-
#
|
62
|
+
# ## Output: String or array of strings
|
54
63
|
#
|
55
64
|
# The output can be a string or an array of strings, or nil. It should not be anything else.
|
56
65
|
# When used with the #translate_array! method, one string can be replaced by multiple values
|
57
66
|
# (array of strings) or removed (nil)
|
58
67
|
#
|
59
|
-
# There's no way to specify multiple return values in a .properties, use .yaml or .rb for that.
|
68
|
+
# There's no way to specify multiple return values in a .properties, use .yaml or .rb for that.
|
69
|
+
#
|
70
|
+
# ## Caching
|
60
71
|
#
|
61
|
-
# == Caching
|
62
72
|
# Lookup and loading of configuration files will be cached, for efficiency.
|
63
73
|
# You can reset with `TranslationMap.reset_cache!`
|
64
74
|
#
|
65
|
-
#
|
75
|
+
# ## YAML example:
|
66
76
|
#
|
67
77
|
# key: value
|
68
78
|
# key2: value2 multiple words fine
|
@@ -71,13 +81,35 @@ module Traject
|
|
71
81
|
# - array
|
72
82
|
# - of
|
73
83
|
# - values look like this
|
84
|
+
#
|
85
|
+
# ## Alternatives
|
86
|
+
# `Traject::TranslationMap` provides an easy way to deal with the most common translation case:
|
87
|
+
# simple key-value stores with optional default values.
|
88
|
+
#
|
89
|
+
# If you need more complex translation, you can simply use `#map!`
|
90
|
+
# or its kin to work on the `accumulator` in a block
|
91
|
+
#
|
92
|
+
#
|
93
|
+
#
|
94
|
+
# # get a lousy language detection of any vernacular title
|
95
|
+
# require 'whatlanguage'
|
96
|
+
# wl = WhatLanguage.new(:all)
|
97
|
+
# to_field 'vernacular_language', extract_marc('245', :alternate_script=>:only) do |rec, acc|
|
98
|
+
# # accumulator is already filled with the values of any 880s that reference a 245 because
|
99
|
+
# # of the call to #extract_marc
|
100
|
+
# acc.map! {|x| wl.language(x) }
|
101
|
+
# acc.uniq!
|
102
|
+
# end
|
103
|
+
# Within the block, you may also be interested in using:
|
104
|
+
# * a case-insensitive hash, perhaps like [this one](https://github.com/junegunn/insensitive_hash)
|
105
|
+
# * a [MatchMap](https://github.com/billdueber/match_map), which implements pattern-matching logic similar to solrmarc's pattern files
|
74
106
|
class TranslationMap
|
75
107
|
class Cache
|
76
108
|
def initialize
|
77
|
-
@cached = Hash.new
|
109
|
+
@cached = Hash.new
|
78
110
|
end
|
79
111
|
|
80
|
-
# Returns an actual Hash -- or nil if none found.
|
112
|
+
# Returns an actual Hash -- or nil if none found.
|
81
113
|
def lookup(path)
|
82
114
|
unless @cached.has_key?(path)
|
83
115
|
@cached[path] = _lookup!(path)
|
@@ -86,9 +118,9 @@ module Traject
|
|
86
118
|
end
|
87
119
|
|
88
120
|
# force lookup, without using cache.
|
89
|
-
# used by cache. Returns the actual hash.
|
90
|
-
# Returns nil if none found.
|
91
|
-
# May raise on syntax error in file being loaded.
|
121
|
+
# used by cache. Returns the actual hash.
|
122
|
+
# Returns nil if none found.
|
123
|
+
# May raise on syntax error in file being loaded.
|
92
124
|
def _lookup!(path)
|
93
125
|
found = nil
|
94
126
|
|
@@ -110,7 +142,7 @@ module Traject
|
|
110
142
|
end
|
111
143
|
|
112
144
|
# Cached hash can't be mutated without weird consequences, let's
|
113
|
-
# freeze it!
|
145
|
+
# freeze it!
|
114
146
|
found.freeze if found
|
115
147
|
|
116
148
|
return found
|
@@ -163,20 +195,20 @@ module Traject
|
|
163
195
|
alias_method :map, :[]
|
164
196
|
|
165
197
|
# Returns a dup of internal hash, dup so you can modify it
|
166
|
-
# if you like.
|
198
|
+
# if you like.
|
167
199
|
def to_hash
|
168
200
|
@hash.dup
|
169
201
|
end
|
170
202
|
|
171
203
|
# Run every element of an array through this translation map,
|
172
204
|
# return the resulting array. If translation map returns nil,
|
173
|
-
# original element will be missing from output.
|
205
|
+
# original element will be missing from output.
|
174
206
|
#
|
175
207
|
# If an input maps to an array, each element of the array will be flattened
|
176
|
-
# into the output.
|
208
|
+
# into the output.
|
177
209
|
#
|
178
210
|
# If an input maps to nil, it will cause the input element to be removed
|
179
|
-
# entirely.
|
211
|
+
# entirely.
|
180
212
|
def translate_array(array)
|
181
213
|
array.each_with_object([]) do |input_element, output_array|
|
182
214
|
output_element = self.map(input_element)
|
@@ -200,7 +232,7 @@ module Traject
|
|
200
232
|
|
201
233
|
protected
|
202
234
|
|
203
|
-
# No built-in way to read java-style .properties, we hack it.
|
235
|
+
# No built-in way to read java-style .properties, we hack it.
|
204
236
|
# inspired by various hacky things found by googling "ruby java properties parse"
|
205
237
|
# .properties spec seems to be:
|
206
238
|
# http://docs.oracle.com/javase/6/docs/api/java/util/Properties.html#load%28java.io.Reader%29
|
@@ -213,10 +245,10 @@ module Traject
|
|
213
245
|
f.each_line do |line|
|
214
246
|
i += 1
|
215
247
|
|
216
|
-
line.strip!
|
248
|
+
line.strip!
|
217
249
|
|
218
250
|
# skip blank lines
|
219
|
-
next if line.empty?
|
251
|
+
next if line.empty?
|
220
252
|
|
221
253
|
# skip comment lines
|
222
254
|
next if line =~ /^\s*[!\#].*$/
|
data/lib/traject/util.rb
CHANGED
@@ -73,9 +73,9 @@ module Traject
|
|
73
73
|
end
|
74
74
|
|
75
75
|
# just does a `require 'java'` but rescues the exception if we
|
76
|
-
# aren't jruby, and raises a better error message.
|
76
|
+
# aren't jruby, and raises a better error message.
|
77
77
|
# Pass in a developer-presentable name of a feature to include in the error
|
78
|
-
# message if you want.
|
78
|
+
# message if you want.
|
79
79
|
def self.jruby_ensure_init!(feature = nil)
|
80
80
|
begin
|
81
81
|
require 'java'
|
@@ -89,4 +89,4 @@ module Traject
|
|
89
89
|
end
|
90
90
|
|
91
91
|
end
|
92
|
-
end
|
92
|
+
end
|
data/lib/traject/version.rb
CHANGED
data/lib/traject/yaml_writer.rb
CHANGED
data/test/debug_writer_test.rb
CHANGED
@@ -15,11 +15,11 @@ describe 'Simple output' do
|
|
15
15
|
end
|
16
16
|
@io = StringIO.new
|
17
17
|
@writer = Traject::DebugWriter.new("output_stream" => @io)
|
18
|
-
|
18
|
+
|
19
19
|
@id = "2710183"
|
20
20
|
@title = "Manufacturing consent : the political economy of the mass media /"
|
21
21
|
end
|
22
|
-
|
22
|
+
|
23
23
|
it "does a simple output" do
|
24
24
|
@writer.put Traject::Indexer::Context.new(:output_hash => @indexer.map_record(@record))
|
25
25
|
expected = [
|
@@ -29,10 +29,10 @@ describe 'Simple output' do
|
|
29
29
|
]
|
30
30
|
assert_equal expected.join("\n").gsub(/\s/, ''), @io.string.gsub(/\s/, '')
|
31
31
|
@writer.close
|
32
|
-
|
32
|
+
|
33
33
|
end
|
34
|
-
|
34
|
+
|
35
35
|
end
|
36
|
-
|
37
|
-
|
38
|
-
|
36
|
+
|
37
|
+
|
38
|
+
|
@@ -42,18 +42,18 @@ describe "Traject::Indexer#each_record" do
|
|
42
42
|
flunk("Should only fail with a ArityError")
|
43
43
|
end
|
44
44
|
end
|
45
|
-
|
45
|
+
|
46
46
|
it "rejects each_record with a name (e.g., using a to_field syntax)" do
|
47
47
|
assert_raises(Traject::Indexer::NamingError) do
|
48
48
|
@indexer.each_record('bad_name') {|one, two| }
|
49
49
|
end
|
50
50
|
end
|
51
|
-
|
51
|
+
|
52
52
|
it "reject each_record with no arguments/blocks at all" do
|
53
53
|
assert_raises(ArgumentError) do
|
54
54
|
@indexer.each_record()
|
55
55
|
end
|
56
|
-
end
|
56
|
+
end
|
57
57
|
|
58
58
|
end
|
59
|
-
end
|
59
|
+
end
|
@@ -198,7 +198,7 @@ describe "Traject::Macros::Marc21Semantics" do
|
|
198
198
|
@record = MARC::Reader.new(support_file_path "multi_geo.marc").to_a.first
|
199
199
|
output = @indexer.map_record(@record)
|
200
200
|
|
201
|
-
assert_equal ["Europe", "Middle East", "Africa, North", "Agora (Athens, Greece)", "Rome (Italy)", "Italy"],
|
201
|
+
assert_equal ["Europe", "Middle East", "Africa, North", "Agora (Athens, Greece)", "Rome (Italy)", "Italy"],
|
202
202
|
output["geo_facet"]
|
203
203
|
end
|
204
204
|
it "maps nothing on a record with no geo" do
|
@@ -221,12 +221,12 @@ describe "Traject::Macros::Marc21Semantics" do
|
|
221
221
|
end
|
222
222
|
|
223
223
|
end
|
224
|
-
|
224
|
+
|
225
225
|
describe "extract_marc_filing_version" do
|
226
226
|
before do
|
227
227
|
@record = MARC::Reader.new(support_file_path "the_business_ren.marc").to_a.first
|
228
228
|
end
|
229
|
-
|
229
|
+
|
230
230
|
it "works as expected" do
|
231
231
|
@indexer.instance_eval do
|
232
232
|
to_field 'title_phrase', extract_marc_filing_version('245ab')
|
@@ -234,7 +234,7 @@ describe "Traject::Macros::Marc21Semantics" do
|
|
234
234
|
output = @indexer.map_record(@record)
|
235
235
|
assert_equal ['Business renaissance quarterly'], output['title_phrase']
|
236
236
|
end
|
237
|
-
|
237
|
+
|
238
238
|
it "works with :include_original" do
|
239
239
|
@indexer.instance_eval do
|
240
240
|
to_field 'title_phrase', extract_marc_filing_version('245ab', :include_original=>true)
|
@@ -242,7 +242,7 @@ describe "Traject::Macros::Marc21Semantics" do
|
|
242
242
|
output = @indexer.map_record(@record)
|
243
243
|
assert_equal ['The Business renaissance quarterly', 'Business renaissance quarterly'], output['title_phrase']
|
244
244
|
end
|
245
|
-
|
245
|
+
|
246
246
|
it "doesn't do anything if you don't include the first subfield" do
|
247
247
|
@indexer.instance_eval do
|
248
248
|
to_field 'title_phrase', extract_marc_filing_version('245h')
|
@@ -250,8 +250,8 @@ describe "Traject::Macros::Marc21Semantics" do
|
|
250
250
|
output = @indexer.map_record(@record)
|
251
251
|
assert_equal ['[electronic resource].'], output['title_phrase']
|
252
252
|
end
|
253
|
-
|
254
|
-
|
253
|
+
|
254
|
+
|
255
255
|
it "dies if you pass it something else" do
|
256
256
|
assert_raises(RuntimeError) do
|
257
257
|
@indexer.instance_eval do
|
@@ -259,10 +259,10 @@ describe "Traject::Macros::Marc21Semantics" do
|
|
259
259
|
end
|
260
260
|
end
|
261
261
|
end
|
262
|
-
|
263
|
-
|
262
|
+
|
263
|
+
|
264
264
|
end
|
265
|
-
|
266
|
-
|
267
265
|
|
268
|
-
|
266
|
+
|
267
|
+
|
268
|
+
end
|
@@ -56,22 +56,22 @@ describe "Traject::Macros::Marc21" do
|
|
56
56
|
|
57
57
|
assert_equal ["DEFAULT VALUE"], output["only_default"]
|
58
58
|
end
|
59
|
-
|
59
|
+
|
60
60
|
it "de-duplicates by default, respects :allow_duplicates" do
|
61
61
|
# Add a second 008
|
62
62
|
f = @record.fields('008').first
|
63
63
|
@record.append(f)
|
64
|
-
|
64
|
+
|
65
65
|
@indexer.instance_eval do
|
66
66
|
to_field "lang1", extract_marc('008[35-37]')
|
67
|
-
to_field "lang2", extract_marc('008[35-37]', :allow_duplicates=>true)
|
67
|
+
to_field "lang2", extract_marc('008[35-37]', :allow_duplicates=>true)
|
68
68
|
end
|
69
|
-
|
69
|
+
|
70
70
|
output = @indexer.map_record(@record)
|
71
71
|
assert_equal ["eng"], output['lang1']
|
72
|
-
assert_equal ["eng", "eng"], output['lang2']
|
72
|
+
assert_equal ["eng", "eng"], output['lang2']
|
73
73
|
end
|
74
|
-
|
74
|
+
|
75
75
|
it "fails on an extra/misspelled argument to extract_marc" do
|
76
76
|
assert_raises(RuntimeError) do
|
77
77
|
@indexer.instance_eval do
|
@@ -79,9 +79,9 @@ describe "Traject::Macros::Marc21" do
|
|
79
79
|
end
|
80
80
|
end
|
81
81
|
end
|
82
|
-
|
83
|
-
|
84
|
-
|
82
|
+
|
83
|
+
|
84
|
+
|
85
85
|
|
86
86
|
it "Marc21::trim_punctuation class method" do
|
87
87
|
assert_equal "one two three", Marc21.trim_punctuation("one two three")
|
@@ -177,4 +177,4 @@ describe "Traject::Macros::Marc21" do
|
|
177
177
|
end
|
178
178
|
|
179
179
|
|
180
|
-
end
|
180
|
+
end
|
data/test/indexer/macros_test.rb
CHANGED
@@ -184,21 +184,21 @@ describe "Traject::Indexer#map_record" do
|
|
184
184
|
|
185
185
|
assert called, "Called mapping routine"
|
186
186
|
end
|
187
|
-
|
187
|
+
|
188
188
|
it "skips records" do
|
189
|
-
|
189
|
+
|
190
190
|
@indexer.to_field("beforeSkip") do |rec, acc|
|
191
191
|
acc << "Before"
|
192
192
|
end
|
193
|
-
|
193
|
+
|
194
194
|
@indexer.to_field('radical') do |rec, acc, context|
|
195
195
|
context.skip!("Chomsky!") if rec['245'].to_s =~ /Chomsky/
|
196
196
|
end
|
197
|
-
|
197
|
+
|
198
198
|
@indexer.to_field('afterSkip') do |rec, acc|
|
199
199
|
acc << "After. Should never happen"
|
200
200
|
end
|
201
|
-
|
201
|
+
|
202
202
|
output = @indexer.map_record(@record)
|
203
203
|
assert_equal ['Before'], output['beforeSkip']
|
204
204
|
assert_nil output['afterSkip']
|
@@ -206,4 +206,4 @@ describe "Traject::Indexer#map_record" do
|
|
206
206
|
|
207
207
|
end
|
208
208
|
|
209
|
-
end
|
209
|
+
end
|
@@ -27,9 +27,9 @@ describe "Traject::Indexer#process" do
|
|
27
27
|
end
|
28
28
|
|
29
29
|
it "works" do
|
30
|
-
# oops, this times_called counter isn't thread-safe under multi-threading
|
30
|
+
# oops, this times_called counter isn't thread-safe under multi-threading
|
31
31
|
# is why this fails sometimes.
|
32
|
-
# fixed to be single-threaded for these tests.
|
32
|
+
# fixed to be single-threaded for these tests.
|
33
33
|
times_called = 0
|
34
34
|
@indexer.to_field("title") do |record, accumulator, context|
|
35
35
|
times_called += 1
|
@@ -68,7 +68,7 @@ describe "Traject::Indexer#process" do
|
|
68
68
|
"solr.url" => "http://example.org",
|
69
69
|
"writer_class_name" => "Traject::SolrJWriter"
|
70
70
|
)
|
71
|
-
@file = File.open(support_file_path "manufacturing_consent.marc")
|
71
|
+
@file = File.open(support_file_path "manufacturing_consent.marc")
|
72
72
|
|
73
73
|
|
74
74
|
@indexer.to_field("id") do |record, accumulator|
|
@@ -80,5 +80,44 @@ describe "Traject::Indexer#process" do
|
|
80
80
|
assert ! return_value, "returns false on skipped record errors"
|
81
81
|
end
|
82
82
|
|
83
|
+
require 'traject/null_writer'
|
84
|
+
it "calls after_processing after processing" do
|
85
|
+
@indexer = Traject::Indexer.new(
|
86
|
+
"solrj_writer.server_class_name" => "MockSolrServer",
|
87
|
+
"solr.url" => "http://example.org",
|
88
|
+
"writer_class_name" => "Traject::NullWriter"
|
89
|
+
)
|
90
|
+
@file = File.open(support_file_path "test_data.utf8.mrc")
|
91
|
+
|
92
|
+
called = []
|
93
|
+
|
94
|
+
@indexer.after_processing do
|
95
|
+
called << :one
|
96
|
+
end
|
97
|
+
@indexer.after_processing do
|
98
|
+
called << :two
|
99
|
+
end
|
100
|
+
|
101
|
+
@indexer.process(@file)
|
102
|
+
|
103
|
+
assert_equal [:one, :two], called, "Both after_processing hooks called, in order"
|
104
|
+
end
|
105
|
+
|
106
|
+
describe "demo_config.rb" do
|
107
|
+
before do
|
108
|
+
@indexer = Traject::Indexer.new(
|
109
|
+
"solrj_writer.server_class_name" => "MockSolrServer",
|
110
|
+
"solr.url" => "http://example.org",
|
111
|
+
"writer_class_name" => "Traject::NullWriter"
|
112
|
+
)
|
113
|
+
end
|
114
|
+
|
115
|
+
it "parses and loads" do
|
116
|
+
conf_path = support_file_path "demo_config.rb"
|
117
|
+
File.open(conf_path) do |file_io|
|
118
|
+
@indexer.instance_eval(file_io.read, conf_path)
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
83
122
|
|
84
|
-
end
|
123
|
+
end
|