traject 2.0.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.travis.yml +27 -0
- data/.yardopts +3 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +20 -0
- data/README.md +461 -0
- data/Rakefile +21 -0
- data/bench/bench.rb +30 -0
- data/bin/traject +16 -0
- data/doc/batch_execution.md +243 -0
- data/doc/extending.md +190 -0
- data/doc/indexing_rules.md +265 -0
- data/doc/other_commands.md +47 -0
- data/doc/settings.md +101 -0
- data/lib/tasks/load_maps.rake +48 -0
- data/lib/traject.rb +11 -0
- data/lib/traject/command_line.rb +301 -0
- data/lib/traject/csv_writer.rb +34 -0
- data/lib/traject/debug_writer.rb +47 -0
- data/lib/traject/delimited_writer.rb +110 -0
- data/lib/traject/indexer.rb +613 -0
- data/lib/traject/indexer/settings.rb +110 -0
- data/lib/traject/json_writer.rb +51 -0
- data/lib/traject/line_writer.rb +63 -0
- data/lib/traject/macros/basic.rb +9 -0
- data/lib/traject/macros/marc21.rb +223 -0
- data/lib/traject/macros/marc21_semantics.rb +584 -0
- data/lib/traject/macros/marc_format_classifier.rb +197 -0
- data/lib/traject/marc_extractor.rb +410 -0
- data/lib/traject/marc_reader.rb +89 -0
- data/lib/traject/mock_reader.rb +97 -0
- data/lib/traject/ndj_reader.rb +40 -0
- data/lib/traject/null_writer.rb +22 -0
- data/lib/traject/qualified_const_get.rb +40 -0
- data/lib/traject/solr_json_writer.rb +277 -0
- data/lib/traject/thread_pool.rb +161 -0
- data/lib/traject/translation_map.rb +267 -0
- data/lib/traject/util.rb +52 -0
- data/lib/traject/version.rb +3 -0
- data/lib/traject/yaml_writer.rb +9 -0
- data/lib/translation_maps/lcc_top_level.yaml +26 -0
- data/lib/translation_maps/marc_genre_007.yaml +9 -0
- data/lib/translation_maps/marc_genre_leader.yaml +22 -0
- data/lib/translation_maps/marc_geographic.yaml +589 -0
- data/lib/translation_maps/marc_instruments.yaml +102 -0
- data/lib/translation_maps/marc_languages.yaml +490 -0
- data/test/debug_writer_test.rb +38 -0
- data/test/delimited_writer_test.rb +104 -0
- data/test/indexer/each_record_test.rb +59 -0
- data/test/indexer/macros_marc21_semantics_test.rb +391 -0
- data/test/indexer/macros_marc21_test.rb +190 -0
- data/test/indexer/macros_test.rb +40 -0
- data/test/indexer/map_record_test.rb +209 -0
- data/test/indexer/read_write_test.rb +101 -0
- data/test/indexer/settings_test.rb +152 -0
- data/test/indexer/to_field_test.rb +77 -0
- data/test/marc_extractor_test.rb +412 -0
- data/test/marc_format_classifier_test.rb +98 -0
- data/test/marc_reader_test.rb +110 -0
- data/test/solr_json_writer_test.rb +248 -0
- data/test/test_helper.rb +90 -0
- data/test/test_support/245_no_ab.marc +1 -0
- data/test/test_support/880_with_no_6.utf8.marc +1 -0
- data/test/test_support/bad_subfield_code.marc +1 -0
- data/test/test_support/bad_utf_byte.utf8.marc +1 -0
- data/test/test_support/date_resort_to_260.marc +1 -0
- data/test/test_support/date_type_r_missing_date2.marc +1 -0
- data/test/test_support/date_with_u.marc +1 -0
- data/test/test_support/demo_config.rb +155 -0
- data/test/test_support/emptyish_record.marc +1 -0
- data/test/test_support/escaped_character_reference.marc8.marc +1 -0
- data/test/test_support/george_eliot.marc +1 -0
- data/test/test_support/hebrew880s.marc +1 -0
- data/test/test_support/louis_armstrong.marc +1 -0
- data/test/test_support/manufacturing_consent.marc +1 -0
- data/test/test_support/manuscript_online_thesis.marc +1 -0
- data/test/test_support/microform_online_conference.marc +1 -0
- data/test/test_support/multi_era.marc +1 -0
- data/test/test_support/multi_geo.marc +1 -0
- data/test/test_support/musical_cage.marc +1 -0
- data/test/test_support/nature.marc +1 -0
- data/test/test_support/one-marc8.mrc +1 -0
- data/test/test_support/online_only.marc +1 -0
- data/test/test_support/packed_041a_lang.marc +1 -0
- data/test/test_support/test_data.utf8.json +30 -0
- data/test/test_support/test_data.utf8.marc.xml +2609 -0
- data/test/test_support/test_data.utf8.mrc +1 -0
- data/test/test_support/test_data.utf8.mrc.gz +0 -0
- data/test/test_support/the_business_ren.marc +1 -0
- data/test/translation_map_test.rb +225 -0
- data/test/translation_maps/bad_ruby.rb +8 -0
- data/test/translation_maps/bad_yaml.yaml +1 -0
- data/test/translation_maps/both_map.rb +1 -0
- data/test/translation_maps/both_map.yaml +1 -0
- data/test/translation_maps/default_literal.rb +10 -0
- data/test/translation_maps/default_passthrough.rb +10 -0
- data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
- data/test/translation_maps/properties_map.properties +5 -0
- data/test/translation_maps/ruby_map.rb +10 -0
- data/test/translation_maps/translate_array_test.yaml +8 -0
- data/test/translation_maps/yaml_map.yaml +7 -0
- data/traject.gemspec +47 -0
- metadata +382 -0
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
require 'test_helper'
|
|
2
|
+
|
|
3
|
+
require 'traject/indexer'
|
|
4
|
+
require 'traject/macros/marc21'
|
|
5
|
+
|
|
6
|
+
require 'json'
|
|
7
|
+
require 'marc'
|
|
8
|
+
|
|
9
|
+
# See also marc_extractor_test.rb for more detailed tests on marc extraction,
|
|
10
|
+
# this is just a basic test to make sure our macro passes through to the marc
|
|
11
|
+
# extraction code correctly, and that its other options work.
|
|
12
|
+
describe "Traject::Macros::Marc21" do
|
|
13
|
+
Marc21 = Traject::Macros::Marc21 # shortcut
|
|
14
|
+
|
|
15
|
+
before do
|
|
16
|
+
@indexer = Traject::Indexer.new
|
|
17
|
+
@record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
describe "extract_marc" do
|
|
21
|
+
it "extracts marc" do
|
|
22
|
+
@indexer.instance_eval do
|
|
23
|
+
to_field "title", extract_marc("245ab")
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
output = @indexer.map_record(@record)
|
|
27
|
+
|
|
28
|
+
assert_equal ["Manufacturing consent : the political economy of the mass media /"], output["title"]
|
|
29
|
+
assert_equal({}, @indexer.map_record(empty_record))
|
|
30
|
+
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
it "respects :first=>true option" do
|
|
34
|
+
@indexer.instance_eval do
|
|
35
|
+
to_field "other_id", extract_marc("035a", :first => true)
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
output = @indexer.map_record(@record)
|
|
39
|
+
|
|
40
|
+
assert_length 1, output["other_id"]
|
|
41
|
+
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
it "trims punctuation with :trim_punctuation => true" do
|
|
45
|
+
@indexer.instance_eval do
|
|
46
|
+
to_field "title", extract_marc("245ab", :trim_punctuation => true)
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
output = @indexer.map_record(@record)
|
|
50
|
+
|
|
51
|
+
assert_equal ["Manufacturing consent : the political economy of the mass media"], output["title"]
|
|
52
|
+
assert_equal({}, @indexer.map_record(empty_record))
|
|
53
|
+
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
it "respects :default option" do
|
|
57
|
+
@indexer.instance_eval do
|
|
58
|
+
to_field "only_default", extract_marc("9999", :default => "DEFAULT VALUE")
|
|
59
|
+
end
|
|
60
|
+
output = @indexer.map_record(@record)
|
|
61
|
+
|
|
62
|
+
assert_equal ["DEFAULT VALUE"], output["only_default"]
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
it "de-duplicates by default, respects :allow_duplicates" do
|
|
66
|
+
# Add a second 008
|
|
67
|
+
f = @record.fields('008').first
|
|
68
|
+
@record.append(f)
|
|
69
|
+
|
|
70
|
+
@indexer.instance_eval do
|
|
71
|
+
to_field "lang1", extract_marc('008[35-37]')
|
|
72
|
+
to_field "lang2", extract_marc('008[35-37]', :allow_duplicates=>true)
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
output = @indexer.map_record(@record)
|
|
76
|
+
assert_equal ["eng"], output['lang1']
|
|
77
|
+
assert_equal ["eng", "eng"], output['lang2']
|
|
78
|
+
assert_equal({}, @indexer.map_record(empty_record))
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
it "fails on an extra/misspelled argument to extract_marc" do
|
|
82
|
+
assert_raises(RuntimeError) do
|
|
83
|
+
@indexer.instance_eval do
|
|
84
|
+
to_field "foo", extract_marc("9999", :misspelled => "Who cares")
|
|
85
|
+
end
|
|
86
|
+
end
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
it "Marc21::trim_punctuation class method" do
|
|
93
|
+
assert_equal "one two three", Marc21.trim_punctuation("one two three")
|
|
94
|
+
|
|
95
|
+
assert_equal "one two three", Marc21.trim_punctuation("one two three,")
|
|
96
|
+
assert_equal "one two three", Marc21.trim_punctuation("one two three/")
|
|
97
|
+
assert_equal "one two three", Marc21.trim_punctuation("one two three;")
|
|
98
|
+
assert_equal "one two three", Marc21.trim_punctuation("one two three:")
|
|
99
|
+
assert_equal "one two three .", Marc21.trim_punctuation("one two three .")
|
|
100
|
+
assert_equal "one two three", Marc21.trim_punctuation("one two three.")
|
|
101
|
+
|
|
102
|
+
assert_equal "one two [three]", Marc21.trim_punctuation("one two [three]")
|
|
103
|
+
assert_equal "one two three", Marc21.trim_punctuation("one two three]")
|
|
104
|
+
assert_equal "one two three", Marc21.trim_punctuation("[one two three")
|
|
105
|
+
assert_equal "one two three", Marc21.trim_punctuation("[one two three]")
|
|
106
|
+
|
|
107
|
+
# This one was a bug before
|
|
108
|
+
assert_equal "Feminism and art", Marc21.trim_punctuation("Feminism and art.")
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
it "uses :translation_map" do
|
|
112
|
+
@indexer.instance_eval do
|
|
113
|
+
to_field "cataloging_agency", extract_marc("040a", :separator => nil, :translation_map => "marc_040a_translate_test")
|
|
114
|
+
end
|
|
115
|
+
output = @indexer.map_record(@record)
|
|
116
|
+
|
|
117
|
+
assert_equal ["Library of Congress"], output["cataloging_agency"]
|
|
118
|
+
end
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
describe "serialized_marc" do
|
|
122
|
+
it "serializes xml" do
|
|
123
|
+
@indexer.instance_eval do
|
|
124
|
+
to_field "marc_record", serialized_marc(:format => "xml")
|
|
125
|
+
end
|
|
126
|
+
output = @indexer.map_record(@record)
|
|
127
|
+
|
|
128
|
+
assert_length 1, output["marc_record"]
|
|
129
|
+
assert_kind_of String, output["marc_record"].first
|
|
130
|
+
roundtrip_record = MARC::XMLReader.new(StringIO.new(output["marc_record"].first)).first
|
|
131
|
+
assert_equal @record, roundtrip_record
|
|
132
|
+
end
|
|
133
|
+
|
|
134
|
+
it "serializes binary UUEncoded" do
|
|
135
|
+
@indexer.instance_eval do
|
|
136
|
+
to_field "marc_record", serialized_marc(:format => "binary")
|
|
137
|
+
end
|
|
138
|
+
output = @indexer.map_record(@record)
|
|
139
|
+
|
|
140
|
+
assert_length 1, output["marc_record"]
|
|
141
|
+
assert_kind_of String, output["marc_record"].first
|
|
142
|
+
|
|
143
|
+
decoded = Base64.decode64( output["marc_record"].first )
|
|
144
|
+
|
|
145
|
+
# just check the marc header for now
|
|
146
|
+
assert_start_with "02067cam a2200469", decoded
|
|
147
|
+
end
|
|
148
|
+
|
|
149
|
+
it "serializes binary raw" do
|
|
150
|
+
@indexer.instance_eval do
|
|
151
|
+
to_field "marc_record", serialized_marc(:format => "binary", :binary_escape => false)
|
|
152
|
+
end
|
|
153
|
+
output = @indexer.map_record(@record)
|
|
154
|
+
|
|
155
|
+
assert_length 1, output["marc_record"]
|
|
156
|
+
assert_kind_of String, output["marc_record"].first
|
|
157
|
+
|
|
158
|
+
# just check the marc header for now
|
|
159
|
+
assert_start_with "02067cam a2200469", output["marc_record"].first
|
|
160
|
+
end
|
|
161
|
+
|
|
162
|
+
it "serializes json" do
|
|
163
|
+
@indexer.instance_eval do
|
|
164
|
+
to_field "marc_record", serialized_marc(:format => "json")
|
|
165
|
+
end
|
|
166
|
+
output = @indexer.map_record(@record)
|
|
167
|
+
|
|
168
|
+
assert_length 1, output["marc_record"]
|
|
169
|
+
|
|
170
|
+
# okay, let's actually deserialize it, why not
|
|
171
|
+
|
|
172
|
+
hash = JSON.parse( output["marc_record"].first )
|
|
173
|
+
|
|
174
|
+
deserialized = MARC::Record.new_from_hash(hash)
|
|
175
|
+
|
|
176
|
+
assert_equal @record, deserialized
|
|
177
|
+
end
|
|
178
|
+
end
|
|
179
|
+
|
|
180
|
+
it "#extract_all_marc_values" do
|
|
181
|
+
@indexer.instance_eval do
|
|
182
|
+
to_field "text", extract_all_marc_values
|
|
183
|
+
end
|
|
184
|
+
output = @indexer.map_record(@record)
|
|
185
|
+
|
|
186
|
+
assert_length 13, output["text"]
|
|
187
|
+
end
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
end
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
require 'test_helper'
|
|
2
|
+
|
|
3
|
+
describe "Indexer Macros:" do
|
|
4
|
+
before do
|
|
5
|
+
@indexer = Traject::Indexer.new
|
|
6
|
+
@record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
|
|
7
|
+
end
|
|
8
|
+
|
|
9
|
+
it "works with simple literal" do
|
|
10
|
+
@indexer.instance_eval do
|
|
11
|
+
extend Traject::Macros::Basic
|
|
12
|
+
|
|
13
|
+
to_field "source", literal("MY LIBRARY")
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
output = @indexer.map_record(@record)
|
|
17
|
+
|
|
18
|
+
assert_equal ["MY LIBRARY"], output["source"]
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
it "works with macro AND block" do
|
|
22
|
+
called = false
|
|
23
|
+
|
|
24
|
+
@indexer.instance_eval do
|
|
25
|
+
extend Traject::Macros::Basic
|
|
26
|
+
to_field "source", literal("MY LIBRARY") do |record, accumulator, context|
|
|
27
|
+
called = true
|
|
28
|
+
accumulator << "SECOND VALUE"
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
output = @indexer.map_record(@record)
|
|
33
|
+
|
|
34
|
+
assert called
|
|
35
|
+
assert_equal ["MY LIBRARY", "SECOND VALUE"], output["source"]
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
end
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
require 'test_helper'
|
|
2
|
+
|
|
3
|
+
describe "Traject::Indexer#map_record" do
|
|
4
|
+
before do
|
|
5
|
+
@indexer = Traject::Indexer.new
|
|
6
|
+
@record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
|
|
7
|
+
end
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
describe "with no indexing rules" do
|
|
11
|
+
it "returns empty hash" do
|
|
12
|
+
output = @indexer.map_record(@record)
|
|
13
|
+
|
|
14
|
+
assert_kind_of Hash, output
|
|
15
|
+
assert_empty output
|
|
16
|
+
end
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
describe "#to_field" do
|
|
20
|
+
it "works with block" do
|
|
21
|
+
called = false
|
|
22
|
+
|
|
23
|
+
@indexer.to_field("title") do |record, accumulator|
|
|
24
|
+
assert_kind_of MARC::Record, record
|
|
25
|
+
assert_kind_of Array, accumulator
|
|
26
|
+
|
|
27
|
+
called = true # by the power of closure!
|
|
28
|
+
accumulator << "Some Title"
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
output = @indexer.map_record(@record)
|
|
32
|
+
|
|
33
|
+
assert called
|
|
34
|
+
assert_kind_of Hash, output
|
|
35
|
+
assert_equal ["Some Title"], output["title"]
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
it "works with a lambda arg" do
|
|
39
|
+
called = false
|
|
40
|
+
|
|
41
|
+
logic = lambda do |record, accumulator|
|
|
42
|
+
assert_kind_of MARC::Record, record
|
|
43
|
+
assert_kind_of Array, accumulator
|
|
44
|
+
|
|
45
|
+
called = true # by the power of closure!
|
|
46
|
+
accumulator << "Some Title"
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
@indexer.to_field("title", logic)
|
|
50
|
+
|
|
51
|
+
output = @indexer.map_record(@record)
|
|
52
|
+
|
|
53
|
+
assert called
|
|
54
|
+
assert_kind_of Hash, output
|
|
55
|
+
assert_equal ["Some Title"], output["title"]
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
it "works with both lambda and Proc" do
|
|
59
|
+
block_called = false
|
|
60
|
+
|
|
61
|
+
lambda_arg = lambda do |record, accumulator|
|
|
62
|
+
accumulator << "Lambda-provided Value"
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
@indexer.to_field("title", lambda_arg) do |record, accumulator|
|
|
66
|
+
assert_includes accumulator, "Lambda-provided Value"
|
|
67
|
+
accumulator << "Block-provided Value"
|
|
68
|
+
|
|
69
|
+
block_called = true
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
output = @indexer.map_record(@record)
|
|
73
|
+
|
|
74
|
+
assert block_called
|
|
75
|
+
assert_includes output["title"], "Lambda-provided Value"
|
|
76
|
+
assert_includes output["title"], "Block-provided Value"
|
|
77
|
+
end
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
describe "multiple to_field blocks" do
|
|
81
|
+
it "get called in order" do
|
|
82
|
+
order = []
|
|
83
|
+
@indexer.to_field("title") do |rec, acc|
|
|
84
|
+
order << :first_one
|
|
85
|
+
acc << "First"
|
|
86
|
+
end
|
|
87
|
+
@indexer.to_field("title") do |rec, acc|
|
|
88
|
+
order << :second_one
|
|
89
|
+
acc << "Second"
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
output = @indexer.map_record(@record)
|
|
93
|
+
|
|
94
|
+
assert_equal [:first_one, :second_one], order
|
|
95
|
+
assert_equal ["First", "Second"], output["title"]
|
|
96
|
+
end
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
describe "context argument" do
|
|
100
|
+
it "is third argument to block" do
|
|
101
|
+
called = false
|
|
102
|
+
@indexer.to_field("title") do |record, accumulator, context|
|
|
103
|
+
called = true
|
|
104
|
+
|
|
105
|
+
assert_kind_of Traject::Indexer::Context, context
|
|
106
|
+
|
|
107
|
+
assert_kind_of Hash, context.clipboard
|
|
108
|
+
assert_kind_of Hash, context.output_hash
|
|
109
|
+
|
|
110
|
+
assert_same @record, record
|
|
111
|
+
assert_same record, context.source_record
|
|
112
|
+
assert_same @indexer.settings, context.settings
|
|
113
|
+
end
|
|
114
|
+
|
|
115
|
+
@indexer.map_record @record
|
|
116
|
+
|
|
117
|
+
assert called
|
|
118
|
+
end
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
describe "#each_record" do
|
|
122
|
+
it "is called with one-arg record" do
|
|
123
|
+
called = false
|
|
124
|
+
@indexer.each_record do |record|
|
|
125
|
+
called = true
|
|
126
|
+
assert_kind_of MARC::Record, record
|
|
127
|
+
end
|
|
128
|
+
@indexer.map_record(@record)
|
|
129
|
+
|
|
130
|
+
assert called, "each_record was called"
|
|
131
|
+
end
|
|
132
|
+
it "is called with two-arg record and context" do
|
|
133
|
+
called = false
|
|
134
|
+
@indexer.each_record do |record, context|
|
|
135
|
+
called = true
|
|
136
|
+
assert_kind_of MARC::Record, record
|
|
137
|
+
assert_kind_of Traject::Indexer::Context, context
|
|
138
|
+
end
|
|
139
|
+
@indexer.map_record(@record)
|
|
140
|
+
|
|
141
|
+
assert called, "each_record was called"
|
|
142
|
+
end
|
|
143
|
+
it "accepts lambda AND block" do
|
|
144
|
+
lambda_arg = lambda do |record, context|
|
|
145
|
+
context.output_hash["field"] ||= []
|
|
146
|
+
context.output_hash["field"] << "first"
|
|
147
|
+
end
|
|
148
|
+
|
|
149
|
+
@indexer.each_record(lambda_arg) do |record, context|
|
|
150
|
+
context.output_hash["field"] ||= []
|
|
151
|
+
context.output_hash["field"] << "second"
|
|
152
|
+
end
|
|
153
|
+
|
|
154
|
+
output = @indexer.map_record(@record)
|
|
155
|
+
|
|
156
|
+
assert_equal %w{first second}, output["field"]
|
|
157
|
+
end
|
|
158
|
+
it "is called in order with #to_field" do
|
|
159
|
+
@indexer.to_field("foo") {|record, accumulator| accumulator << "first"}
|
|
160
|
+
@indexer.each_record {|record, context| context.output_hash["foo"] << "second" }
|
|
161
|
+
@indexer.to_field("foo") {|record, accumulator| accumulator << "third"}
|
|
162
|
+
|
|
163
|
+
output = @indexer.map_record(@record)
|
|
164
|
+
|
|
165
|
+
assert_equal %w{first second third}, output["foo"]
|
|
166
|
+
end
|
|
167
|
+
end
|
|
168
|
+
|
|
169
|
+
describe "map_to_context!" do
|
|
170
|
+
before do
|
|
171
|
+
@context = Traject::Indexer::Context.new(:source_record => @record, :settings => @indexer.settings, :position => 10 )
|
|
172
|
+
end
|
|
173
|
+
it "passes context to indexing routines" do
|
|
174
|
+
called = false
|
|
175
|
+
@indexer.to_field("title") do |record, accumulator, context|
|
|
176
|
+
called = true
|
|
177
|
+
assert_kind_of Traject::Indexer::Context, context
|
|
178
|
+
assert_same @context, context
|
|
179
|
+
end
|
|
180
|
+
|
|
181
|
+
context = @indexer.map_to_context!(@context)
|
|
182
|
+
|
|
183
|
+
assert_same @context, context
|
|
184
|
+
|
|
185
|
+
assert called, "Called mapping routine"
|
|
186
|
+
end
|
|
187
|
+
|
|
188
|
+
it "skips records" do
|
|
189
|
+
|
|
190
|
+
@indexer.to_field("beforeSkip") do |rec, acc|
|
|
191
|
+
acc << "Before"
|
|
192
|
+
end
|
|
193
|
+
|
|
194
|
+
@indexer.to_field('radical') do |rec, acc, context|
|
|
195
|
+
context.skip!("Chomsky!") if rec['245'].to_s =~ /Chomsky/
|
|
196
|
+
end
|
|
197
|
+
|
|
198
|
+
@indexer.to_field('afterSkip') do |rec, acc|
|
|
199
|
+
acc << "After. Should never happen"
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
output = @indexer.map_record(@record)
|
|
203
|
+
assert_equal ['Before'], output['beforeSkip']
|
|
204
|
+
assert_nil output['afterSkip']
|
|
205
|
+
end
|
|
206
|
+
|
|
207
|
+
end
|
|
208
|
+
|
|
209
|
+
end
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
require 'test_helper'
|
|
2
|
+
|
|
3
|
+
# A little Traject Writer that just keeps everything
|
|
4
|
+
# in an array, just added to settings for easy access
|
|
5
|
+
memory_writer_class = Class.new do
|
|
6
|
+
def initialize(settings)
|
|
7
|
+
# store them in a class variable so we can test em later
|
|
8
|
+
@@last_writer_settings = @settings = settings
|
|
9
|
+
@settings["memory_writer.added"] = []
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
def put(hash)
|
|
13
|
+
@settings["memory_writer.added"] << hash
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def close
|
|
17
|
+
@settings["memory_writer.closed"] = true
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
describe "Traject::Indexer#process" do
|
|
22
|
+
before do
|
|
23
|
+
# no threading for these tests
|
|
24
|
+
@indexer = Traject::Indexer.new("processing_thread_pool" => nil)
|
|
25
|
+
@indexer.writer_class = memory_writer_class
|
|
26
|
+
@file = File.open(support_file_path "test_data.utf8.mrc")
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
it "works" do
|
|
30
|
+
# NOTE: this times_called counter isn't thread-safe under multi-threading,
|
|
31
|
+
# which is why this test used to fail sometimes.
|
|
32
|
+
# The indexer is now set up to be single-threaded for these tests.
|
|
33
|
+
times_called = 0
|
|
34
|
+
@indexer.to_field("title") do |record, accumulator, context|
|
|
35
|
+
times_called += 1
|
|
36
|
+
accumulator << "ADDED TITLE"
|
|
37
|
+
|
|
38
|
+
assert context.index_step, "Context has #index_step set"
|
|
39
|
+
assert_equal "title", context.index_step.field_name
|
|
40
|
+
|
|
41
|
+
assert context.logger, "Context knows #logger"
|
|
42
|
+
|
|
43
|
+
assert_equal times_called, context.position
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
return_value = @indexer.process( @file )
|
|
47
|
+
|
|
48
|
+
assert return_value, "Returns `true` on success"
|
|
49
|
+
|
|
50
|
+
# Grab the settings out of a class variable where we left em,
|
|
51
|
+
# as a convenient place to store outcomes so we can test em.
|
|
52
|
+
writer_settings = memory_writer_class.class_variable_get("@@last_writer_settings")
|
|
53
|
+
|
|
54
|
+
assert writer_settings["memory_writer.added"]
|
|
55
|
+
assert_equal 30, writer_settings["memory_writer.added"].length
|
|
56
|
+
assert_kind_of Traject::Indexer::Context, writer_settings["memory_writer.added"].first
|
|
57
|
+
assert_equal ["ADDED TITLE"], writer_settings["memory_writer.added"].first.output_hash["title"]
|
|
58
|
+
|
|
59
|
+
# logger provided in settings
|
|
60
|
+
assert writer_settings["logger"]
|
|
61
|
+
|
|
62
|
+
assert writer_settings["memory_writer.closed"]
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
require 'traject/null_writer'
|
|
66
|
+
it "calls after_processing after processing" do
|
|
67
|
+
@indexer = Traject::Indexer.new(
|
|
68
|
+
"writer_class_name" => "Traject::NullWriter"
|
|
69
|
+
)
|
|
70
|
+
@file = File.open(support_file_path "test_data.utf8.mrc")
|
|
71
|
+
|
|
72
|
+
called = []
|
|
73
|
+
|
|
74
|
+
@indexer.after_processing do
|
|
75
|
+
called << :one
|
|
76
|
+
end
|
|
77
|
+
@indexer.after_processing do
|
|
78
|
+
called << :two
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
@indexer.process(@file)
|
|
82
|
+
|
|
83
|
+
assert_equal [:one, :two], called, "Both after_processing hooks called, in order"
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
describe "demo_config.rb" do
|
|
87
|
+
before do
|
|
88
|
+
@indexer = Traject::Indexer.new(
|
|
89
|
+
"writer_class_name" => "Traject::NullWriter"
|
|
90
|
+
)
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
it "parses and loads" do
|
|
94
|
+
conf_path = support_file_path "demo_config.rb"
|
|
95
|
+
File.open(conf_path) do |file_io|
|
|
96
|
+
@indexer.instance_eval(file_io.read, conf_path)
|
|
97
|
+
end
|
|
98
|
+
end
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
end
|