traject 2.0.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.travis.yml +27 -0
- data/.yardopts +3 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +20 -0
- data/README.md +461 -0
- data/Rakefile +21 -0
- data/bench/bench.rb +30 -0
- data/bin/traject +16 -0
- data/doc/batch_execution.md +243 -0
- data/doc/extending.md +190 -0
- data/doc/indexing_rules.md +265 -0
- data/doc/other_commands.md +47 -0
- data/doc/settings.md +101 -0
- data/lib/tasks/load_maps.rake +48 -0
- data/lib/traject.rb +11 -0
- data/lib/traject/command_line.rb +301 -0
- data/lib/traject/csv_writer.rb +34 -0
- data/lib/traject/debug_writer.rb +47 -0
- data/lib/traject/delimited_writer.rb +110 -0
- data/lib/traject/indexer.rb +613 -0
- data/lib/traject/indexer/settings.rb +110 -0
- data/lib/traject/json_writer.rb +51 -0
- data/lib/traject/line_writer.rb +63 -0
- data/lib/traject/macros/basic.rb +9 -0
- data/lib/traject/macros/marc21.rb +223 -0
- data/lib/traject/macros/marc21_semantics.rb +584 -0
- data/lib/traject/macros/marc_format_classifier.rb +197 -0
- data/lib/traject/marc_extractor.rb +410 -0
- data/lib/traject/marc_reader.rb +89 -0
- data/lib/traject/mock_reader.rb +97 -0
- data/lib/traject/ndj_reader.rb +40 -0
- data/lib/traject/null_writer.rb +22 -0
- data/lib/traject/qualified_const_get.rb +40 -0
- data/lib/traject/solr_json_writer.rb +277 -0
- data/lib/traject/thread_pool.rb +161 -0
- data/lib/traject/translation_map.rb +267 -0
- data/lib/traject/util.rb +52 -0
- data/lib/traject/version.rb +3 -0
- data/lib/traject/yaml_writer.rb +9 -0
- data/lib/translation_maps/lcc_top_level.yaml +26 -0
- data/lib/translation_maps/marc_genre_007.yaml +9 -0
- data/lib/translation_maps/marc_genre_leader.yaml +22 -0
- data/lib/translation_maps/marc_geographic.yaml +589 -0
- data/lib/translation_maps/marc_instruments.yaml +102 -0
- data/lib/translation_maps/marc_languages.yaml +490 -0
- data/test/debug_writer_test.rb +38 -0
- data/test/delimited_writer_test.rb +104 -0
- data/test/indexer/each_record_test.rb +59 -0
- data/test/indexer/macros_marc21_semantics_test.rb +391 -0
- data/test/indexer/macros_marc21_test.rb +190 -0
- data/test/indexer/macros_test.rb +40 -0
- data/test/indexer/map_record_test.rb +209 -0
- data/test/indexer/read_write_test.rb +101 -0
- data/test/indexer/settings_test.rb +152 -0
- data/test/indexer/to_field_test.rb +77 -0
- data/test/marc_extractor_test.rb +412 -0
- data/test/marc_format_classifier_test.rb +98 -0
- data/test/marc_reader_test.rb +110 -0
- data/test/solr_json_writer_test.rb +248 -0
- data/test/test_helper.rb +90 -0
- data/test/test_support/245_no_ab.marc +1 -0
- data/test/test_support/880_with_no_6.utf8.marc +1 -0
- data/test/test_support/bad_subfield_code.marc +1 -0
- data/test/test_support/bad_utf_byte.utf8.marc +1 -0
- data/test/test_support/date_resort_to_260.marc +1 -0
- data/test/test_support/date_type_r_missing_date2.marc +1 -0
- data/test/test_support/date_with_u.marc +1 -0
- data/test/test_support/demo_config.rb +155 -0
- data/test/test_support/emptyish_record.marc +1 -0
- data/test/test_support/escaped_character_reference.marc8.marc +1 -0
- data/test/test_support/george_eliot.marc +1 -0
- data/test/test_support/hebrew880s.marc +1 -0
- data/test/test_support/louis_armstrong.marc +1 -0
- data/test/test_support/manufacturing_consent.marc +1 -0
- data/test/test_support/manuscript_online_thesis.marc +1 -0
- data/test/test_support/microform_online_conference.marc +1 -0
- data/test/test_support/multi_era.marc +1 -0
- data/test/test_support/multi_geo.marc +1 -0
- data/test/test_support/musical_cage.marc +1 -0
- data/test/test_support/nature.marc +1 -0
- data/test/test_support/one-marc8.mrc +1 -0
- data/test/test_support/online_only.marc +1 -0
- data/test/test_support/packed_041a_lang.marc +1 -0
- data/test/test_support/test_data.utf8.json +30 -0
- data/test/test_support/test_data.utf8.marc.xml +2609 -0
- data/test/test_support/test_data.utf8.mrc +1 -0
- data/test/test_support/test_data.utf8.mrc.gz +0 -0
- data/test/test_support/the_business_ren.marc +1 -0
- data/test/translation_map_test.rb +225 -0
- data/test/translation_maps/bad_ruby.rb +8 -0
- data/test/translation_maps/bad_yaml.yaml +1 -0
- data/test/translation_maps/both_map.rb +1 -0
- data/test/translation_maps/both_map.yaml +1 -0
- data/test/translation_maps/default_literal.rb +10 -0
- data/test/translation_maps/default_passthrough.rb +10 -0
- data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
- data/test/translation_maps/properties_map.properties +5 -0
- data/test/translation_maps/ruby_map.rb +10 -0
- data/test/translation_maps/translate_array_test.yaml +8 -0
- data/test/translation_maps/yaml_map.yaml +7 -0
- data/traject.gemspec +47 -0
- metadata +382 -0
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
require 'stringio'
|
3
|
+
|
4
|
+
require 'traject/debug_writer'
|
5
|
+
require 'traject'
|
6
|
+
require 'marc'
|
7
|
+
|
8
|
+
# Exercises Traject::DebugWriter end to end: one MARC record is mapped through
# a two-field indexer and the text the writer emits is compared to the expected
# "<id> <field> <value>" lines.
describe 'Simple output' do
  before do
    @record  = MARC::Reader.new(support_file_path("manufacturing_consent.marc")).to_a.first
    @indexer = Traject::Indexer.new
    @indexer.instance_eval do
      to_field "id", extract_marc("001", :first => true)
      to_field "title", extract_marc("245ab")
    end
    @io     = StringIO.new
    @writer = Traject::DebugWriter.new("output_stream" => @io)

    @id    = "2710183"
    @title = "Manufacturing consent : the political economy of the mass media /"
  end

  # Close the writer in teardown so it is released even when the assertion
  # below fails (previously the close was skipped on failure).
  after do
    @writer.close if @writer
  end

  it "does a simple output" do
    @writer.put Traject::Indexer::Context.new(:output_hash => @indexer.map_record(@record))
    expected = [
      "#{@id} id #{@id}",
      "#{@id} title #{@title}",
      "\n"
    ]
    # All whitespace is stripped from both sides, so only the non-blank
    # content of the output is compared.
    assert_equal expected.join("\n").gsub(/\s/, ''), @io.string.gsub(/\s/, '')
  end
end
|
36
|
+
|
37
|
+
|
38
|
+
|
@@ -0,0 +1,104 @@
|
|
1
|
+
# Encoding: UTF-8
|
2
|
+
|
3
|
+
require 'test_helper'
|
4
|
+
require 'stringio'
|
5
|
+
require 'traject/delimited_writer'
|
6
|
+
require 'traject/csv_writer'
|
7
|
+
|
8
|
+
require 'csv'
|
9
|
+
|
10
|
+
describe "Delimited/CSV Writers" do
|
11
|
+
|
12
|
+
# Shared fixture: an in-memory output stream, writer settings that select
# three of the four fields, and a minimal context object exposing #output_hash.
before do
  @out      = StringIO.new
  @settings = {
    'output_stream'           => @out,
    'delimited_writer.fields' => 'four,one,two'
  }

  sample_hash = {
    'one'   => 'one',
    'two'   => ['two1', 'two2'],
    'three' => 'three',
    'four'  => 'four'
  }
  @context = Struct.new(:output_hash).new(sample_hash)
end
|
18
|
+
|
19
|
+
# Release the in-memory output stream created in the before hook.
after { @out.close }
|
22
|
+
|
23
|
+
# Specs for Traject::DelimitedWriter: default and custom delimiters, header
# output, multi-valued fields, and the required 'delimited_writer.fields' setting.
describe "Traject::DelimitedWriter" do

  it "creates a dw with defaults" do
    dw = Traject::DelimitedWriter.new(@settings)
    dw.delimiter.must_equal "\t"
    dw.internal_delimiter.must_equal '|'
    dw.edelim.must_equal ' '
    dw.eidelim.must_equal '\\|'
  end

  it "respects different delimiter" do
    @settings['delimited_writer.delimiter'] = '^'
    dw = Traject::DelimitedWriter.new(@settings)
    dw.delimiter.must_equal '^'
    dw.edelim.must_equal '\\^'
    dw.internal_delimiter.must_equal '|'
  end

  it "outputs a header if asked to" do
    # Only the side effect of construction on @out is checked, so the
    # writer instance itself is not kept.
    Traject::DelimitedWriter.new(@settings)
    @out.string.chomp.must_equal %w[four one two].join("\t")
  end

  it "doesn't output a header if asked not to" do
    @settings['delimited_writer.header'] = 'false'
    # As above: construct for its side effect on @out only.
    Traject::DelimitedWriter.new(@settings)
    @out.string.must_be_empty
  end

  it "deals with multiple values" do
    dw = Traject::DelimitedWriter.new(@settings)
    dw.put @context
    # The last output line is the record row; the multi-valued 'two' field
    # is expected to be joined with '|'.
    @out.string.split("\n").last.must_equal ['four', 'one', 'two1|two2'].join(dw.delimiter)
  end

  it "bails if delimited_writer.fields isn't set" do
    @settings.delete 'delimited_writer.fields'
    proc { Traject::DelimitedWriter.new(@settings) }.must_raise(ArgumentError)
  end

end
|
64
|
+
|
65
|
+
# Specs for Traject::CSVWriter: header output, joining of multi-valued fields,
# and agreement with Ruby's own CSV library on quoting.
describe "Traject::CSVWriter" do
  it "unsets the delimiter" do
    cw = Traject::CSVWriter.new(@settings)
    cw.delimiter.must_be_nil
  end

  it "writes the header" do
    # Only the side effect of construction on @out is checked, so the
    # writer instance itself is not kept.
    Traject::CSVWriter.new(@settings)
    @out.string.chomp.must_equal 'four,one,two'
  end

  it "uses the internal delimiter" do
    cw = Traject::CSVWriter.new(@settings)
    cw.put @context
    @out.string.split("\n").last.must_equal ['four', 'one', 'two1|two2'].join(',')
  end

  it "produces complex output" do
    @context.output_hash = {
      'four' => ['Bill Clinton, Jr.', 'Jesse "the Body" Ventura'],
      'one'  => 'Willard "Mitt" Romney',
      'two'  => 'Dueber, Bill'
    }

    # Build the reference row with the stdlib CSV writer, using the same
    # field order as 'delimited_writer.fields' (four, one, two).
    canonical = StringIO.new
    csv = CSV.new(canonical)

    csv_vals = [@context.output_hash['four'].join('|'), @context.output_hash['one'], @context.output_hash['two']]
    csv << csv_vals
    csv_output = canonical.string.chomp

    cw = Traject::CSVWriter.new(@settings)
    cw.put @context
    traject_csvwriter_output = @out.string.split("\n").last.chomp

    assert_equal(csv_output, traject_csvwriter_output)
  end

end
|
104
|
+
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
# Specs for the argument checking done by Traject::Indexer#each_record:
# which block arities are accepted, and which errors are raised otherwise.
describe "Traject::Indexer#each_record" do
  before do
    @indexer = Traject::Indexer.new
  end

  describe "checks arguments" do
    it "rejects no-arg block" do
      assert_raises(Traject::Indexer::ArityError) do
        @indexer.each_record do
        end
      end
    end

    it "rejects three-arg block" do
      assert_raises(Traject::Indexer::ArityError) do
        @indexer.each_record do |one, two, three|
        end
      end
    end

    it "accepts one-arg block" do
      @indexer.each_record do |record|
      end
    end

    it "accepts two-arg block" do
      @indexer.each_record do |record, context|
      end
    end

    it "accepts variable arity block" do
      @indexer.each_record do |*variable|
      end
    end

    it "outputs error with source location" do
      # assert_raises fails the example if nothing, or a different exception,
      # is raised, and hands back the exception so its message can be checked.
      error = assert_raises(Traject::Indexer::ArityError) do
        @indexer.to_field('foo') {|one, two| }
        @indexer.each_record {|one, two, three| } # bad arity
      end

      assert_match(/each_record at .*\/.*:\d+/, error.message)
    end

    it "rejects each_record with a name (e.g., using a to_field syntax)" do
      assert_raises(Traject::Indexer::NamingError) do
        @indexer.each_record('bad_name') {|one, two| }
      end
    end

    it "reject each_record with no arguments/blocks at all" do
      assert_raises(ArgumentError) do
        @indexer.each_record()
      end
    end

  end
end
|
@@ -0,0 +1,391 @@
|
|
1
|
+
# Encoding: UTF-8
|
2
|
+
|
3
|
+
require 'test_helper'
|
4
|
+
|
5
|
+
require 'traject/indexer'
|
6
|
+
require 'traject/macros/marc21_semantics'
|
7
|
+
|
8
|
+
require 'json'
|
9
|
+
require 'marc/record'
|
10
|
+
|
11
|
+
# See also marc_extractor_test.rb for more detailed tests on marc extraction,
|
12
|
+
# this is just a basic test to make sure our macro works passing through to there
|
13
|
+
# and other options.
|
14
|
+
describe "Traject::Macros::Marc21Semantics" do
|
15
|
+
Marc21Semantics = Traject::Macros::Marc21Semantics # shortcut
|
16
|
+
|
17
|
+
# Every example starts with a fresh indexer that has the Marc21Semantics
# macros mixed in, plus a default record read from the
# manufacturing_consent fixture.
before do
  @indexer = Traject::Indexer.new.tap { |indexer| indexer.extend(Marc21Semantics) }

  fixture = support_file_path("manufacturing_consent.marc")
  @record = MARC::Reader.new(fixture).to_a.first
end
|
23
|
+
|
24
|
+
# The oclcnum macro yields the record's OCLC number, and nothing at all
# for an empty record.
it "oclcnum" do
  @indexer.instance_eval { to_field "oclcnum", oclcnum }

  mapped = @indexer.map_record(@record)
  assert_equal(["47971712"], mapped["oclcnum"])

  assert_equal({}, @indexer.map_record(empty_record))
end
|
34
|
+
|
35
|
+
# Adds a series of 035$a fields with differently prefixed values and checks
# that the prefixed ones come back as bare numbers while the unprefixed
# value is left out.
it "deals with all prefixed OCLC nunbers" do
  extra_035a_values = [
    '(OCoLC)ocm111111111',
    '(OCoLC)222222222',
    'ocm333333333',
    'ocn444444444',
    '(OCoLC)ocn555555555',
    '(OCoLC)on666666666',
    '777777777' # not OCLC number
  ]
  extra_035a_values.each do |value|
    @record.append(MARC::DataField.new('035', ' ', ' ', ['a', value]))
  end

  @indexer.instance_eval { to_field "oclcnum", oclcnum }
  mapped = @indexer.map_record(@record)

  expected = ["47971712", "111111111", "222222222", "333333333", "444444444", "555555555", "666666666"]
  assert_equal(expected, mapped["oclcnum"])
end
|
51
|
+
|
52
|
+
|
53
|
+
|
54
|
+
# marc_series_facet on the louis_armstrong fixture, and on an empty record.
it "#marc_series_facet" do
  fixture = support_file_path("louis_armstrong.marc")
  @record = MARC::Reader.new(fixture).to_a.first

  @indexer.instance_eval { to_field "series_facet", marc_series_facet }
  mapped = @indexer.map_record(@record)

  # trims punctuation too
  assert_equal(["Big bands"], mapped["series_facet"])
  assert_equal({}, @indexer.map_record(empty_record))
end
|
67
|
+
|
68
|
+
describe "marc_sortable_author" do
  # these probably should be taking only certain subfields, but we're copying
  # from SolrMarc that didn't do so either and nobody noticed, so not bothering for now.
  before do
    @indexer.instance_eval { to_field "author_sort", marc_sortable_author }
  end

  it "collates author and title" do
    mapped = @indexer.map_record(@record)
    expected = ["Herman, Edward S. Manufacturing consent the political economy of the mass media Edward S. Herman and Noam Chomsky ; with a new introduction by the authors"]

    assert_equal(expected, mapped["author_sort"])
    # An empty record still yields a single empty sort key.
    assert_equal([""], @indexer.map_record(empty_record)['author_sort'])
  end

  it "respects non-filing" do
    fixture = support_file_path("the_business_ren.marc")
    @record = MARC::Reader.new(fixture).to_a.first

    mapped = @indexer.map_record(@record)

    assert_equal(["Business renaissance quarterly [electronic resource]."], mapped["author_sort"])
    assert_equal([""], @indexer.map_record(empty_record)['author_sort'])
  end
end
|
93
|
+
|
94
|
+
# marc_sortable_title: plain title, non-filing handling, and a 245 with
# neither $a nor $b.
describe "marc_sortable_title" do
  before do
    @indexer.instance_eval do
      to_field "title_sort", marc_sortable_title
    end
  end

  it "works" do
    mapped = @indexer.map_record(@record)

    assert_equal(["Manufacturing consent : the political economy of the mass media"], mapped["title_sort"])
    assert_equal({}, @indexer.map_record(empty_record))
  end

  it "respects non-filing" do
    fixture = support_file_path("the_business_ren.marc")
    @record = MARC::Reader.new(fixture).to_a.first

    mapped = @indexer.map_record(@record)

    assert_equal(["Business renaissance quarterly"], mapped["title_sort"])
  end

  it "works with a record with no 245$ab" do
    fixture = support_file_path("245_no_ab.marc")
    @record = MARC::Reader.new(fixture).to_a.first

    mapped = @indexer.map_record(@record)

    assert_equal(["Papers"], mapped["title_sort"])
  end
end
|
116
|
+
|
117
|
+
# marc_languages: language names produced from the packed_041a_lang fixture.
describe "marc_languages" do
  before do
    @indexer.instance_eval do
      to_field "languages", marc_languages()
    end
  end

  it "unpacks packed 041a and translates" do
    fixture = support_file_path("packed_041a_lang.marc")
    @record = MARC::Reader.new(fixture).to_a.first

    mapped   = @indexer.map_record(@record)
    expected = %w[English French German Italian Spanish Russian]

    assert_equal(expected, mapped["languages"])
    assert_equal({}, @indexer.map_record(empty_record))
  end
end
|
131
|
+
|
132
|
+
# marc_instrumentation_humanized: human-readable instrument labels for the
# musical_cage fixture.
describe "marc_instrumentation_humanized" do
  before do
    fixture = support_file_path("musical_cage.marc")
    @record = MARC::Reader.new(fixture).to_a.first
    @indexer.instance_eval do
      to_field "instrumentation", marc_instrumentation_humanized
    end
  end

  it "translates, de-duping" do
    mapped = @indexer.map_record(@record)

    expected = [
      "Larger ensemble, Unspecified",
      "Piano",
      "Soprano voice",
      "Tenor voice",
      "Violin",
      "Larger ensemble, Ethnic",
      "Guitar",
      "Voices, Unspecified"
    ]
    assert_equal(expected, mapped["instrumentation"])
    assert_equal({}, @indexer.map_record(empty_record))
  end
end
|
146
|
+
|
147
|
+
# marc_instrument_codes_normalized: normalized instrument codes, including
# the ".s" soloist variants expected for the louis_armstrong fixture.
describe "marc_instrument_codes_normalized" do
  before do
    fixture = support_file_path("musical_cage.marc")
    @record = MARC::Reader.new(fixture).to_a.first
    @indexer.instance_eval do
      to_field "instrument_codes", marc_instrument_codes_normalized
    end
  end

  it "normalizes, de-duping" do
    mapped   = @indexer.map_record(@record)
    expected = %w[on ka01 ka va01 va vd01 vd sa01 sa oy tb01 tb vn12 vn]

    assert_equal(expected, mapped["instrument_codes"])
  end

  it "codes soloist 048$b" do
    fixture = support_file_path("louis_armstrong.marc")
    @record = MARC::Reader.new(fixture).to_a.first

    mapped = @indexer.map_record(@record)

    assert_equal(["bb01", "bb01.s", "bb", "bb.s", "oe"], mapped["instrument_codes"])
    assert_equal({}, @indexer.map_record(empty_record))
  end
end
|
167
|
+
|
168
|
+
# Specs for Marc21Semantics.publication_date and the marc_publication_date
# macro, covering a selection of date situations via fixture records.
describe "publication_date" do
  # there are way too many edge cases for us to test em all, but we'll test some of em.

  it "works when there's no date information" do
    # assert_nil rather than assert_equal(nil, ...), which Minitest deprecates
    assert_nil Marc21Semantics.publication_date(empty_record)
  end

  it "uses macro correctly with no date info" do
    @indexer.instance_eval {to_field "date", marc_publication_date }
    assert_equal({}, @indexer.map_record(empty_record))
  end

  it "pulls out 008 date_type s" do
    @record = MARC::Reader.new(support_file_path("manufacturing_consent.marc")).to_a.first
    assert_equal 2002, Marc21Semantics.publication_date(@record)
  end

  it "uses start date for date_type c continuing resource" do
    @record = MARC::Reader.new(support_file_path("the_business_ren.marc")).to_a.first
    assert_equal 2006, Marc21Semantics.publication_date(@record)
  end

  it "returns nil when the records really got nothing" do
    @record = MARC::Reader.new(support_file_path("emptyish_record.marc")).to_a.first
    assert_nil Marc21Semantics.publication_date(@record)
  end

  it "estimates with a single 'u'" do
    @record = MARC::Reader.new(support_file_path("date_with_u.marc")).to_a.first
    # was 184u as date1 on a continuing resource. For continuing resources,
    # we take the first date. And need to deal with the u.
    assert_equal 1845, Marc21Semantics.publication_date(@record)
  end

  it "resorts to 260c" do
    @record = MARC::Reader.new(support_file_path("date_resort_to_260.marc")).to_a.first
    assert_equal 1980, Marc21Semantics.publication_date(@record)
  end

  it "works with date type r missing date2" do
    @record = MARC::Reader.new(support_file_path("date_type_r_missing_date2.marc")).to_a.first
    assert_equal 1957, Marc21Semantics.publication_date(@record)
  end

  it "works correctly with date type 'q'" do
    # Rewrite the default record's 008: date type at position 6, date1 at
    # positions 7-10 and date2 at positions 11-14.
    val = @record['008'].value
    val[6] = 'q'
    val[7..10] = '191u'
    val[11..14] = '192u'
    @record['008'].value = val

    # Date should be (date1 + date2) / 2 = (1910 + 1929) / 2 = 1919.5,
    # which is expected to come back as 1919.
    estimate_tolerance = 30
    assert_equal 1919, Marc21Semantics.publication_date(@record, estimate_tolerance)
  end
end
|
221
|
+
|
222
|
+
# marc_lcc_to_broad_category: mapping to a broad discipline label, the default
# label, and suppressing the default with :default => nil.
describe "marc_lcc_to_broad_category" do
  before do
    @indexer.instance_eval do
      to_field "discipline_facet", marc_lcc_to_broad_category
    end
  end

  it "maps a simple example" do
    fixture = support_file_path("manufacturing_consent.marc")
    @record = MARC::Reader.new(fixture).to_a.first

    mapped = @indexer.map_record(@record)

    assert_equal(["Language & Literature"], mapped["discipline_facet"])
  end

  it "maps to default" do
    fixture = support_file_path("musical_cage.marc")
    @record = MARC::Reader.new(fixture).to_a.first

    mapped = @indexer.map_record(@record)

    assert_equal(["Unknown"], mapped["discipline_facet"])
    assert_equal(["Unknown"], @indexer.map_record(empty_record)['discipline_facet'])
  end

  it "maps to nothing if none and no default" do
    @indexer.instance_eval do
      to_field "discipline_no_default", marc_lcc_to_broad_category(:default => nil)
    end
    fixture = support_file_path("musical_cage.marc")
    @record = MARC::Reader.new(fixture).to_a.first

    mapped = @indexer.map_record(@record)

    assert_nil(mapped["discipline_no_default"])
    assert_nil(@indexer.map_record(empty_record)["discipline_no_default"])
  end

  describe "LCC_REGEX" do
    it "rejects a non-LCC" do
      refute_match(Traject::Macros::Marc21Semantics::LCC_REGEX, "Film no. A .N285")
    end
  end

end
|
258
|
+
|
259
|
+
# marc_geo_facet: geographic facet values for a record with several geographic
# headings, and no field at all for a record without any.
describe "marc_geo_facet" do
  before do
    @indexer.instance_eval do
      to_field "geo_facet", marc_geo_facet
    end
  end

  it "maps a complicated record" do
    fixture = support_file_path("multi_geo.marc")
    @record = MARC::Reader.new(fixture).to_a.first

    mapped = @indexer.map_record(@record)

    expected = [
      "Europe",
      "Middle East",
      "Africa, North",
      "Agora (Athens, Greece)",
      "Rome (Italy)",
      "Italy"
    ]
    assert_equal(expected, mapped["geo_facet"])
    assert_equal({}, @indexer.map_record(empty_record))
  end

  it "maps nothing on a record with no geo" do
    fixture = support_file_path("manufacturing_consent.marc")
    @record = MARC::Reader.new(fixture).to_a.first

    mapped = @indexer.map_record(@record)

    assert_nil(mapped["geo_facet"])
    assert_equal({}, @indexer.map_record(empty_record))
  end
end
|
278
|
+
|
279
|
+
# marc_era_facet: era facet values for the multi_era fixture.
describe "marc_era_facet" do
  before do
    @indexer.instance_eval do
      to_field "era_facet", marc_era_facet
    end
  end

  it "maps a complicated record" do
    fixture = support_file_path("multi_era.marc")
    @record = MARC::Reader.new(fixture).to_a.first

    mapped = @indexer.map_record(@record)

    expected = [
      "Early modern, 1500-1700",
      "17th century",
      "Great Britain: Puritan Revolution, 1642-1660",
      "Great Britain: Civil War, 1642-1649",
      "1642-1660"
    ]
    assert_equal(expected, mapped["era_facet"])
    assert_equal({}, @indexer.map_record(empty_record))
  end
end
|
293
|
+
|
294
|
+
# Marc21Semantics.assemble_lcsh on hand-built subject fields, plus a smoke
# test of the marc_lcsh_formatted macro against a fixture record.
describe "marc_lcsh_display" do
  it "formats typical field" do
    subfields = [
      ['a', 'Psychoanalysis and literature'],
      ['z', 'England'],
      ['x', 'History'],
      ['y', '19th century.']
    ]
    heading = MARC::DataField.new('650', ' ', ' ', *subfields)

    formatted = Marc21Semantics.assemble_lcsh(heading)

    assert_equal("Psychoanalysis and literature — England — History — 19th century", formatted)
  end

  it "ignores numeric subfields" do
    subfields = [
      ['a', 'Psychoanalysis and literature'],
      ['x', 'History'],
      ['0', '01234'],
      ['3', 'Some part']
    ]
    heading = MARC::DataField.new('650', ' ', ' ', *subfields)

    formatted = Marc21Semantics.assemble_lcsh(heading)

    assert_equal("Psychoanalysis and literature — History", formatted)
  end

  it "doesn't put subdivision in wrong place" do
    subfields = [
      ['a', 'Eliot, George,'],
      ['d', '1819-1880.'],
      ['t', 'Middlemarch']
    ]
    heading = MARC::DataField.new('600', ' ', ' ', *subfields)

    formatted = Marc21Semantics.assemble_lcsh(heading)

    assert_equal("Eliot, George, 1819-1880. Middlemarch", formatted)
  end

  it "mixes non-subdivisions with subdivisions" do
    subfields = [
      ['a', 'Eliot, George,'],
      ['d', '1819-1880.'],
      ['t', 'Middlemarch'],
      ['x', 'Criticism.']
    ]
    heading = MARC::DataField.new('600', ' ', ' ', *subfields)

    formatted = Marc21Semantics.assemble_lcsh(heading)

    assert_equal("Eliot, George, 1819-1880. Middlemarch — Criticism", formatted)
  end

  it "returns nil for a field with no relevant subfields" do
    bare_heading = MARC::DataField.new('650', ' ', ' ')

    assert_nil(Marc21Semantics.assemble_lcsh(bare_heading))
  end

  describe "marc_lcsh_formatted macro" do
    it "smoke test" do
      fixture = support_file_path("george_eliot.marc")
      @record = MARC::Reader.new(fixture).to_a.first
      @indexer.instance_eval do
        to_field "lcsh", marc_lcsh_formatted
      end

      mapped = @indexer.map_record(@record)

      assert(mapped["lcsh"].length > 0, "outputs data")
      assert(mapped["lcsh"].include?("Eliot, George, 1819-1880 — Characters"), "includes a string its supposed to")

      assert_equal({}, @indexer.map_record(empty_record))
    end
  end
end
|
343
|
+
|
344
|
+
# extract_marc_filing_version against the the_business_ren fixture: the default
# form, the :include_original option, a spec that omits the first subfield,
# and rejection of an additional option.
describe "extract_marc_filing_version" do
  before do
    fixture = support_file_path("the_business_ren.marc")
    @record = MARC::Reader.new(fixture).to_a.first
  end

  it "works as expected" do
    @indexer.instance_eval { to_field 'title_phrase', extract_marc_filing_version('245ab') }

    mapped = @indexer.map_record(@record)

    assert_equal(['Business renaissance quarterly'], mapped['title_phrase'])
    assert_equal({}, @indexer.map_record(empty_record))
  end

  it "works with :include_original" do
    @indexer.instance_eval { to_field 'title_phrase', extract_marc_filing_version('245ab', :include_original=>true) }

    mapped   = @indexer.map_record(@record)
    expected = ['The Business renaissance quarterly', 'Business renaissance quarterly']

    assert_equal(expected, mapped['title_phrase'])
    assert_equal({}, @indexer.map_record(empty_record))
  end

  it "doesn't do anything if you don't include the first subfield" do
    @indexer.instance_eval { to_field 'title_phrase', extract_marc_filing_version('245h') }

    mapped = @indexer.map_record(@record)

    assert_equal(['[electronic resource].'], mapped['title_phrase'])
    assert_equal({}, @indexer.map_record(empty_record))
  end

  it "dies if you pass it something else" do
    assert_raises(RuntimeError) do
      @indexer.instance_eval { to_field 'title_phrase', extract_marc_filing_version('245ab', :include_original=>true, :uniq => true) }
    end
  end

end
|
388
|
+
|
389
|
+
|
390
|
+
|
391
|
+
end
|