traject 2.0.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.travis.yml +27 -0
- data/.yardopts +3 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +20 -0
- data/README.md +461 -0
- data/Rakefile +21 -0
- data/bench/bench.rb +30 -0
- data/bin/traject +16 -0
- data/doc/batch_execution.md +243 -0
- data/doc/extending.md +190 -0
- data/doc/indexing_rules.md +265 -0
- data/doc/other_commands.md +47 -0
- data/doc/settings.md +101 -0
- data/lib/tasks/load_maps.rake +48 -0
- data/lib/traject.rb +11 -0
- data/lib/traject/command_line.rb +301 -0
- data/lib/traject/csv_writer.rb +34 -0
- data/lib/traject/debug_writer.rb +47 -0
- data/lib/traject/delimited_writer.rb +110 -0
- data/lib/traject/indexer.rb +613 -0
- data/lib/traject/indexer/settings.rb +110 -0
- data/lib/traject/json_writer.rb +51 -0
- data/lib/traject/line_writer.rb +63 -0
- data/lib/traject/macros/basic.rb +9 -0
- data/lib/traject/macros/marc21.rb +223 -0
- data/lib/traject/macros/marc21_semantics.rb +584 -0
- data/lib/traject/macros/marc_format_classifier.rb +197 -0
- data/lib/traject/marc_extractor.rb +410 -0
- data/lib/traject/marc_reader.rb +89 -0
- data/lib/traject/mock_reader.rb +97 -0
- data/lib/traject/ndj_reader.rb +40 -0
- data/lib/traject/null_writer.rb +22 -0
- data/lib/traject/qualified_const_get.rb +40 -0
- data/lib/traject/solr_json_writer.rb +277 -0
- data/lib/traject/thread_pool.rb +161 -0
- data/lib/traject/translation_map.rb +267 -0
- data/lib/traject/util.rb +52 -0
- data/lib/traject/version.rb +3 -0
- data/lib/traject/yaml_writer.rb +9 -0
- data/lib/translation_maps/lcc_top_level.yaml +26 -0
- data/lib/translation_maps/marc_genre_007.yaml +9 -0
- data/lib/translation_maps/marc_genre_leader.yaml +22 -0
- data/lib/translation_maps/marc_geographic.yaml +589 -0
- data/lib/translation_maps/marc_instruments.yaml +102 -0
- data/lib/translation_maps/marc_languages.yaml +490 -0
- data/test/debug_writer_test.rb +38 -0
- data/test/delimited_writer_test.rb +104 -0
- data/test/indexer/each_record_test.rb +59 -0
- data/test/indexer/macros_marc21_semantics_test.rb +391 -0
- data/test/indexer/macros_marc21_test.rb +190 -0
- data/test/indexer/macros_test.rb +40 -0
- data/test/indexer/map_record_test.rb +209 -0
- data/test/indexer/read_write_test.rb +101 -0
- data/test/indexer/settings_test.rb +152 -0
- data/test/indexer/to_field_test.rb +77 -0
- data/test/marc_extractor_test.rb +412 -0
- data/test/marc_format_classifier_test.rb +98 -0
- data/test/marc_reader_test.rb +110 -0
- data/test/solr_json_writer_test.rb +248 -0
- data/test/test_helper.rb +90 -0
- data/test/test_support/245_no_ab.marc +1 -0
- data/test/test_support/880_with_no_6.utf8.marc +1 -0
- data/test/test_support/bad_subfield_code.marc +1 -0
- data/test/test_support/bad_utf_byte.utf8.marc +1 -0
- data/test/test_support/date_resort_to_260.marc +1 -0
- data/test/test_support/date_type_r_missing_date2.marc +1 -0
- data/test/test_support/date_with_u.marc +1 -0
- data/test/test_support/demo_config.rb +155 -0
- data/test/test_support/emptyish_record.marc +1 -0
- data/test/test_support/escaped_character_reference.marc8.marc +1 -0
- data/test/test_support/george_eliot.marc +1 -0
- data/test/test_support/hebrew880s.marc +1 -0
- data/test/test_support/louis_armstrong.marc +1 -0
- data/test/test_support/manufacturing_consent.marc +1 -0
- data/test/test_support/manuscript_online_thesis.marc +1 -0
- data/test/test_support/microform_online_conference.marc +1 -0
- data/test/test_support/multi_era.marc +1 -0
- data/test/test_support/multi_geo.marc +1 -0
- data/test/test_support/musical_cage.marc +1 -0
- data/test/test_support/nature.marc +1 -0
- data/test/test_support/one-marc8.mrc +1 -0
- data/test/test_support/online_only.marc +1 -0
- data/test/test_support/packed_041a_lang.marc +1 -0
- data/test/test_support/test_data.utf8.json +30 -0
- data/test/test_support/test_data.utf8.marc.xml +2609 -0
- data/test/test_support/test_data.utf8.mrc +1 -0
- data/test/test_support/test_data.utf8.mrc.gz +0 -0
- data/test/test_support/the_business_ren.marc +1 -0
- data/test/translation_map_test.rb +225 -0
- data/test/translation_maps/bad_ruby.rb +8 -0
- data/test/translation_maps/bad_yaml.yaml +1 -0
- data/test/translation_maps/both_map.rb +1 -0
- data/test/translation_maps/both_map.yaml +1 -0
- data/test/translation_maps/default_literal.rb +10 -0
- data/test/translation_maps/default_passthrough.rb +10 -0
- data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
- data/test/translation_maps/properties_map.properties +5 -0
- data/test/translation_maps/ruby_map.rb +10 -0
- data/test/translation_maps/translate_array_test.yaml +8 -0
- data/test/translation_maps/yaml_map.yaml +7 -0
- data/traject.gemspec +47 -0
- metadata +382 -0
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
require 'test_helper'
|
|
2
|
+
require 'stringio'
|
|
3
|
+
|
|
4
|
+
require 'traject/debug_writer'
|
|
5
|
+
require 'traject'
|
|
6
|
+
require 'marc'
|
|
7
|
+
|
|
8
|
+
# Spec for Traject::DebugWriter: runs the first record of a MARC fixture
# through a two-field indexer and checks what the writer prints to an
# in-memory stream.
describe 'Simple output' do
  before do
    # support_file_path is not defined in this file (presumably test_helper).
    @record = MARC::Reader.new(support_file_path("manufacturing_consent.marc")).to_a.first

    @indexer = Traject::Indexer.new
    @indexer.instance_eval do
      to_field "id", extract_marc("001", :first => true)
      to_field "title", extract_marc("245ab")
    end

    # Capture the writer's output in memory so it can be inspected.
    @io     = StringIO.new
    @writer = Traject::DebugWriter.new("output_stream" => @io)

    @id    = "2710183"
    @title = "Manufacturing consent : the political economy of the mass media /"
  end

  # Close the writer in a hook so it still happens when an assertion fails.
  after do
    @writer.close if @writer
  end

  it "does a simple output" do
    @writer.put Traject::Indexer::Context.new(:output_hash => @indexer.map_record(@record))

    expected = [
      "#{@id} id #{@id}",
      "#{@id} title #{@title}",
      "\n"
    ]

    # All whitespace is stripped from both sides, so only the
    # non-whitespace content of the output is compared.
    assert_equal expected.join("\n").gsub(/\s/, ''), @io.string.gsub(/\s/, '')
  end
end
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
# Encoding: UTF-8
|
|
2
|
+
|
|
3
|
+
require 'test_helper'
|
|
4
|
+
require 'stringio'
|
|
5
|
+
require 'traject/delimited_writer'
|
|
6
|
+
require 'traject/csv_writer'
|
|
7
|
+
|
|
8
|
+
require 'csv'
|
|
9
|
+
|
|
10
|
+
# Specs for Traject::DelimitedWriter and Traject::CSVWriter. Both writers are
# pointed at an in-memory StringIO and configured to emit the fields
# four, one, two (in that order).
describe "Delimited/CSV Writers" do

  before do
    @out      = StringIO.new
    @settings = {'output_stream' => @out, 'delimited_writer.fields' => 'four,one,two'}

    # Minimal stand-in for an indexing context: the specs only ever set and
    # hand over #output_hash.
    @context = Struct.new(:output_hash).new
    @context.output_hash = {'one' => 'one', 'two' => %w[two1 two2], 'three' => 'three', 'four' => 'four'}
  end

  after do
    @out.close
  end

  describe "Traject::DelimitedWriter" do

    it "creates a dw with defaults" do
      dw = Traject::DelimitedWriter.new(@settings)
      dw.delimiter.must_equal "\t"
      dw.internal_delimiter.must_equal '|'
      dw.edelim.must_equal ' '
      dw.eidelim.must_equal '\\|'
    end

    it "respects different delimiter" do
      @settings['delimited_writer.delimiter'] = '^'
      dw = Traject::DelimitedWriter.new(@settings)
      dw.delimiter.must_equal '^'
      dw.edelim.must_equal '\\^'
      dw.internal_delimiter.must_equal '|'
    end

    it "outputs a header if asked to" do
      # Constructing the writer is the whole action here; the instance itself
      # is not needed afterwards.
      Traject::DelimitedWriter.new(@settings)
      @out.string.chomp.must_equal %w[four one two].join("\t")
    end

    it "doesn't output a header if asked not to" do
      @settings['delimited_writer.header'] = 'false'
      Traject::DelimitedWriter.new(@settings)
      @out.string.must_be_empty
    end

    it "deals with multiple values" do
      dw = Traject::DelimitedWriter.new(@settings)
      dw.put @context
      # 'three' is absent from the expected row because it is not in the
      # configured field list.
      @out.string.split("\n").last.must_equal ['four', 'one', 'two1|two2'].join(dw.delimiter)
    end

    it "bails if delimited_writer.fields isn't set" do
      @settings.delete 'delimited_writer.fields'
      proc { Traject::DelimitedWriter.new(@settings) }.must_raise(ArgumentError)
    end

  end

  describe "Traject::CSVWriter" do
    it "unsets the delimiter" do
      cw = Traject::CSVWriter.new(@settings)
      cw.delimiter.must_be_nil
    end

    it "writes the header" do
      Traject::CSVWriter.new(@settings)
      @out.string.chomp.must_equal 'four,one,two'
    end

    it "uses the internal delimiter" do
      cw = Traject::CSVWriter.new(@settings)
      cw.put @context
      @out.string.split("\n").last.must_equal ['four', 'one', 'two1|two2'].join(',')
    end

    it "produces complex output" do
      @context.output_hash = {
        'four' => ['Bill Clinton, Jr.', 'Jesse "the Body" Ventura'],
        'one'  => 'Willard "Mitt" Romney',
        'two'  => 'Dueber, Bill'
      }

      # Build the reference row with the stdlib CSV library so that quoting
      # of embedded commas and double quotes is decided by CSV itself.
      canonical = StringIO.new
      csv       = CSV.new(canonical)

      csv_vals = [@context.output_hash['four'].join('|'), @context.output_hash['one'], @context.output_hash['two']]
      csv << csv_vals
      csv_output = canonical.string.chomp

      cw = Traject::CSVWriter.new(@settings)
      cw.put @context
      traject_csvwriter_output = @out.string.split("\n").last.chomp

      assert_equal(csv_output, traject_csvwriter_output)
    end

  end
end
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
require 'test_helper'
|
|
2
|
+
|
|
3
|
+
# Specs for the argument checking done by Traject::Indexer#each_record:
# which block arities are accepted, and which errors are raised otherwise.
describe "Traject::Indexer#each_record" do
  before do
    @indexer = Traject::Indexer.new
  end

  describe "checks arguments" do
    # NOTE: the block parameters below are intentionally unused -- only the
    # arity of each block matters to these specs.

    it "rejects no-arg block" do
      assert_raises(Traject::Indexer::ArityError) do
        @indexer.each_record do
        end
      end
    end

    it "rejects three-arg block" do
      assert_raises(Traject::Indexer::ArityError) do
        @indexer.each_record do |one, two, three|
        end
      end
    end

    it "accepts one-arg block" do
      @indexer.each_record do |record|
      end
    end

    it "accepts two-arg block" do
      @indexer.each_record do |record, context|
      end
    end

    it "accepts variable arity block" do
      @indexer.each_record do |*variable|
      end
    end

    it "outputs error with source location" do
      # A valid to_field step is registered first; it is setup, so it stays
      # outside the guarded call.
      @indexer.to_field('foo') {|one, two| }

      # assert_raises returns the rescued exception, and reports the class
      # and message of any *other* exception instead of hiding it.
      error = assert_raises(Traject::Indexer::ArityError) do
        @indexer.each_record {|one, two, three| } # bad arity
      end

      assert_match(/each_record at .*\/.*:\d+/, error.message)
    end

    it "rejects each_record with a name (e.g., using a to_field syntax)" do
      assert_raises(Traject::Indexer::NamingError) do
        @indexer.each_record('bad_name') {|one, two| }
      end
    end

    it "reject each_record with no arguments/blocks at all" do
      assert_raises(ArgumentError) do
        @indexer.each_record()
      end
    end

  end
end
|
|
@@ -0,0 +1,391 @@
|
|
|
1
|
+
# Encoding: UTF-8
|
|
2
|
+
|
|
3
|
+
require 'test_helper'
|
|
4
|
+
|
|
5
|
+
require 'traject/indexer'
|
|
6
|
+
require 'traject/macros/marc21_semantics'
|
|
7
|
+
|
|
8
|
+
require 'json'
|
|
9
|
+
require 'marc/record'
|
|
10
|
+
|
|
11
|
+
# See also marc_extractor_test.rb for more detailed tests on marc extraction,
|
|
12
|
+
# this is just a basic test to make sure our macro works passing through to there
|
|
13
|
+
# and other options.
|
|
14
|
+
# Specs for the macros in Traject::Macros::Marc21Semantics. Each spec maps a
# MARC fixture record through an indexer extended with the module and checks
# the resulting output hash; a few call module-level helpers directly.
#
# support_file_path and empty_record are not defined in this file
# (presumably they come from test_helper).
describe "Traject::Macros::Marc21Semantics" do
  Marc21Semantics = Traject::Macros::Marc21Semantics # shortcut

  before do
    @indexer = Traject::Indexer.new
    @indexer.extend Marc21Semantics

    # Default fixture; individual specs reassign @record when they need another.
    @record = MARC::Reader.new(support_file_path("manufacturing_consent.marc")).to_a.first
  end

  it "oclcnum" do
    @indexer.instance_eval do
      to_field "oclcnum", oclcnum
    end
    output = @indexer.map_record(@record)

    assert_equal %w{47971712}, output["oclcnum"]

    assert_equal({}, @indexer.map_record(empty_record))
  end

  it "deals with all prefixed OCLC nunbers" do
    @record.append(MARC::DataField.new('035', ' ', ' ', ['a', '(OCoLC)ocm111111111']))
    @record.append(MARC::DataField.new('035', ' ', ' ', ['a', '(OCoLC)222222222']))
    @record.append(MARC::DataField.new('035', ' ', ' ', ['a', 'ocm333333333']))
    @record.append(MARC::DataField.new('035', ' ', ' ', ['a', 'ocn444444444']))
    @record.append(MARC::DataField.new('035', ' ', ' ', ['a', '(OCoLC)ocn555555555']))
    @record.append(MARC::DataField.new('035', ' ', ' ', ['a', '(OCoLC)on666666666']))
    @record.append(MARC::DataField.new('035', ' ', ' ', ['a', '777777777'])) # not OCLC number

    @indexer.instance_eval do
      to_field "oclcnum", oclcnum
    end
    output = @indexer.map_record(@record)

    assert_equal %w{47971712 111111111 222222222 333333333 444444444 555555555 666666666}, output["oclcnum"]
  end



  it "#marc_series_facet" do
    @record = MARC::Reader.new(support_file_path("louis_armstrong.marc")).to_a.first

    @indexer.instance_eval do
      to_field "series_facet", marc_series_facet
    end
    output = @indexer.map_record(@record)

    # trims punctuation too
    assert_equal ["Big bands"], output["series_facet"]
    assert_equal({}, @indexer.map_record(empty_record))

  end

  describe "marc_sortable_author" do
    # these probably should be taking only certain subfields, but we're copying
    # from SolrMarc that didn't do so either and nobody noticed, so not bothering for now.
    before do
      @indexer.instance_eval do
        to_field "author_sort", marc_sortable_author
      end
    end
    it "collates author and title" do
      output = @indexer.map_record(@record)

      assert_equal ["Herman, Edward S. Manufacturing consent the political economy of the mass media Edward S. Herman and Noam Chomsky ; with a new introduction by the authors"], output["author_sort"]
      assert_equal [""], @indexer.map_record(empty_record)['author_sort']

    end
    it "respects non-filing" do
      @record = MARC::Reader.new(support_file_path("the_business_ren.marc")).to_a.first

      output = @indexer.map_record(@record)

      assert_equal ["Business renaissance quarterly [electronic resource]."], output["author_sort"]
      assert_equal [""], @indexer.map_record(empty_record)['author_sort']

    end
  end

  describe "marc_sortable_title" do
    before do
      @indexer.instance_eval { to_field "title_sort", marc_sortable_title }
    end
    it "works" do
      output = @indexer.map_record(@record)
      assert_equal ["Manufacturing consent : the political economy of the mass media"], output["title_sort"]
      assert_equal({}, @indexer.map_record(empty_record))

    end
    it "respects non-filing" do
      @record = MARC::Reader.new(support_file_path("the_business_ren.marc")).to_a.first
      output = @indexer.map_record(@record)

      assert_equal ["Business renaissance quarterly"], output["title_sort"]
    end
    it "works with a record with no 245$ab" do
      @record = MARC::Reader.new(support_file_path("245_no_ab.marc")).to_a.first
      output = @indexer.map_record(@record)
      assert_equal ["Papers"], output["title_sort"]
    end
  end

  describe "marc_languages" do
    before do
      @indexer.instance_eval {to_field "languages", marc_languages() }
    end

    it "unpacks packed 041a and translates" do
      @record = MARC::Reader.new(support_file_path("packed_041a_lang.marc")).to_a.first
      output = @indexer.map_record(@record)

      assert_equal ["English", "French", "German", "Italian", "Spanish", "Russian"], output["languages"]
      assert_equal({}, @indexer.map_record(empty_record))

    end
  end

  describe "marc_instrumentation_humanized" do
    before do
      @record = MARC::Reader.new(support_file_path("musical_cage.marc")).to_a.first
      @indexer.instance_eval {to_field "instrumentation", marc_instrumentation_humanized }
    end

    it "translates, de-duping" do
      output = @indexer.map_record(@record)

      assert_equal ["Larger ensemble, Unspecified", "Piano", "Soprano voice", "Tenor voice", "Violin", "Larger ensemble, Ethnic", "Guitar", "Voices, Unspecified"], output["instrumentation"]
      assert_equal({}, @indexer.map_record(empty_record))

    end
  end

  describe "marc_instrument_codes_normalized" do
    before do
      @record = MARC::Reader.new(support_file_path("musical_cage.marc")).to_a.first
      @indexer.instance_eval {to_field "instrument_codes", marc_instrument_codes_normalized }
    end
    it "normalizes, de-duping" do
      output = @indexer.map_record(@record)

      assert_equal ["on", "ka01", "ka", "va01", "va", "vd01", "vd", "sa01", "sa", "oy", "tb01", "tb", "vn12", "vn"],
        output["instrument_codes"]
    end
    it "codes soloist 048$b" do
      @record = MARC::Reader.new(support_file_path("louis_armstrong.marc")).to_a.first
      output = @indexer.map_record(@record)

      assert_equal ["bb01", "bb01.s", "bb", "bb.s", "oe"], output["instrument_codes"]
      assert_equal({}, @indexer.map_record(empty_record))

    end
  end

  describe "publication_date" do
    # there are way too many edge cases for us to test em all, but we'll test some of em.

    it "works when there's no date information" do
      # assert_nil rather than assert_equal nil, which Minitest deprecates.
      assert_nil Marc21Semantics.publication_date(empty_record)
    end

    it "uses macro correctly with no date info" do
      @indexer.instance_eval {to_field "date", marc_publication_date }
      assert_equal({}, @indexer.map_record(empty_record))
    end


    it "pulls out 008 date_type s" do
      @record = MARC::Reader.new(support_file_path("manufacturing_consent.marc")).to_a.first
      assert_equal 2002, Marc21Semantics.publication_date(@record)

    end
    it "uses start date for date_type c continuing resource" do
      @record = MARC::Reader.new(support_file_path("the_business_ren.marc")).to_a.first
      assert_equal 2006, Marc21Semantics.publication_date(@record)
    end
    it "returns nil when the records really got nothing" do
      @record = MARC::Reader.new(support_file_path("emptyish_record.marc")).to_a.first
      assert_nil Marc21Semantics.publication_date(@record)
    end
    it "estimates with a single 'u'" do
      @record = MARC::Reader.new(support_file_path("date_with_u.marc")).to_a.first
      # was 184u as date1 on a continuing resource. For continuing resources,
      # we take the first date. And need to deal with the u.
      assert_equal 1845, Marc21Semantics.publication_date(@record)
    end
    it "resorts to 260c" do
      @record = MARC::Reader.new(support_file_path("date_resort_to_260.marc")).to_a.first
      assert_equal 1980, Marc21Semantics.publication_date(@record)
    end
    it "works with date type r missing date2" do
      @record = MARC::Reader.new(support_file_path("date_type_r_missing_date2.marc")).to_a.first
      assert_equal 1957, Marc21Semantics.publication_date(@record)
    end

    it "works correctly with date type 'q'" do
      # Rewrite the 008 of the default fixture: position 6 is set to 'q',
      # positions 7-10 to '191u' and positions 11-14 to '192u'.
      val = @record['008'].value
      val[6] = 'q'
      val[7..10] = '191u'
      val[11..14] = '192u'
      @record['008'].value = val

      # Date should be (date1 + date2) / 2 = (1910 + 1929) / 2 = 1919
      estimate_tolerance = 30
      assert_equal 1919, Marc21Semantics.publication_date(@record, estimate_tolerance)
    end
  end

  describe "marc_lcc_to_broad_category" do
    before do
      @indexer.instance_eval {to_field "discipline_facet", marc_lcc_to_broad_category }
    end
    it "maps a simple example" do
      @record = MARC::Reader.new(support_file_path("manufacturing_consent.marc")).to_a.first
      output = @indexer.map_record(@record)

      assert_equal ["Language & Literature"], output["discipline_facet"]

    end
    it "maps to default" do
      @record = MARC::Reader.new(support_file_path("musical_cage.marc")).to_a.first
      output = @indexer.map_record(@record)
      assert_equal ["Unknown"], output["discipline_facet"]
      assert_equal(["Unknown"], @indexer.map_record(empty_record)['discipline_facet'])
    end

    it "maps to nothing if none and no default" do
      @indexer.instance_eval {to_field "discipline_no_default", marc_lcc_to_broad_category(:default => nil)}
      @record = MARC::Reader.new(support_file_path("musical_cage.marc")).to_a.first
      output = @indexer.map_record(@record)

      assert_nil output["discipline_no_default"]

      assert_nil @indexer.map_record(empty_record)["discipline_no_default"]

    end

    describe "LCC_REGEX" do
      it "rejects a non-LCC" do
        refute_match Traject::Macros::Marc21Semantics::LCC_REGEX, "Film no. A .N285"
      end
    end

  end

  describe "marc_geo_facet" do
    before do
      @indexer.instance_eval {to_field "geo_facet", marc_geo_facet }
    end
    it "maps a complicated record" do
      @record = MARC::Reader.new(support_file_path("multi_geo.marc")).to_a.first
      output = @indexer.map_record(@record)

      assert_equal ["Europe", "Middle East", "Africa, North", "Agora (Athens, Greece)", "Rome (Italy)", "Italy"], output["geo_facet"]
      assert_equal({}, @indexer.map_record(empty_record))
    end
    it "maps nothing on a record with no geo" do
      @record = MARC::Reader.new(support_file_path("manufacturing_consent.marc")).to_a.first
      output = @indexer.map_record(@record)
      assert_nil output["geo_facet"]
      assert_equal({}, @indexer.map_record(empty_record))

    end
  end

  describe "marc_era_facet" do
    before do
      @indexer.instance_eval {to_field "era_facet", marc_era_facet}
    end
    it "maps a complicated record" do
      @record = MARC::Reader.new(support_file_path("multi_era.marc")).to_a.first
      output = @indexer.map_record(@record)

      assert_equal ["Early modern, 1500-1700", "17th century", "Great Britain: Puritan Revolution, 1642-1660", "Great Britain: Civil War, 1642-1649", "1642-1660"],
        output["era_facet"]
      assert_equal({}, @indexer.map_record(empty_record))

    end
  end

  describe "marc_lcsh_display" do
    # These specs call Marc21Semantics.assemble_lcsh directly on hand-built
    # MARC::DataField objects rather than going through an indexer.

    it "formats typical field" do
      field = MARC::DataField.new('650', ' ', ' ', ['a', 'Psychoanalysis and literature'], ['z', 'England'], ['x', 'History'], ['y', '19th century.'])
      str = Marc21Semantics.assemble_lcsh(field)

      assert_equal "Psychoanalysis and literature — England — History — 19th century", str

    end

    it "ignores numeric subfields" do
      field = MARC::DataField.new('650', ' ', ' ', ['a', 'Psychoanalysis and literature'], ['x', 'History'], ['0', '01234'], ['3', 'Some part'])
      str = Marc21Semantics.assemble_lcsh(field)

      assert_equal "Psychoanalysis and literature — History", str
    end

    it "doesn't put subdivision in wrong place" do
      field = MARC::DataField.new('600', ' ', ' ', ['a', 'Eliot, George,'],['d', '1819-1880.'], ['t', 'Middlemarch'])
      str = Marc21Semantics.assemble_lcsh(field)

      assert_equal "Eliot, George, 1819-1880. Middlemarch", str
    end

    it "mixes non-subdivisions with subdivisions" do
      field = MARC::DataField.new('600', ' ', ' ', ['a', 'Eliot, George,'],['d', '1819-1880.'], ['t', 'Middlemarch'], ['x', 'Criticism.'])
      str = Marc21Semantics.assemble_lcsh(field)

      assert_equal "Eliot, George, 1819-1880. Middlemarch — Criticism", str
    end

    it "returns nil for a field with no relevant subfields" do
      field = MARC::DataField.new('650', ' ', ' ')
      assert_nil Marc21Semantics.assemble_lcsh(field)
    end

    describe "marc_lcsh_formatted macro" do
      it "smoke test" do
        @record = MARC::Reader.new(support_file_path("george_eliot.marc")).to_a.first
        @indexer.instance_eval {to_field "lcsh", marc_lcsh_formatted}
        output = @indexer.map_record(@record)

        assert output["lcsh"].length > 0, "outputs data"
        assert output["lcsh"].include?("Eliot, George, 1819-1880 — Characters"), "includes a string its supposed to"

        assert_equal({}, @indexer.map_record(empty_record))

      end
    end
  end

  describe "extract_marc_filing_version" do
    before do
      @record = MARC::Reader.new(support_file_path("the_business_ren.marc")).to_a.first
    end

    it "works as expected" do
      @indexer.instance_eval do
        to_field 'title_phrase', extract_marc_filing_version('245ab')
      end
      output = @indexer.map_record(@record)
      assert_equal ['Business renaissance quarterly'], output['title_phrase']
      assert_equal({}, @indexer.map_record(empty_record))

    end

    it "works with :include_original" do
      @indexer.instance_eval do
        to_field 'title_phrase', extract_marc_filing_version('245ab', :include_original=>true)
      end
      output = @indexer.map_record(@record)
      assert_equal ['The Business renaissance quarterly', 'Business renaissance quarterly'], output['title_phrase']
      assert_equal({}, @indexer.map_record(empty_record))
    end

    it "doesn't do anything if you don't include the first subfield" do
      @indexer.instance_eval do
        to_field 'title_phrase', extract_marc_filing_version('245h')
      end
      output = @indexer.map_record(@record)
      assert_equal ['[electronic resource].'], output['title_phrase']
      assert_equal({}, @indexer.map_record(empty_record))

    end


    it "dies if you pass it something else" do
      assert_raises(RuntimeError) do
        @indexer.instance_eval do
          to_field 'title_phrase', extract_marc_filing_version('245ab', :include_original=>true, :uniq => true)
        end
      end
    end

  end



end
|