traject 2.3.2 → 2.3.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +2 -2
- data/CHANGES.md +12 -2
- data/index_bench/common.rb +1 -1
- data/lib/traject/command_line.rb +2 -2
- data/lib/traject/indexer/settings.rb +5 -1
- data/lib/traject/macros/marc21.rb +1 -1
- data/lib/traject/macros/marc21_semantics.rb +8 -7
- data/lib/traject/macros/marc_format_classifier.rb +2 -1
- data/lib/traject/mock_reader.rb +3 -2
- data/lib/traject/ndj_reader.rb +1 -1
- data/lib/traject/solr_json_writer.rb +2 -2
- data/lib/traject/util.rb +18 -17
- data/lib/traject/version.rb +1 -1
- data/test/indexer/{macros_marc21_semantics_test.rb → macros/macros_marc21_semantics_test.rb} +2 -2
- data/test/{marc21_macros_test.rb → indexer/macros/marc21/extract_all_marc_values_test.rb} +29 -2
- data/test/indexer/macros/marc21/extract_marc_test.rb +125 -0
- data/test/indexer/macros/marc21/serialize_marc_test.rb +73 -0
- data/test/indexer/macros/marc21/trim_punctuation_test.rb +39 -0
- data/test/indexer/{macros_test.rb → macros/to_field_test.rb} +1 -1
- data/test/indexer/map_record_test.rb +1 -1
- data/test/indexer/to_field_test.rb +1 -1
- data/test/indexer/writer_test.rb +17 -10
- data/test/test_support/demo_config.rb +3 -1
- metadata +15 -11
- data/test/indexer/macros_marc21_test.rb +0 -219
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2507bfdf51675c233b64ebbecb3247aaf53281ec
|
4
|
+
data.tar.gz: 0fd4edc089aa6cc09e477d4e7e8538f830d20f1c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1091dbd01a7f2adf5ac7a8e4d09f9aec6c67b162379280442f62677e32dc82ec3d575fee3dc131983e5a23e4680cd92cbe4c739c2daf0f25dbf4db83febf7e95
|
7
|
+
data.tar.gz: 7220b2acb51c9ccb6cbbf8cb7caef8c478beaf16f115cbee7b8f719b4de91261926d7a6f6d059d84d0d4abced778148199864fd754c6abc3c7a54ebe85950d1a
|
data/.travis.yml
CHANGED
data/CHANGES.md
CHANGED
@@ -1,12 +1,22 @@
|
|
1
1
|
# Changes
|
2
2
|
|
3
|
+
## 2.3.3
|
4
|
+
* Further squash use of capture-variables ('$1', etc.)
|
5
|
+
to try to work around the non-thread-safety of
|
6
|
+
regexp in ruby
|
7
|
+
* Fix a bug in trim_punctuation where trailing
|
8
|
+
periods were being eliminated even if there
|
9
|
+
was a short string before them (e.g., 'Jr.')
|
10
|
+
* Begin to reorganize tests, starting with
|
11
|
+
the Marc21 macros
|
12
|
+
|
3
13
|
## 2.3.2
|
4
14
|
* Change to `extract_marc` to work around a threadsafe problem in JRuby/MRI where
|
5
15
|
regexps were unsafely shared between threads. (@codeforkjeff)
|
6
16
|
* Make trim-punctuation safe for non-just-ASCII text (thanks to @dunn and @redlibrarian)
|
7
17
|
|
8
18
|
## 2.3.1
|
9
|
-
* Update README with more info
|
19
|
+
* Update README with more info about new nil-related options
|
10
20
|
|
11
21
|
## 2.3.0
|
12
22
|
* Allow nil values, empty fields, and deduplication
|
@@ -21,7 +31,7 @@
|
|
21
31
|
Set to `true` to pass empty fields on to the writer (with the value being an empty array)
|
22
32
|
|
23
33
|
## 2.2.1
|
24
|
-
* Had
|
34
|
+
* Had inadvertently broken use of arrays as extract_marc specifications. Fixed.
|
25
35
|
|
26
36
|
## 2.2.0
|
27
37
|
* Change DebugWriter to be more forgiving (and informative) about missing record-id fields
|
data/index_bench/common.rb
CHANGED
@@ -131,6 +131,6 @@ to_field "edition", extract_marc('250a')
|
|
131
131
|
|
132
132
|
to_field 'language', marc_languages("008[35-37]:041a:041d:041e:041j")
|
133
133
|
to_field 'language008', extract_marc('008[35-37]') do |r, acc|
|
134
|
-
acc.reject! {|x|
|
134
|
+
acc.reject! {|x| !(/\S/.match(x)} # ditch all-spaces values
|
135
135
|
acc.uniq!
|
136
136
|
end
|
data/lib/traject/command_line.rb
CHANGED
@@ -202,8 +202,8 @@ module Traject
|
|
202
202
|
|
203
203
|
# `-s key=value` command line
|
204
204
|
(options[:setting] || []).each do |setting_pair|
|
205
|
-
if
|
206
|
-
key, value =
|
205
|
+
if m = /\A([^=]+)\=(.*)\Z/.match(setting_pair)
|
206
|
+
key, value = m[1], m[2]
|
207
207
|
settings[key] = value
|
208
208
|
else
|
209
209
|
self.console.puts "Unrecognized setting argument '#{setting_pair}':"
|
@@ -103,7 +103,11 @@ class Traject::Indexer
|
|
103
103
|
def inspect
|
104
104
|
# Keep any key ending in password out of the inspect
|
105
105
|
self.inject({}) do |hash, (key, value)|
|
106
|
-
|
106
|
+
if /password\Z/.match(key)
|
107
|
+
hash[key] = "[hidden]"
|
108
|
+
else
|
109
|
+
hash[key] = value
|
110
|
+
end
|
107
111
|
hash
|
108
112
|
end.inspect
|
109
113
|
end
|
@@ -233,7 +233,7 @@ module Traject::Macros
|
|
233
233
|
str = str.sub(/ *[ ,\/;:] *\Z/, '')
|
234
234
|
|
235
235
|
# trailing period if it is preceded by at least three letters (possibly preceded and followed by whitespace)
|
236
|
-
str = str.sub(/( *[[:word:]
|
236
|
+
str = str.sub(/( *[[:word:]]{3,})\. *\Z/, '\1')
|
237
237
|
|
238
238
|
# single square bracket characters if they are the start and/or end
|
239
239
|
# chars and there are no internal square brackets.
|
@@ -40,8 +40,8 @@ module Traject::Macros
|
|
40
40
|
/x
|
41
41
|
|
42
42
|
def self.oclcnum_extract(num)
|
43
|
-
if OCLCPAT.match(num)
|
44
|
-
return
|
43
|
+
if m = OCLCPAT.match(num)
|
44
|
+
return m[1]
|
45
45
|
else
|
46
46
|
return nil
|
47
47
|
end
|
@@ -369,8 +369,8 @@ module Traject::Macros
|
|
369
369
|
v260c = MarcExtractor.cached("260c", :separator => nil).extract(record).first
|
370
370
|
# just try to take the first four digits out of there, we're not going to try
|
371
371
|
# anything crazy.
|
372
|
-
if
|
373
|
-
found_date =
|
372
|
+
if m = /(\d{4})/.match(v260c)
|
373
|
+
found_date = m[1].to_i
|
374
374
|
end
|
375
375
|
end
|
376
376
|
|
@@ -408,7 +408,7 @@ module Traject::Macros
|
|
408
408
|
candidates = extractor.extract(record)
|
409
409
|
|
410
410
|
candidates.reject! do |candidate|
|
411
|
-
!(candidate
|
411
|
+
!(lcc_regex.match candidate)
|
412
412
|
end
|
413
413
|
|
414
414
|
accumulator.concat translation_map.translate_array!(candidates.collect {|a| a.lstrip.slice(0, 1)}).uniq
|
@@ -501,10 +501,11 @@ module Traject::Macros
|
|
501
501
|
end)
|
502
502
|
|
503
503
|
# weird ones
|
504
|
+
special_fields_regex = /\A\s*.+,\s+(ca.\s+)?\d\d\d\d?(-\d\d\d\d?)?( B\.C\.)?[.,; ]*\Z/
|
504
505
|
extractor_special_fields.each_matching_line(record) do |field, spec, extractor|
|
505
506
|
field.subfields.each do |sf|
|
506
507
|
next unless sf.code == 'y'
|
507
|
-
if sf.value
|
508
|
+
if special_fields_regex.match(sf.value)
|
508
509
|
# it's our pattern, add the $a in please
|
509
510
|
accumulator << "#{field['a']}#{separator}#{sf.value.sub(/\. *\Z/, '')}"
|
510
511
|
else
|
@@ -562,7 +563,7 @@ module Traject::Macros
|
|
562
563
|
|
563
564
|
marc_field.subfields.each_with_index do |sf, i|
|
564
565
|
# ignore non-alphabetic, like numeric control subfields
|
565
|
-
next unless
|
566
|
+
next unless /\A[a-z]\Z/.match(sf.code)
|
566
567
|
|
567
568
|
prefix = if subd_prefix_codes.include? sf.code
|
568
569
|
subd_separator
|
@@ -105,7 +105,8 @@ module Traject
|
|
105
105
|
def proceeding?
|
106
106
|
@proceeding_q ||= begin
|
107
107
|
! record.find do |field|
|
108
|
-
field.tag.slice(0) == '6' &&
|
108
|
+
field.tag.slice(0) == '6' &&
|
109
|
+
field.subfields.find {|sf| sf.code == "v" && /^\s*(C|c)ongresses\.?\s*$/.match(sf.value) }
|
109
110
|
end.nil?
|
110
111
|
end
|
111
112
|
end
|
data/lib/traject/mock_reader.rb
CHANGED
@@ -40,15 +40,16 @@ module Traject
|
|
40
40
|
|
41
41
|
this_file_iter = file_io.each_line
|
42
42
|
|
43
|
+
|
43
44
|
while true
|
44
45
|
line = this_file_iter.next
|
45
|
-
break if
|
46
|
+
break if /^\_\_END\_\_/.match line
|
46
47
|
end
|
47
48
|
|
48
49
|
begin
|
49
50
|
while true
|
50
51
|
json = this_file_iter.next
|
51
|
-
next unless
|
52
|
+
next unless /\S/.match json
|
52
53
|
records << MARC::Record.new_from_hash(JSON.parse(json))
|
53
54
|
end
|
54
55
|
rescue StopIteration
|
data/lib/traject/ndj_reader.rb
CHANGED
@@ -12,7 +12,7 @@ class Traject::NDJReader
|
|
12
12
|
def initialize(input_stream, settings)
|
13
13
|
@settings = settings
|
14
14
|
@input_stream = input_stream
|
15
|
-
if @settings['command_line.filename']
|
15
|
+
if /\.gz\Z/.match(@settings['command_line.filename'])
|
16
16
|
@input_stream = Zlib::GzipReader.new(@input_stream, :external_encoding => "UTF-8")
|
17
17
|
end
|
18
18
|
end
|
@@ -236,7 +236,7 @@ class Traject::SolrJsonWriter
|
|
236
236
|
|
237
237
|
# If we've got a solr.update_url, make sure it's ok
|
238
238
|
def check_solr_update_url(url)
|
239
|
-
unless
|
239
|
+
unless /^#{URI::regexp}$/.match(url)
|
240
240
|
raise ArgumentError.new("#{self.class.name} setting `solr.update_url` doesn't look like a URL: `#{url}`")
|
241
241
|
end
|
242
242
|
url
|
@@ -249,7 +249,7 @@ class Traject::SolrJsonWriter
|
|
249
249
|
end
|
250
250
|
|
251
251
|
# Not a URL? Bail
|
252
|
-
unless
|
252
|
+
unless /^#{URI::regexp}$/.match(url)
|
253
253
|
raise ArgumentError.new("#{self.class.name} setting `solr.url` doesn't look like a URL: `#{url}`")
|
254
254
|
end
|
255
255
|
|
data/lib/traject/util.rb
CHANGED
@@ -5,14 +5,14 @@ module Traject
|
|
5
5
|
def self.exception_to_log_message(e)
|
6
6
|
indent = " "
|
7
7
|
|
8
|
-
msg
|
8
|
+
msg = indent + "Exception: " + e.class.name + ": " + e.message + "\n"
|
9
9
|
msg += indent + e.backtrace.first + "\n"
|
10
10
|
|
11
|
-
if (e.respond_to?(:getRootCause) && e.getRootCause && e != e.getRootCause
|
11
|
+
if (e.respond_to?(:getRootCause) && e.getRootCause && e != e.getRootCause)
|
12
12
|
caused_by = e.getRootCause
|
13
|
-
msg
|
14
|
-
msg
|
15
|
-
msg
|
13
|
+
msg += indent + "Caused by\n"
|
14
|
+
msg += indent + caused_by.class.name + ": " + caused_by.message + "\n"
|
15
|
+
msg += indent + caused_by.backtrace.first + "\n"
|
16
16
|
end
|
17
17
|
|
18
18
|
return msg
|
@@ -37,8 +37,8 @@ module Traject
|
|
37
37
|
# For a SyntaxError, we really need to grep it from the
|
38
38
|
# exception message, it really appears to be nowhere else. Ugh.
|
39
39
|
if exception.kind_of? SyntaxError
|
40
|
-
if
|
41
|
-
return
|
40
|
+
if m = /:(\d+):/.match(exception.message)
|
41
|
+
return m[1].to_i
|
42
42
|
end
|
43
43
|
end
|
44
44
|
|
@@ -48,9 +48,9 @@ module Traject
|
|
48
48
|
# exception.backtrace_locations exists in MRI 2.1+, which makes
|
49
49
|
# our task a lot easier. But not yet in JRuby 1.7.x, so we got to
|
50
50
|
# handle the old way of having to parse the strings in backtrace too.
|
51
|
-
if (
|
52
|
-
|
53
|
-
|
51
|
+
if (exception.respond_to?(:backtrace_locations) &&
|
52
|
+
exception.backtrace_locations &&
|
53
|
+
exception.backtrace_locations.length > 0)
|
54
54
|
location = exception.backtrace_locations.find do |bt|
|
55
55
|
bt.path == file_path
|
56
56
|
end
|
@@ -58,8 +58,10 @@ module Traject
|
|
58
58
|
else # have to parse string backtrace
|
59
59
|
exception.backtrace.each do |line|
|
60
60
|
if line.start_with?(file_path)
|
61
|
-
|
62
|
-
|
61
|
+
if m = /\A.*\:(\d+)\:in/.match(line)
|
62
|
+
return m[1].to_i
|
63
|
+
break
|
64
|
+
end
|
63
65
|
end
|
64
66
|
end
|
65
67
|
# if we got here, we have nothing
|
@@ -75,14 +77,14 @@ module Traject
|
|
75
77
|
# returned array will actually be of Thread::Backtrace::Location elements.
|
76
78
|
def self.backtrace_from_config(file_path, exception)
|
77
79
|
filtered_trace = []
|
78
|
-
found
|
80
|
+
found = false
|
79
81
|
|
80
82
|
# MRI 2.1+ has exception.backtrace_locations which makes
|
81
83
|
# this a lot easier, but JRuby 1.7.x doesn't yet, so we
|
82
84
|
# need to do it both ways.
|
83
|
-
if (
|
84
|
-
|
85
|
-
|
85
|
+
if (exception.respond_to?(:backtrace_locations) &&
|
86
|
+
exception.backtrace_locations &&
|
87
|
+
exception.backtrace_locations.length > 0)
|
86
88
|
|
87
89
|
exception.backtrace_locations.each do |location|
|
88
90
|
filtered_trace << location
|
@@ -100,7 +102,6 @@ module Traject
|
|
100
102
|
end
|
101
103
|
|
102
104
|
|
103
|
-
|
104
105
|
# Ruby stdlib queue lacks a 'drain' function, we write one.
|
105
106
|
#
|
106
107
|
# Removes everything currently in the ruby stdlib queue, and returns
|
data/lib/traject/version.rb
CHANGED
data/test/indexer/{macros_marc21_semantics_test.rb → macros/macros_marc21_semantics_test.rb}
RENAMED
@@ -169,7 +169,7 @@ describe "Traject::Macros::Marc21Semantics" do
|
|
169
169
|
# there are way too many edge cases for us to test em all, but we'll test some of em.
|
170
170
|
|
171
171
|
it "works when there's no date information" do
|
172
|
-
|
172
|
+
assert_nil Marc21Semantics.publication_date(empty_record)
|
173
173
|
end
|
174
174
|
|
175
175
|
it "uses macro correctly with no date info" do
|
@@ -189,7 +189,7 @@ describe "Traject::Macros::Marc21Semantics" do
|
|
189
189
|
end
|
190
190
|
it "returns nil when the records really got nothing" do
|
191
191
|
@record = MARC::Reader.new(support_file_path "emptyish_record.marc").to_a.first
|
192
|
-
|
192
|
+
assert_nil Marc21Semantics.publication_date(@record)
|
193
193
|
end
|
194
194
|
it "estimates with a single 'u'" do
|
195
195
|
@record = MARC::Reader.new(support_file_path "date_with_u.marc").to_a.first
|
@@ -1,11 +1,20 @@
|
|
1
|
-
# Encoding: UTF-8
|
2
|
-
|
3
1
|
require 'test_helper'
|
2
|
+
|
3
|
+
require 'traject/indexer'
|
4
4
|
require 'traject/macros/marc21'
|
5
5
|
|
6
|
+
require 'json'
|
7
|
+
require 'marc'
|
8
|
+
|
6
9
|
include Traject::Macros::Marc21
|
7
10
|
|
11
|
+
|
8
12
|
describe "The extract_all_marc_values macro" do
|
13
|
+
before do
|
14
|
+
@indexer = Traject::Indexer.new
|
15
|
+
@record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
|
16
|
+
end
|
17
|
+
|
9
18
|
|
10
19
|
it "is fine with no arguments" do
|
11
20
|
assert(extract_all_marc_values)
|
@@ -20,4 +29,22 @@ describe "The extract_all_marc_values macro" do
|
|
20
29
|
extract_all_marc_values(from: 100, to: '999')
|
21
30
|
end
|
22
31
|
end
|
32
|
+
|
33
|
+
it "#extract_all_marc_values" do
|
34
|
+
@indexer.instance_eval do
|
35
|
+
to_field "text", extract_all_marc_values
|
36
|
+
end
|
37
|
+
output = @indexer.map_record(@record)
|
38
|
+
|
39
|
+
assert_length 13, output["text"]
|
40
|
+
end
|
41
|
+
|
42
|
+
|
23
43
|
end
|
44
|
+
|
45
|
+
|
46
|
+
|
47
|
+
|
48
|
+
|
49
|
+
|
50
|
+
|
@@ -0,0 +1,125 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
require 'traject/indexer'
|
4
|
+
require 'traject/macros/marc21'
|
5
|
+
|
6
|
+
require 'json'
|
7
|
+
require 'marc'
|
8
|
+
|
9
|
+
|
10
|
+
include Traject::Macros::Marc21
|
11
|
+
|
12
|
+
describe "extract_marc" do
|
13
|
+
before do
|
14
|
+
@indexer = Traject::Indexer.new
|
15
|
+
@record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
|
16
|
+
end
|
17
|
+
|
18
|
+
|
19
|
+
it "extracts marc" do
|
20
|
+
@indexer.instance_eval do
|
21
|
+
to_field "title", extract_marc("245ab")
|
22
|
+
end
|
23
|
+
|
24
|
+
output = @indexer.map_record(@record)
|
25
|
+
|
26
|
+
assert_equal ["Manufacturing consent : the political economy of the mass media /"], output["title"]
|
27
|
+
assert_equal({}, @indexer.map_record(empty_record))
|
28
|
+
|
29
|
+
end
|
30
|
+
|
31
|
+
it "respects :first=>true option" do
|
32
|
+
@indexer.instance_eval do
|
33
|
+
to_field "other_id", extract_marc("035a", :first => true)
|
34
|
+
end
|
35
|
+
|
36
|
+
output = @indexer.map_record(@record)
|
37
|
+
|
38
|
+
assert_length 1, output["other_id"]
|
39
|
+
|
40
|
+
end
|
41
|
+
|
42
|
+
it "trims punctuation with :trim_punctuation => true" do
|
43
|
+
@indexer.instance_eval do
|
44
|
+
to_field "title", extract_marc("245ab", :trim_punctuation => true)
|
45
|
+
end
|
46
|
+
|
47
|
+
output = @indexer.map_record(@record)
|
48
|
+
|
49
|
+
assert_equal ["Manufacturing consent : the political economy of the mass media"], output["title"]
|
50
|
+
assert_equal({}, @indexer.map_record(empty_record))
|
51
|
+
|
52
|
+
end
|
53
|
+
|
54
|
+
it "respects :default option" do
|
55
|
+
@indexer.instance_eval do
|
56
|
+
to_field "only_default", extract_marc("9999", :default => "DEFAULT VALUE")
|
57
|
+
end
|
58
|
+
output = @indexer.map_record(@record)
|
59
|
+
|
60
|
+
assert_equal ["DEFAULT VALUE"], output["only_default"]
|
61
|
+
end
|
62
|
+
|
63
|
+
it "de-duplicates by default, respects :allow_duplicates" do
|
64
|
+
# Add a second 008
|
65
|
+
f = @record.fields('008').first
|
66
|
+
@record.append(f)
|
67
|
+
|
68
|
+
@indexer.instance_eval do
|
69
|
+
to_field "lang1", extract_marc('008[35-37]')
|
70
|
+
to_field "lang2", extract_marc('008[35-37]', :allow_duplicates => true)
|
71
|
+
end
|
72
|
+
|
73
|
+
output = @indexer.map_record(@record)
|
74
|
+
assert_equal ["eng"], output['lang1']
|
75
|
+
assert_equal ["eng", "eng"], output['lang2']
|
76
|
+
assert_equal({}, @indexer.map_record(empty_record))
|
77
|
+
end
|
78
|
+
|
79
|
+
it "fails on an extra/misspelled argument to extract_marc" do
|
80
|
+
assert_raises(RuntimeError) do
|
81
|
+
@indexer.instance_eval do
|
82
|
+
to_field "foo", extract_marc("9999", :misspelled => "Who cares")
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
|
88
|
+
it "throws away nil values unless settings['allow_nil_values]'" do
|
89
|
+
@indexer.instance_eval do
|
90
|
+
to_field 'default_nil', extract_marc('9999', :default => nil)
|
91
|
+
end
|
92
|
+
output = @indexer.map_record(@record)
|
93
|
+
assert_nil output['default_nil']
|
94
|
+
end
|
95
|
+
|
96
|
+
|
97
|
+
it "allows nil values if settings['allow_nil_values]'" do
|
98
|
+
@indexer.settings do |s|
|
99
|
+
s['allow_nil_values'] = true
|
100
|
+
end
|
101
|
+
@indexer.instance_eval do
|
102
|
+
to_field 'default_nil', extract_marc('9999', :default => nil)
|
103
|
+
end
|
104
|
+
output = @indexer.map_record(@record)
|
105
|
+
assert_equal [nil], output['default_nil']
|
106
|
+
end
|
107
|
+
|
108
|
+
|
109
|
+
|
110
|
+
|
111
|
+
it "uses :translation_map" do
|
112
|
+
@indexer.instance_eval do
|
113
|
+
to_field "cataloging_agency", extract_marc("040a", :separator => nil, :translation_map => "marc_040a_translate_test")
|
114
|
+
end
|
115
|
+
output = @indexer.map_record(@record)
|
116
|
+
|
117
|
+
assert_equal ["Library of Congress"], output["cataloging_agency"]
|
118
|
+
end
|
119
|
+
|
120
|
+
it "supports #extract_marc_from module method" do
|
121
|
+
output_arr = ::Traject::Macros::Marc21.extract_marc_from(@record, "245ab", :trim_punctuation => true)
|
122
|
+
assert_equal ["Manufacturing consent : the political economy of the mass media"], output_arr
|
123
|
+
end
|
124
|
+
|
125
|
+
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
require 'traject/indexer'
|
4
|
+
require 'traject/macros/marc21'
|
5
|
+
require 'json'
|
6
|
+
require 'marc'
|
7
|
+
|
8
|
+
include Traject::Macros::Marc21
|
9
|
+
|
10
|
+
|
11
|
+
describe "serialized_marc" do
|
12
|
+
before do
|
13
|
+
@indexer = Traject::Indexer.new
|
14
|
+
@record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
|
15
|
+
end
|
16
|
+
|
17
|
+
it "serializes xml" do
|
18
|
+
@indexer.instance_eval do
|
19
|
+
to_field "marc_record", serialized_marc(:format => "xml")
|
20
|
+
end
|
21
|
+
output = @indexer.map_record(@record)
|
22
|
+
|
23
|
+
assert_length 1, output["marc_record"]
|
24
|
+
assert_kind_of String, output["marc_record"].first
|
25
|
+
roundtrip_record = MARC::XMLReader.new(StringIO.new(output["marc_record"].first)).first
|
26
|
+
assert_equal @record, roundtrip_record
|
27
|
+
end
|
28
|
+
|
29
|
+
it "serializes binary UUEncoded" do
|
30
|
+
@indexer.instance_eval do
|
31
|
+
to_field "marc_record", serialized_marc(:format => "binary")
|
32
|
+
end
|
33
|
+
output = @indexer.map_record(@record)
|
34
|
+
|
35
|
+
assert_length 1, output["marc_record"]
|
36
|
+
assert_kind_of String, output["marc_record"].first
|
37
|
+
|
38
|
+
decoded = Base64.decode64(output["marc_record"].first)
|
39
|
+
|
40
|
+
# just check the marc header for now
|
41
|
+
assert_start_with "02067cam a2200469", decoded
|
42
|
+
end
|
43
|
+
|
44
|
+
it "serializes binary raw" do
|
45
|
+
@indexer.instance_eval do
|
46
|
+
to_field "marc_record", serialized_marc(:format => "binary", :binary_escape => false)
|
47
|
+
end
|
48
|
+
output = @indexer.map_record(@record)
|
49
|
+
|
50
|
+
assert_length 1, output["marc_record"]
|
51
|
+
assert_kind_of String, output["marc_record"].first
|
52
|
+
|
53
|
+
# just check the marc header for now
|
54
|
+
assert_start_with "02067cam a2200469", output["marc_record"].first
|
55
|
+
end
|
56
|
+
|
57
|
+
it "serializes json" do
|
58
|
+
@indexer.instance_eval do
|
59
|
+
to_field "marc_record", serialized_marc(:format => "json")
|
60
|
+
end
|
61
|
+
output = @indexer.map_record(@record)
|
62
|
+
|
63
|
+
assert_length 1, output["marc_record"]
|
64
|
+
|
65
|
+
# okay, let's actually deserialize it, why not
|
66
|
+
|
67
|
+
hash = JSON.parse(output["marc_record"].first)
|
68
|
+
|
69
|
+
deserialized = MARC::Record.new_from_hash(hash)
|
70
|
+
|
71
|
+
assert_equal @record, deserialized
|
72
|
+
end
|
73
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'test_helper'
|
3
|
+
|
4
|
+
require 'traject/indexer'
|
5
|
+
require 'traject/macros/marc21'
|
6
|
+
|
7
|
+
|
8
|
+
include Traject::Macros::Marc21
|
9
|
+
|
10
|
+
describe "trim_punctuation" do
|
11
|
+
|
12
|
+
# TODO: test coverage for trim_punctuation
|
13
|
+
# trim_punctuation isn't super-complicated code, and yet we've found a few bugs
|
14
|
+
# in it already. Needs more test coverage
|
15
|
+
it "Works as expected" do
|
16
|
+
assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("one two three")
|
17
|
+
|
18
|
+
assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("one two three,")
|
19
|
+
assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("one two three/")
|
20
|
+
assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("one two three;")
|
21
|
+
assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("one two three:")
|
22
|
+
assert_equal "one two three .", Traject::Macros::Marc21.trim_punctuation("one two three .")
|
23
|
+
assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("one two three.")
|
24
|
+
assert_equal "one two three...", Traject::Macros::Marc21.trim_punctuation("one two three...")
|
25
|
+
assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation(" one two three.")
|
26
|
+
|
27
|
+
assert_equal "one two [three]", Traject::Macros::Marc21.trim_punctuation("one two [three]")
|
28
|
+
assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("one two three]")
|
29
|
+
assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("[one two three")
|
30
|
+
assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("[one two three]")
|
31
|
+
|
32
|
+
# This one was a bug before
|
33
|
+
assert_equal "Feminism and art", Traject::Macros::Marc21.trim_punctuation("Feminism and art.")
|
34
|
+
assert_equal "Le réve", Traject::Macros::Marc21.trim_punctuation("Le réve.")
|
35
|
+
|
36
|
+
# This one was a bug on the bug
|
37
|
+
assert_equal "Bill Dueber, Jr.", Traject::Macros::Marc21.trim_punctuation("Bill Dueber, Jr.")
|
38
|
+
end
|
39
|
+
end
|
@@ -192,7 +192,7 @@ describe "Traject::Indexer#map_record" do
|
|
192
192
|
end
|
193
193
|
|
194
194
|
@indexer.to_field('radical') do |rec, acc, context|
|
195
|
-
context.skip!("Chomsky!") if rec['245'].to_s
|
195
|
+
context.skip!("Chomsky!") if rec['245'].to_s =~ /Chomsky/
|
196
196
|
end
|
197
197
|
|
198
198
|
@indexer.to_field('afterSkip') do |rec, acc|
|
data/test/indexer/writer_test.rb
CHANGED
@@ -2,21 +2,28 @@ require 'test_helper'
|
|
2
2
|
require 'traject/yaml_writer'
|
3
3
|
|
4
4
|
describe "The writer on Traject::Indexer" do
|
5
|
-
let(:indexer) { Traject::Indexer.new("solr.url" => "http://
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
5
|
+
let(:indexer) { Traject::Indexer.new("solr.url" => "http://localhost.com") }
|
6
|
+
|
7
|
+
# TODO: fix default writer test
|
8
|
+
# Fails in the absence of a configured
|
9
|
+
# network interface.
|
10
|
+
describe "default writer from index" do
|
11
|
+
it "has a default" do
|
12
|
+
# assert_instance_of Traject::SolrJsonWriter, indexer.writer
|
13
|
+
# assert_equal Traject::SolrJsonWriter, indexer.writer_class
|
14
|
+
skip "Fails in the absence of a configured network interface."
|
15
|
+
end
|
10
16
|
end
|
11
17
|
|
12
|
-
|
18
|
+
|
19
|
+
describe "when the writer is set in config" do
|
13
20
|
let(:writer) { Traject::YamlWriter.new({}) }
|
14
21
|
|
15
22
|
let(:indexer) { Traject::Indexer.new(
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
23
|
+
"solr.url" => "http://example.com",
|
24
|
+
"writer_class" => 'Traject::SolrJsonWriter',
|
25
|
+
"writer" => writer
|
26
|
+
) }
|
20
27
|
|
21
28
|
it "uses writer from config" do
|
22
29
|
assert_equal writer, indexer.writer
|
@@ -120,7 +120,9 @@ to_field "discipline_facet", marc_lcc_to_broad_category(:default => nil) do |re
|
|
120
120
|
if call_type == "sudoc"
|
121
121
|
# we choose to call it:
|
122
122
|
accumulator << "Government Publication"
|
123
|
-
elsif call_type.nil? ||
|
123
|
+
elsif call_type.nil? ||
|
124
|
+
call_type == "lc" ||
|
125
|
+
Traject::Macros::Marc21Semantics::LCC_REGEX.match(field['a'])
|
124
126
|
# run it through the map
|
125
127
|
s = field['a']
|
126
128
|
s = s.slice(0, 1) if s
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: traject
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.3.
|
4
|
+
version: 2.3.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Rochkind
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2017-01-25 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: concurrent-ruby
|
@@ -254,15 +254,17 @@ files:
|
|
254
254
|
- test/indexer/context_test.rb
|
255
255
|
- test/indexer/each_record_test.rb
|
256
256
|
- test/indexer/load_config_file_test.rb
|
257
|
-
- test/indexer/macros_marc21_semantics_test.rb
|
258
|
-
- test/indexer/
|
259
|
-
- test/indexer/
|
257
|
+
- test/indexer/macros/macros_marc21_semantics_test.rb
|
258
|
+
- test/indexer/macros/marc21/extract_all_marc_values_test.rb
|
259
|
+
- test/indexer/macros/marc21/extract_marc_test.rb
|
260
|
+
- test/indexer/macros/marc21/serialize_marc_test.rb
|
261
|
+
- test/indexer/macros/marc21/trim_punctuation_test.rb
|
262
|
+
- test/indexer/macros/to_field_test.rb
|
260
263
|
- test/indexer/map_record_test.rb
|
261
264
|
- test/indexer/read_write_test.rb
|
262
265
|
- test/indexer/settings_test.rb
|
263
266
|
- test/indexer/to_field_test.rb
|
264
267
|
- test/indexer/writer_test.rb
|
265
|
-
- test/marc21_macros_test.rb
|
266
268
|
- test/marc_extractor_test.rb
|
267
269
|
- test/marc_format_classifier_test.rb
|
268
270
|
- test/marc_reader_test.rb
|
@@ -329,7 +331,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
329
331
|
version: '0'
|
330
332
|
requirements: []
|
331
333
|
rubyforge_project:
|
332
|
-
rubygems_version: 2.
|
334
|
+
rubygems_version: 2.6.8
|
333
335
|
signing_key:
|
334
336
|
specification_version: 4
|
335
337
|
summary: Index MARC to Solr; or generally process source records to hash-like structures
|
@@ -339,15 +341,17 @@ test_files:
|
|
339
341
|
- test/indexer/context_test.rb
|
340
342
|
- test/indexer/each_record_test.rb
|
341
343
|
- test/indexer/load_config_file_test.rb
|
342
|
-
- test/indexer/macros_marc21_semantics_test.rb
|
343
|
-
- test/indexer/
|
344
|
-
- test/indexer/
|
344
|
+
- test/indexer/macros/macros_marc21_semantics_test.rb
|
345
|
+
- test/indexer/macros/marc21/extract_all_marc_values_test.rb
|
346
|
+
- test/indexer/macros/marc21/extract_marc_test.rb
|
347
|
+
- test/indexer/macros/marc21/serialize_marc_test.rb
|
348
|
+
- test/indexer/macros/marc21/trim_punctuation_test.rb
|
349
|
+
- test/indexer/macros/to_field_test.rb
|
345
350
|
- test/indexer/map_record_test.rb
|
346
351
|
- test/indexer/read_write_test.rb
|
347
352
|
- test/indexer/settings_test.rb
|
348
353
|
- test/indexer/to_field_test.rb
|
349
354
|
- test/indexer/writer_test.rb
|
350
|
-
- test/marc21_macros_test.rb
|
351
355
|
- test/marc_extractor_test.rb
|
352
356
|
- test/marc_format_classifier_test.rb
|
353
357
|
- test/marc_reader_test.rb
|
@@ -1,219 +0,0 @@
|
|
1
|
-
require 'test_helper'
|
2
|
-
|
3
|
-
require 'traject/indexer'
|
4
|
-
require 'traject/macros/marc21'
|
5
|
-
|
6
|
-
require 'json'
|
7
|
-
require 'marc'
|
8
|
-
|
9
|
-
# See also marc_extractor_test.rb for more detailed tests on marc extraction,
|
10
|
-
# this is just a basic test to make sure our macro works passing through to there
|
11
|
-
# and other options.
|
12
|
-
describe "Traject::Macros::Marc21" do
|
13
|
-
Marc21 = Traject::Macros::Marc21 # shortcut
|
14
|
-
|
15
|
-
before do
|
16
|
-
@indexer = Traject::Indexer.new
|
17
|
-
@record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
|
18
|
-
end
|
19
|
-
|
20
|
-
describe "extract_marc" do
|
21
|
-
it "extracts marc" do
|
22
|
-
@indexer.instance_eval do
|
23
|
-
to_field "title", extract_marc("245ab")
|
24
|
-
end
|
25
|
-
|
26
|
-
output = @indexer.map_record(@record)
|
27
|
-
|
28
|
-
assert_equal ["Manufacturing consent : the political economy of the mass media /"], output["title"]
|
29
|
-
assert_equal({}, @indexer.map_record(empty_record))
|
30
|
-
|
31
|
-
end
|
32
|
-
|
33
|
-
it "respects :first=>true option" do
|
34
|
-
@indexer.instance_eval do
|
35
|
-
to_field "other_id", extract_marc("035a", :first => true)
|
36
|
-
end
|
37
|
-
|
38
|
-
output = @indexer.map_record(@record)
|
39
|
-
|
40
|
-
assert_length 1, output["other_id"]
|
41
|
-
|
42
|
-
end
|
43
|
-
|
44
|
-
it "trims punctuation with :trim_punctuation => true" do
|
45
|
-
@indexer.instance_eval do
|
46
|
-
to_field "title", extract_marc("245ab", :trim_punctuation => true)
|
47
|
-
end
|
48
|
-
|
49
|
-
output = @indexer.map_record(@record)
|
50
|
-
|
51
|
-
assert_equal ["Manufacturing consent : the political economy of the mass media"], output["title"]
|
52
|
-
assert_equal({}, @indexer.map_record(empty_record))
|
53
|
-
|
54
|
-
end
|
55
|
-
|
56
|
-
it "respects :default option" do
|
57
|
-
@indexer.instance_eval do
|
58
|
-
to_field "only_default", extract_marc("9999", :default => "DEFAULT VALUE")
|
59
|
-
end
|
60
|
-
output = @indexer.map_record(@record)
|
61
|
-
|
62
|
-
assert_equal ["DEFAULT VALUE"], output["only_default"]
|
63
|
-
end
|
64
|
-
|
65
|
-
it "de-duplicates by default, respects :allow_duplicates" do
|
66
|
-
# Add a second 008
|
67
|
-
f = @record.fields('008').first
|
68
|
-
@record.append(f)
|
69
|
-
|
70
|
-
@indexer.instance_eval do
|
71
|
-
to_field "lang1", extract_marc('008[35-37]')
|
72
|
-
to_field "lang2", extract_marc('008[35-37]', :allow_duplicates => true)
|
73
|
-
end
|
74
|
-
|
75
|
-
output = @indexer.map_record(@record)
|
76
|
-
assert_equal ["eng"], output['lang1']
|
77
|
-
assert_equal ["eng", "eng"], output['lang2']
|
78
|
-
assert_equal({}, @indexer.map_record(empty_record))
|
79
|
-
end
|
80
|
-
|
81
|
-
it "fails on an extra/misspelled argument to extract_marc" do
|
82
|
-
assert_raises(RuntimeError) do
|
83
|
-
@indexer.instance_eval do
|
84
|
-
to_field "foo", extract_marc("9999", :misspelled => "Who cares")
|
85
|
-
end
|
86
|
-
end
|
87
|
-
end
|
88
|
-
|
89
|
-
|
90
|
-
it "throws away nil values unless settings['allow_nil_values]'" do
|
91
|
-
@indexer.instance_eval do
|
92
|
-
to_field 'default_nil', extract_marc('9999', :default => nil)
|
93
|
-
end
|
94
|
-
output = @indexer.map_record(@record)
|
95
|
-
assert_nil output['default_nil']
|
96
|
-
end
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
it "allows nil values if settings['allow_nil_values]'" do
|
101
|
-
@indexer.settings do |s|
|
102
|
-
s['allow_nil_values'] = true
|
103
|
-
end
|
104
|
-
@indexer.instance_eval do
|
105
|
-
to_field 'default_nil', extract_marc('9999', :default => nil)
|
106
|
-
end
|
107
|
-
output = @indexer.map_record(@record)
|
108
|
-
assert_equal [nil], output['default_nil']
|
109
|
-
end
|
110
|
-
|
111
|
-
|
112
|
-
it "Marc21::trim_punctuation class method" do
|
113
|
-
assert_equal "one two three", Marc21.trim_punctuation("one two three")
|
114
|
-
|
115
|
-
assert_equal "one two three", Marc21.trim_punctuation("one two three,")
|
116
|
-
assert_equal "one two three", Marc21.trim_punctuation("one two three/")
|
117
|
-
assert_equal "one two three", Marc21.trim_punctuation("one two three;")
|
118
|
-
assert_equal "one two three", Marc21.trim_punctuation("one two three:")
|
119
|
-
assert_equal "one two three .", Marc21.trim_punctuation("one two three .")
|
120
|
-
assert_equal "one two three", Marc21.trim_punctuation("one two three.")
|
121
|
-
assert_equal "one two three...", Marc21.trim_punctuation("one two three...")
|
122
|
-
assert_equal "one two three", Marc21.trim_punctuation(" one two three.")
|
123
|
-
|
124
|
-
assert_equal "one two [three]", Marc21.trim_punctuation("one two [three]")
|
125
|
-
assert_equal "one two three", Marc21.trim_punctuation("one two three]")
|
126
|
-
assert_equal "one two three", Marc21.trim_punctuation("[one two three")
|
127
|
-
assert_equal "one two three", Marc21.trim_punctuation("[one two three]")
|
128
|
-
|
129
|
-
# This one was a bug before
|
130
|
-
assert_equal "Feminism and art", Marc21.trim_punctuation("Feminism and art.")
|
131
|
-
|
132
|
-
assert_equal "Le réve", Marc21.trim_punctuation("Le réve.") # this assertion currently fails
|
133
|
-
end
|
134
|
-
|
135
|
-
it "uses :translation_map" do
|
136
|
-
@indexer.instance_eval do
|
137
|
-
to_field "cataloging_agency", extract_marc("040a", :separator => nil, :translation_map => "marc_040a_translate_test")
|
138
|
-
end
|
139
|
-
output = @indexer.map_record(@record)
|
140
|
-
|
141
|
-
assert_equal ["Library of Congress"], output["cataloging_agency"]
|
142
|
-
end
|
143
|
-
end
|
144
|
-
|
145
|
-
it "supports #extract_marc_from module method" do
|
146
|
-
output_arr = ::Traject::Macros::Marc21.extract_marc_from(@record, "245ab", :trim_punctuation => true)
|
147
|
-
assert_equal ["Manufacturing consent : the political economy of the mass media"], output_arr
|
148
|
-
end
|
149
|
-
|
150
|
-
describe "serialized_marc" do
|
151
|
-
it "serializes xml" do
|
152
|
-
@indexer.instance_eval do
|
153
|
-
to_field "marc_record", serialized_marc(:format => "xml")
|
154
|
-
end
|
155
|
-
output = @indexer.map_record(@record)
|
156
|
-
|
157
|
-
assert_length 1, output["marc_record"]
|
158
|
-
assert_kind_of String, output["marc_record"].first
|
159
|
-
roundtrip_record = MARC::XMLReader.new(StringIO.new(output["marc_record"].first)).first
|
160
|
-
assert_equal @record, roundtrip_record
|
161
|
-
end
|
162
|
-
|
163
|
-
it "serializes binary UUEncoded" do
|
164
|
-
@indexer.instance_eval do
|
165
|
-
to_field "marc_record", serialized_marc(:format => "binary")
|
166
|
-
end
|
167
|
-
output = @indexer.map_record(@record)
|
168
|
-
|
169
|
-
assert_length 1, output["marc_record"]
|
170
|
-
assert_kind_of String, output["marc_record"].first
|
171
|
-
|
172
|
-
decoded = Base64.decode64(output["marc_record"].first)
|
173
|
-
|
174
|
-
# just check the marc header for now
|
175
|
-
assert_start_with "02067cam a2200469", decoded
|
176
|
-
end
|
177
|
-
|
178
|
-
it "serializes binary raw" do
|
179
|
-
@indexer.instance_eval do
|
180
|
-
to_field "marc_record", serialized_marc(:format => "binary", :binary_escape => false)
|
181
|
-
end
|
182
|
-
output = @indexer.map_record(@record)
|
183
|
-
|
184
|
-
assert_length 1, output["marc_record"]
|
185
|
-
assert_kind_of String, output["marc_record"].first
|
186
|
-
|
187
|
-
# just check the marc header for now
|
188
|
-
assert_start_with "02067cam a2200469", output["marc_record"].first
|
189
|
-
end
|
190
|
-
|
191
|
-
it "serializes json" do
|
192
|
-
@indexer.instance_eval do
|
193
|
-
to_field "marc_record", serialized_marc(:format => "json")
|
194
|
-
end
|
195
|
-
output = @indexer.map_record(@record)
|
196
|
-
|
197
|
-
assert_length 1, output["marc_record"]
|
198
|
-
|
199
|
-
# okay, let's actually deserialize it, why not
|
200
|
-
|
201
|
-
hash = JSON.parse(output["marc_record"].first)
|
202
|
-
|
203
|
-
deserialized = MARC::Record.new_from_hash(hash)
|
204
|
-
|
205
|
-
assert_equal @record, deserialized
|
206
|
-
end
|
207
|
-
end
|
208
|
-
|
209
|
-
it "#extract_all_marc_values" do
|
210
|
-
@indexer.instance_eval do
|
211
|
-
to_field "text", extract_all_marc_values
|
212
|
-
end
|
213
|
-
output = @indexer.map_record(@record)
|
214
|
-
|
215
|
-
assert_length 13, output["text"]
|
216
|
-
end
|
217
|
-
|
218
|
-
|
219
|
-
end
|