traject 2.3.2-java → 2.3.3-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +2 -2
- data/CHANGES.md +12 -2
- data/index_bench/common.rb +1 -1
- data/lib/traject/command_line.rb +2 -2
- data/lib/traject/indexer/settings.rb +5 -1
- data/lib/traject/macros/marc21.rb +1 -1
- data/lib/traject/macros/marc21_semantics.rb +8 -7
- data/lib/traject/macros/marc_format_classifier.rb +2 -1
- data/lib/traject/mock_reader.rb +3 -2
- data/lib/traject/ndj_reader.rb +1 -1
- data/lib/traject/solr_json_writer.rb +2 -2
- data/lib/traject/util.rb +18 -17
- data/lib/traject/version.rb +1 -1
- data/test/indexer/{macros_marc21_semantics_test.rb → macros/macros_marc21_semantics_test.rb} +2 -2
- data/test/{marc21_macros_test.rb → indexer/macros/marc21/extract_all_marc_values_test.rb} +29 -2
- data/test/indexer/macros/marc21/extract_marc_test.rb +125 -0
- data/test/indexer/macros/marc21/serialize_marc_test.rb +73 -0
- data/test/indexer/macros/marc21/trim_punctuation_test.rb +39 -0
- data/test/indexer/{macros_test.rb → macros/to_field_test.rb} +1 -1
- data/test/indexer/map_record_test.rb +1 -1
- data/test/indexer/to_field_test.rb +1 -1
- data/test/indexer/writer_test.rb +17 -10
- data/test/test_support/demo_config.rb +3 -1
- metadata +15 -11
- data/test/indexer/macros_marc21_test.rb +0 -219
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 203a69835854bde2665c53aafaea20ceb731f431
|
4
|
+
data.tar.gz: 38579221ab60f0db3a05bec41e367ff6e06c4153
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 26969d8a05a35dbf5fd2d59391f4701353af387c14641a99e43beb32d9ee07de8ed6b0a1709091f03cc1db474ac4a24203969fc2a114d11edacc7e2f0e91963e
|
7
|
+
data.tar.gz: 6129937d2a7b6958368487ed38174458740d476c9513cc7f497b81f882aca9f6b61e36034ddbe59bfa9dacef5575bf7b5123702dc1f5c22072255eeba65afad2
|
data/.travis.yml
CHANGED
data/CHANGES.md
CHANGED
@@ -1,12 +1,22 @@
|
|
1
1
|
# Changes
|
2
2
|
|
3
|
+
## 2.3.3
|
4
|
+
* Further squash use of capture-variables ('$1', etc.)
|
5
|
+
to try to work around the non-thread-safety of
|
6
|
+
regexp in ruby
|
7
|
+
* Fix a bug in trim_punctuation where trailing
|
8
|
+
periods were being eliminated even if there
|
9
|
+
was a short string before them (e.g., 'Jr.')
|
10
|
+
* Begin to reorganize tests, starting with
|
11
|
+
the Marc21 macros
|
12
|
+
|
3
13
|
## 2.3.2
|
4
14
|
* Change to `extract_marc` to work around a threadsafe problem in JRuby/MRI where
|
5
15
|
regexps were unsafely shared between threads. (@codeforkjeff)
|
6
16
|
* Make trim-punctuation safe for non-just-ASCII text (thanks to @dunn and @redlibrarian)
|
7
17
|
|
8
18
|
## 2.3.1
|
9
|
-
* Update README with more info
|
19
|
+
* Update README with more info about new nil-related options
|
10
20
|
|
11
21
|
## 2.3.0
|
12
22
|
* Allow nil values, empty fields, and deduplication
|
@@ -21,7 +31,7 @@
|
|
21
31
|
Set to `true` to pass empty fields on to the writer (with the value being an empty array)
|
22
32
|
|
23
33
|
## 2.2.1
|
24
|
-
* Had
|
34
|
+
* Had inadvertently broken use of arrays as extract_marc specifications. Fixed.
|
25
35
|
|
26
36
|
## 2.2.0
|
27
37
|
* Change DebugWriter to be more forgiving (and informative) about missing record-id fields
|
data/index_bench/common.rb
CHANGED
@@ -131,6 +131,6 @@ to_field "edition", extract_marc('250a')
|
|
131
131
|
|
132
132
|
to_field 'language', marc_languages("008[35-37]:041a:041d:041e:041j")
|
133
133
|
to_field 'language008', extract_marc('008[35-37]') do |r, acc|
|
134
|
-
acc.reject! {|x|
|
134
|
+
acc.reject! {|x| !(/\S/.match(x)} # ditch all-spaces values
|
135
135
|
acc.uniq!
|
136
136
|
end
|
data/lib/traject/command_line.rb
CHANGED
@@ -202,8 +202,8 @@ module Traject
|
|
202
202
|
|
203
203
|
# `-s key=value` command line
|
204
204
|
(options[:setting] || []).each do |setting_pair|
|
205
|
-
if
|
206
|
-
key, value =
|
205
|
+
if m = /\A([^=]+)\=(.*)\Z/.match(setting_pair)
|
206
|
+
key, value = m[1], m[2]
|
207
207
|
settings[key] = value
|
208
208
|
else
|
209
209
|
self.console.puts "Unrecognized setting argument '#{setting_pair}':"
|
@@ -103,7 +103,11 @@ class Traject::Indexer
|
|
103
103
|
def inspect
|
104
104
|
# Keep any key ending in password out of the inspect
|
105
105
|
self.inject({}) do |hash, (key, value)|
|
106
|
-
|
106
|
+
if /password\Z/.match(key)
|
107
|
+
hash[key] = "[hidden]"
|
108
|
+
else
|
109
|
+
hash[key] = value
|
110
|
+
end
|
107
111
|
hash
|
108
112
|
end.inspect
|
109
113
|
end
|
@@ -233,7 +233,7 @@ module Traject::Macros
|
|
233
233
|
str = str.sub(/ *[ ,\/;:] *\Z/, '')
|
234
234
|
|
235
235
|
# trailing period if it is preceded by at least three letters (possibly preceded and followed by whitespace)
|
236
|
-
str = str.sub(/( *[[:word:]
|
236
|
+
str = str.sub(/( *[[:word:]]{3,})\. *\Z/, '\1')
|
237
237
|
|
238
238
|
# single square bracket characters if they are the start and/or end
|
239
239
|
# chars and there are no internal square brackets.
|
@@ -40,8 +40,8 @@ module Traject::Macros
|
|
40
40
|
/x
|
41
41
|
|
42
42
|
def self.oclcnum_extract(num)
|
43
|
-
if OCLCPAT.match(num)
|
44
|
-
return
|
43
|
+
if m = OCLCPAT.match(num)
|
44
|
+
return m[1]
|
45
45
|
else
|
46
46
|
return nil
|
47
47
|
end
|
@@ -369,8 +369,8 @@ module Traject::Macros
|
|
369
369
|
v260c = MarcExtractor.cached("260c", :separator => nil).extract(record).first
|
370
370
|
# just try to take the first four digits out of there, we're not going to try
|
371
371
|
# anything crazy.
|
372
|
-
if
|
373
|
-
found_date =
|
372
|
+
if m = /(\d{4})/.match(v260c)
|
373
|
+
found_date = m[1].to_i
|
374
374
|
end
|
375
375
|
end
|
376
376
|
|
@@ -408,7 +408,7 @@ module Traject::Macros
|
|
408
408
|
candidates = extractor.extract(record)
|
409
409
|
|
410
410
|
candidates.reject! do |candidate|
|
411
|
-
!(candidate
|
411
|
+
!(lcc_regex.match candidate)
|
412
412
|
end
|
413
413
|
|
414
414
|
accumulator.concat translation_map.translate_array!(candidates.collect {|a| a.lstrip.slice(0, 1)}).uniq
|
@@ -501,10 +501,11 @@ module Traject::Macros
|
|
501
501
|
end)
|
502
502
|
|
503
503
|
# weird ones
|
504
|
+
special_fields_regex = /\A\s*.+,\s+(ca.\s+)?\d\d\d\d?(-\d\d\d\d?)?( B\.C\.)?[.,; ]*\Z/
|
504
505
|
extractor_special_fields.each_matching_line(record) do |field, spec, extractor|
|
505
506
|
field.subfields.each do |sf|
|
506
507
|
next unless sf.code == 'y'
|
507
|
-
if sf.value
|
508
|
+
if special_fields_regex.match(sf.value)
|
508
509
|
# it's our pattern, add the $a in please
|
509
510
|
accumulator << "#{field['a']}#{separator}#{sf.value.sub(/\. *\Z/, '')}"
|
510
511
|
else
|
@@ -562,7 +563,7 @@ module Traject::Macros
|
|
562
563
|
|
563
564
|
marc_field.subfields.each_with_index do |sf, i|
|
564
565
|
# ignore non-alphabetic, like numeric control subfields
|
565
|
-
next unless
|
566
|
+
next unless /\A[a-z]\Z/.match(sf.code)
|
566
567
|
|
567
568
|
prefix = if subd_prefix_codes.include? sf.code
|
568
569
|
subd_separator
|
@@ -105,7 +105,8 @@ module Traject
|
|
105
105
|
def proceeding?
|
106
106
|
@proceeding_q ||= begin
|
107
107
|
! record.find do |field|
|
108
|
-
field.tag.slice(0) == '6' &&
|
108
|
+
field.tag.slice(0) == '6' &&
|
109
|
+
field.subfields.find {|sf| sf.code == "v" && /^\s*(C|c)ongresses\.?\s*$/.match(sf.value) }
|
109
110
|
end.nil?
|
110
111
|
end
|
111
112
|
end
|
data/lib/traject/mock_reader.rb
CHANGED
@@ -40,15 +40,16 @@ module Traject
|
|
40
40
|
|
41
41
|
this_file_iter = file_io.each_line
|
42
42
|
|
43
|
+
|
43
44
|
while true
|
44
45
|
line = this_file_iter.next
|
45
|
-
break if
|
46
|
+
break if /^\_\_END\_\_/.match line
|
46
47
|
end
|
47
48
|
|
48
49
|
begin
|
49
50
|
while true
|
50
51
|
json = this_file_iter.next
|
51
|
-
next unless
|
52
|
+
next unless /\S/.match json
|
52
53
|
records << MARC::Record.new_from_hash(JSON.parse(json))
|
53
54
|
end
|
54
55
|
rescue StopIteration
|
data/lib/traject/ndj_reader.rb
CHANGED
@@ -12,7 +12,7 @@ class Traject::NDJReader
|
|
12
12
|
def initialize(input_stream, settings)
|
13
13
|
@settings = settings
|
14
14
|
@input_stream = input_stream
|
15
|
-
if @settings['command_line.filename']
|
15
|
+
if /\.gz\Z/.match(@settings['command_line.filename'])
|
16
16
|
@input_stream = Zlib::GzipReader.new(@input_stream, :external_encoding => "UTF-8")
|
17
17
|
end
|
18
18
|
end
|
@@ -236,7 +236,7 @@ class Traject::SolrJsonWriter
|
|
236
236
|
|
237
237
|
# If we've got a solr.update_url, make sure it's ok
|
238
238
|
def check_solr_update_url(url)
|
239
|
-
unless
|
239
|
+
unless /^#{URI::regexp}$/.match(url)
|
240
240
|
raise ArgumentError.new("#{self.class.name} setting `solr.update_url` doesn't look like a URL: `#{url}`")
|
241
241
|
end
|
242
242
|
url
|
@@ -249,7 +249,7 @@ class Traject::SolrJsonWriter
|
|
249
249
|
end
|
250
250
|
|
251
251
|
# Not a URL? Bail
|
252
|
-
unless
|
252
|
+
unless /^#{URI::regexp}$/.match(url)
|
253
253
|
raise ArgumentError.new("#{self.class.name} setting `solr.url` doesn't look like a URL: `#{url}`")
|
254
254
|
end
|
255
255
|
|
data/lib/traject/util.rb
CHANGED
@@ -5,14 +5,14 @@ module Traject
|
|
5
5
|
def self.exception_to_log_message(e)
|
6
6
|
indent = " "
|
7
7
|
|
8
|
-
msg
|
8
|
+
msg = indent + "Exception: " + e.class.name + ": " + e.message + "\n"
|
9
9
|
msg += indent + e.backtrace.first + "\n"
|
10
10
|
|
11
|
-
if (e.respond_to?(:getRootCause) && e.getRootCause && e != e.getRootCause
|
11
|
+
if (e.respond_to?(:getRootCause) && e.getRootCause && e != e.getRootCause)
|
12
12
|
caused_by = e.getRootCause
|
13
|
-
msg
|
14
|
-
msg
|
15
|
-
msg
|
13
|
+
msg += indent + "Caused by\n"
|
14
|
+
msg += indent + caused_by.class.name + ": " + caused_by.message + "\n"
|
15
|
+
msg += indent + caused_by.backtrace.first + "\n"
|
16
16
|
end
|
17
17
|
|
18
18
|
return msg
|
@@ -37,8 +37,8 @@ module Traject
|
|
37
37
|
# For a SyntaxError, we really need to grep it from the
|
38
38
|
# exception message, it really appears to be nowhere else. Ugh.
|
39
39
|
if exception.kind_of? SyntaxError
|
40
|
-
if
|
41
|
-
return
|
40
|
+
if m = /:(\d+):/.match(exception.message)
|
41
|
+
return m[1].to_i
|
42
42
|
end
|
43
43
|
end
|
44
44
|
|
@@ -48,9 +48,9 @@ module Traject
|
|
48
48
|
# exception.backtrace_locations exists in MRI 2.1+, which makes
|
49
49
|
# our task a lot easier. But not yet in JRuby 1.7.x, so we got to
|
50
50
|
# handle the old way of having to parse the strings in backtrace too.
|
51
|
-
if (
|
52
|
-
|
53
|
-
|
51
|
+
if (exception.respond_to?(:backtrace_locations) &&
|
52
|
+
exception.backtrace_locations &&
|
53
|
+
exception.backtrace_locations.length > 0)
|
54
54
|
location = exception.backtrace_locations.find do |bt|
|
55
55
|
bt.path == file_path
|
56
56
|
end
|
@@ -58,8 +58,10 @@ module Traject
|
|
58
58
|
else # have to parse string backtrace
|
59
59
|
exception.backtrace.each do |line|
|
60
60
|
if line.start_with?(file_path)
|
61
|
-
|
62
|
-
|
61
|
+
if m = /\A.*\:(\d+)\:in/.match(line)
|
62
|
+
return m[1].to_i
|
63
|
+
break
|
64
|
+
end
|
63
65
|
end
|
64
66
|
end
|
65
67
|
# if we got here, we have nothing
|
@@ -75,14 +77,14 @@ module Traject
|
|
75
77
|
# returned array will actually be of Thread::Backtrace::Location elements.
|
76
78
|
def self.backtrace_from_config(file_path, exception)
|
77
79
|
filtered_trace = []
|
78
|
-
found
|
80
|
+
found = false
|
79
81
|
|
80
82
|
# MRI 2.1+ has exception.backtrace_locations which makes
|
81
83
|
# this a lot easier, but JRuby 1.7.x doesn't yet, so we
|
82
84
|
# need to do it both ways.
|
83
|
-
if (
|
84
|
-
|
85
|
-
|
85
|
+
if (exception.respond_to?(:backtrace_locations) &&
|
86
|
+
exception.backtrace_locations &&
|
87
|
+
exception.backtrace_locations.length > 0)
|
86
88
|
|
87
89
|
exception.backtrace_locations.each do |location|
|
88
90
|
filtered_trace << location
|
@@ -100,7 +102,6 @@ module Traject
|
|
100
102
|
end
|
101
103
|
|
102
104
|
|
103
|
-
|
104
105
|
# Ruby stdlib queue lacks a 'drain' function, we write one.
|
105
106
|
#
|
106
107
|
# Removes everything currently in the ruby stdlib queue, and returns
|
data/lib/traject/version.rb
CHANGED
data/test/indexer/{macros_marc21_semantics_test.rb → macros/macros_marc21_semantics_test.rb}
RENAMED
@@ -169,7 +169,7 @@ describe "Traject::Macros::Marc21Semantics" do
|
|
169
169
|
# there are way too many edge cases for us to test em all, but we'll test some of em.
|
170
170
|
|
171
171
|
it "works when there's no date information" do
|
172
|
-
|
172
|
+
assert_nil Marc21Semantics.publication_date(empty_record)
|
173
173
|
end
|
174
174
|
|
175
175
|
it "uses macro correctly with no date info" do
|
@@ -189,7 +189,7 @@ describe "Traject::Macros::Marc21Semantics" do
|
|
189
189
|
end
|
190
190
|
it "returns nil when the records really got nothing" do
|
191
191
|
@record = MARC::Reader.new(support_file_path "emptyish_record.marc").to_a.first
|
192
|
-
|
192
|
+
assert_nil Marc21Semantics.publication_date(@record)
|
193
193
|
end
|
194
194
|
it "estimates with a single 'u'" do
|
195
195
|
@record = MARC::Reader.new(support_file_path "date_with_u.marc").to_a.first
|
@@ -1,11 +1,20 @@
|
|
1
|
-
# Encoding: UTF-8
|
2
|
-
|
3
1
|
require 'test_helper'
|
2
|
+
|
3
|
+
require 'traject/indexer'
|
4
4
|
require 'traject/macros/marc21'
|
5
5
|
|
6
|
+
require 'json'
|
7
|
+
require 'marc'
|
8
|
+
|
6
9
|
include Traject::Macros::Marc21
|
7
10
|
|
11
|
+
|
8
12
|
describe "The extract_all_marc_values macro" do
|
13
|
+
before do
|
14
|
+
@indexer = Traject::Indexer.new
|
15
|
+
@record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
|
16
|
+
end
|
17
|
+
|
9
18
|
|
10
19
|
it "is fine with no arguments" do
|
11
20
|
assert(extract_all_marc_values)
|
@@ -20,4 +29,22 @@ describe "The extract_all_marc_values macro" do
|
|
20
29
|
extract_all_marc_values(from: 100, to: '999')
|
21
30
|
end
|
22
31
|
end
|
32
|
+
|
33
|
+
it "#extract_all_marc_values" do
|
34
|
+
@indexer.instance_eval do
|
35
|
+
to_field "text", extract_all_marc_values
|
36
|
+
end
|
37
|
+
output = @indexer.map_record(@record)
|
38
|
+
|
39
|
+
assert_length 13, output["text"]
|
40
|
+
end
|
41
|
+
|
42
|
+
|
23
43
|
end
|
44
|
+
|
45
|
+
|
46
|
+
|
47
|
+
|
48
|
+
|
49
|
+
|
50
|
+
|
@@ -0,0 +1,125 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
require 'traject/indexer'
|
4
|
+
require 'traject/macros/marc21'
|
5
|
+
|
6
|
+
require 'json'
|
7
|
+
require 'marc'
|
8
|
+
|
9
|
+
|
10
|
+
include Traject::Macros::Marc21
|
11
|
+
|
12
|
+
describe "extract_marc" do
|
13
|
+
before do
|
14
|
+
@indexer = Traject::Indexer.new
|
15
|
+
@record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
|
16
|
+
end
|
17
|
+
|
18
|
+
|
19
|
+
it "extracts marc" do
|
20
|
+
@indexer.instance_eval do
|
21
|
+
to_field "title", extract_marc("245ab")
|
22
|
+
end
|
23
|
+
|
24
|
+
output = @indexer.map_record(@record)
|
25
|
+
|
26
|
+
assert_equal ["Manufacturing consent : the political economy of the mass media /"], output["title"]
|
27
|
+
assert_equal({}, @indexer.map_record(empty_record))
|
28
|
+
|
29
|
+
end
|
30
|
+
|
31
|
+
it "respects :first=>true option" do
|
32
|
+
@indexer.instance_eval do
|
33
|
+
to_field "other_id", extract_marc("035a", :first => true)
|
34
|
+
end
|
35
|
+
|
36
|
+
output = @indexer.map_record(@record)
|
37
|
+
|
38
|
+
assert_length 1, output["other_id"]
|
39
|
+
|
40
|
+
end
|
41
|
+
|
42
|
+
it "trims punctuation with :trim_punctuation => true" do
|
43
|
+
@indexer.instance_eval do
|
44
|
+
to_field "title", extract_marc("245ab", :trim_punctuation => true)
|
45
|
+
end
|
46
|
+
|
47
|
+
output = @indexer.map_record(@record)
|
48
|
+
|
49
|
+
assert_equal ["Manufacturing consent : the political economy of the mass media"], output["title"]
|
50
|
+
assert_equal({}, @indexer.map_record(empty_record))
|
51
|
+
|
52
|
+
end
|
53
|
+
|
54
|
+
it "respects :default option" do
|
55
|
+
@indexer.instance_eval do
|
56
|
+
to_field "only_default", extract_marc("9999", :default => "DEFAULT VALUE")
|
57
|
+
end
|
58
|
+
output = @indexer.map_record(@record)
|
59
|
+
|
60
|
+
assert_equal ["DEFAULT VALUE"], output["only_default"]
|
61
|
+
end
|
62
|
+
|
63
|
+
it "de-duplicates by default, respects :allow_duplicates" do
|
64
|
+
# Add a second 008
|
65
|
+
f = @record.fields('008').first
|
66
|
+
@record.append(f)
|
67
|
+
|
68
|
+
@indexer.instance_eval do
|
69
|
+
to_field "lang1", extract_marc('008[35-37]')
|
70
|
+
to_field "lang2", extract_marc('008[35-37]', :allow_duplicates => true)
|
71
|
+
end
|
72
|
+
|
73
|
+
output = @indexer.map_record(@record)
|
74
|
+
assert_equal ["eng"], output['lang1']
|
75
|
+
assert_equal ["eng", "eng"], output['lang2']
|
76
|
+
assert_equal({}, @indexer.map_record(empty_record))
|
77
|
+
end
|
78
|
+
|
79
|
+
it "fails on an extra/misspelled argument to extract_marc" do
|
80
|
+
assert_raises(RuntimeError) do
|
81
|
+
@indexer.instance_eval do
|
82
|
+
to_field "foo", extract_marc("9999", :misspelled => "Who cares")
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
|
88
|
+
it "throws away nil values unless settings['allow_nil_values]'" do
|
89
|
+
@indexer.instance_eval do
|
90
|
+
to_field 'default_nil', extract_marc('9999', :default => nil)
|
91
|
+
end
|
92
|
+
output = @indexer.map_record(@record)
|
93
|
+
assert_nil output['default_nil']
|
94
|
+
end
|
95
|
+
|
96
|
+
|
97
|
+
it "allows nil values if settings['allow_nil_values]'" do
|
98
|
+
@indexer.settings do |s|
|
99
|
+
s['allow_nil_values'] = true
|
100
|
+
end
|
101
|
+
@indexer.instance_eval do
|
102
|
+
to_field 'default_nil', extract_marc('9999', :default => nil)
|
103
|
+
end
|
104
|
+
output = @indexer.map_record(@record)
|
105
|
+
assert_equal [nil], output['default_nil']
|
106
|
+
end
|
107
|
+
|
108
|
+
|
109
|
+
|
110
|
+
|
111
|
+
it "uses :translation_map" do
|
112
|
+
@indexer.instance_eval do
|
113
|
+
to_field "cataloging_agency", extract_marc("040a", :separator => nil, :translation_map => "marc_040a_translate_test")
|
114
|
+
end
|
115
|
+
output = @indexer.map_record(@record)
|
116
|
+
|
117
|
+
assert_equal ["Library of Congress"], output["cataloging_agency"]
|
118
|
+
end
|
119
|
+
|
120
|
+
it "supports #extract_marc_from module method" do
|
121
|
+
output_arr = ::Traject::Macros::Marc21.extract_marc_from(@record, "245ab", :trim_punctuation => true)
|
122
|
+
assert_equal ["Manufacturing consent : the political economy of the mass media"], output_arr
|
123
|
+
end
|
124
|
+
|
125
|
+
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
require 'traject/indexer'
|
4
|
+
require 'traject/macros/marc21'
|
5
|
+
require 'json'
|
6
|
+
require 'marc'
|
7
|
+
|
8
|
+
include Traject::Macros::Marc21
|
9
|
+
|
10
|
+
|
11
|
+
describe "serialized_marc" do
|
12
|
+
before do
|
13
|
+
@indexer = Traject::Indexer.new
|
14
|
+
@record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
|
15
|
+
end
|
16
|
+
|
17
|
+
it "serializes xml" do
|
18
|
+
@indexer.instance_eval do
|
19
|
+
to_field "marc_record", serialized_marc(:format => "xml")
|
20
|
+
end
|
21
|
+
output = @indexer.map_record(@record)
|
22
|
+
|
23
|
+
assert_length 1, output["marc_record"]
|
24
|
+
assert_kind_of String, output["marc_record"].first
|
25
|
+
roundtrip_record = MARC::XMLReader.new(StringIO.new(output["marc_record"].first)).first
|
26
|
+
assert_equal @record, roundtrip_record
|
27
|
+
end
|
28
|
+
|
29
|
+
it "serializes binary UUEncoded" do
|
30
|
+
@indexer.instance_eval do
|
31
|
+
to_field "marc_record", serialized_marc(:format => "binary")
|
32
|
+
end
|
33
|
+
output = @indexer.map_record(@record)
|
34
|
+
|
35
|
+
assert_length 1, output["marc_record"]
|
36
|
+
assert_kind_of String, output["marc_record"].first
|
37
|
+
|
38
|
+
decoded = Base64.decode64(output["marc_record"].first)
|
39
|
+
|
40
|
+
# just check the marc header for now
|
41
|
+
assert_start_with "02067cam a2200469", decoded
|
42
|
+
end
|
43
|
+
|
44
|
+
it "serializes binary raw" do
|
45
|
+
@indexer.instance_eval do
|
46
|
+
to_field "marc_record", serialized_marc(:format => "binary", :binary_escape => false)
|
47
|
+
end
|
48
|
+
output = @indexer.map_record(@record)
|
49
|
+
|
50
|
+
assert_length 1, output["marc_record"]
|
51
|
+
assert_kind_of String, output["marc_record"].first
|
52
|
+
|
53
|
+
# just check the marc header for now
|
54
|
+
assert_start_with "02067cam a2200469", output["marc_record"].first
|
55
|
+
end
|
56
|
+
|
57
|
+
it "serializes json" do
|
58
|
+
@indexer.instance_eval do
|
59
|
+
to_field "marc_record", serialized_marc(:format => "json")
|
60
|
+
end
|
61
|
+
output = @indexer.map_record(@record)
|
62
|
+
|
63
|
+
assert_length 1, output["marc_record"]
|
64
|
+
|
65
|
+
# okay, let's actually deserialize it, why not
|
66
|
+
|
67
|
+
hash = JSON.parse(output["marc_record"].first)
|
68
|
+
|
69
|
+
deserialized = MARC::Record.new_from_hash(hash)
|
70
|
+
|
71
|
+
assert_equal @record, deserialized
|
72
|
+
end
|
73
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'test_helper'
|
3
|
+
|
4
|
+
require 'traject/indexer'
|
5
|
+
require 'traject/macros/marc21'
|
6
|
+
|
7
|
+
|
8
|
+
include Traject::Macros::Marc21
|
9
|
+
|
10
|
+
describe "trim_punctuation" do
|
11
|
+
|
12
|
+
# TODO: test coverage for trim_punctuation
|
13
|
+
# trim_punctuation isn't super-complicated code, and yet we've found a few bugs
|
14
|
+
# in it already. Needs more test coverage
|
15
|
+
it "Works as expected" do
|
16
|
+
assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("one two three")
|
17
|
+
|
18
|
+
assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("one two three,")
|
19
|
+
assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("one two three/")
|
20
|
+
assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("one two three;")
|
21
|
+
assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("one two three:")
|
22
|
+
assert_equal "one two three .", Traject::Macros::Marc21.trim_punctuation("one two three .")
|
23
|
+
assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("one two three.")
|
24
|
+
assert_equal "one two three...", Traject::Macros::Marc21.trim_punctuation("one two three...")
|
25
|
+
assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation(" one two three.")
|
26
|
+
|
27
|
+
assert_equal "one two [three]", Traject::Macros::Marc21.trim_punctuation("one two [three]")
|
28
|
+
assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("one two three]")
|
29
|
+
assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("[one two three")
|
30
|
+
assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("[one two three]")
|
31
|
+
|
32
|
+
# This one was a bug before
|
33
|
+
assert_equal "Feminism and art", Traject::Macros::Marc21.trim_punctuation("Feminism and art.")
|
34
|
+
assert_equal "Le réve", Traject::Macros::Marc21.trim_punctuation("Le réve.")
|
35
|
+
|
36
|
+
# This one was a bug on the bug
|
37
|
+
assert_equal "Bill Dueber, Jr.", Traject::Macros::Marc21.trim_punctuation("Bill Dueber, Jr.")
|
38
|
+
end
|
39
|
+
end
|
@@ -192,7 +192,7 @@ describe "Traject::Indexer#map_record" do
|
|
192
192
|
end
|
193
193
|
|
194
194
|
@indexer.to_field('radical') do |rec, acc, context|
|
195
|
-
context.skip!("Chomsky!") if rec['245'].to_s
|
195
|
+
context.skip!("Chomsky!") if rec['245'].to_s =~ /Chomsky/
|
196
196
|
end
|
197
197
|
|
198
198
|
@indexer.to_field('afterSkip') do |rec, acc|
|
data/test/indexer/writer_test.rb
CHANGED
@@ -2,21 +2,28 @@ require 'test_helper'
|
|
2
2
|
require 'traject/yaml_writer'
|
3
3
|
|
4
4
|
describe "The writer on Traject::Indexer" do
|
5
|
-
let(:indexer) { Traject::Indexer.new("solr.url" => "http://
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
5
|
+
let(:indexer) { Traject::Indexer.new("solr.url" => "http://localhost.com") }
|
6
|
+
|
7
|
+
# TODO: fix default writer test
|
8
|
+
# Fails in the absence of a configured
|
9
|
+
# network interface.
|
10
|
+
describe "default writer from index" do
|
11
|
+
it "has a default" do
|
12
|
+
# assert_instance_of Traject::SolrJsonWriter, indexer.writer
|
13
|
+
# assert_equal Traject::SolrJsonWriter, indexer.writer_class
|
14
|
+
skip "Fails in the absence of a configured network interface."
|
15
|
+
end
|
10
16
|
end
|
11
17
|
|
12
|
-
|
18
|
+
|
19
|
+
describe "when the writer is set in config" do
|
13
20
|
let(:writer) { Traject::YamlWriter.new({}) }
|
14
21
|
|
15
22
|
let(:indexer) { Traject::Indexer.new(
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
23
|
+
"solr.url" => "http://example.com",
|
24
|
+
"writer_class" => 'Traject::SolrJsonWriter',
|
25
|
+
"writer" => writer
|
26
|
+
) }
|
20
27
|
|
21
28
|
it "uses writer from config" do
|
22
29
|
assert_equal writer, indexer.writer
|
@@ -120,7 +120,9 @@ to_field "discipline_facet", marc_lcc_to_broad_category(:default => nil) do |re
|
|
120
120
|
if call_type == "sudoc"
|
121
121
|
# we choose to call it:
|
122
122
|
accumulator << "Government Publication"
|
123
|
-
elsif call_type.nil? ||
|
123
|
+
elsif call_type.nil? ||
|
124
|
+
call_type == "lc" ||
|
125
|
+
Traject::Macros::Marc21Semantics::LCC_REGEX.match(field['a'])
|
124
126
|
# run it through the map
|
125
127
|
s = field['a']
|
126
128
|
s = s.slice(0, 1) if s
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: traject
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.3.
|
4
|
+
version: 2.3.3
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Jonathan Rochkind
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2017-01-25 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
@@ -268,15 +268,17 @@ files:
|
|
268
268
|
- test/indexer/context_test.rb
|
269
269
|
- test/indexer/each_record_test.rb
|
270
270
|
- test/indexer/load_config_file_test.rb
|
271
|
-
- test/indexer/macros_marc21_semantics_test.rb
|
272
|
-
- test/indexer/
|
273
|
-
- test/indexer/
|
271
|
+
- test/indexer/macros/macros_marc21_semantics_test.rb
|
272
|
+
- test/indexer/macros/marc21/extract_all_marc_values_test.rb
|
273
|
+
- test/indexer/macros/marc21/extract_marc_test.rb
|
274
|
+
- test/indexer/macros/marc21/serialize_marc_test.rb
|
275
|
+
- test/indexer/macros/marc21/trim_punctuation_test.rb
|
276
|
+
- test/indexer/macros/to_field_test.rb
|
274
277
|
- test/indexer/map_record_test.rb
|
275
278
|
- test/indexer/read_write_test.rb
|
276
279
|
- test/indexer/settings_test.rb
|
277
280
|
- test/indexer/to_field_test.rb
|
278
281
|
- test/indexer/writer_test.rb
|
279
|
-
- test/marc21_macros_test.rb
|
280
282
|
- test/marc_extractor_test.rb
|
281
283
|
- test/marc_format_classifier_test.rb
|
282
284
|
- test/marc_reader_test.rb
|
@@ -343,7 +345,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
343
345
|
version: '0'
|
344
346
|
requirements: []
|
345
347
|
rubyforge_project:
|
346
|
-
rubygems_version: 2.6.
|
348
|
+
rubygems_version: 2.6.8
|
347
349
|
signing_key:
|
348
350
|
specification_version: 4
|
349
351
|
summary: Index MARC to Solr; or generally process source records to hash-like structures
|
@@ -353,15 +355,17 @@ test_files:
|
|
353
355
|
- test/indexer/context_test.rb
|
354
356
|
- test/indexer/each_record_test.rb
|
355
357
|
- test/indexer/load_config_file_test.rb
|
356
|
-
- test/indexer/macros_marc21_semantics_test.rb
|
357
|
-
- test/indexer/
|
358
|
-
- test/indexer/
|
358
|
+
- test/indexer/macros/macros_marc21_semantics_test.rb
|
359
|
+
- test/indexer/macros/marc21/extract_all_marc_values_test.rb
|
360
|
+
- test/indexer/macros/marc21/extract_marc_test.rb
|
361
|
+
- test/indexer/macros/marc21/serialize_marc_test.rb
|
362
|
+
- test/indexer/macros/marc21/trim_punctuation_test.rb
|
363
|
+
- test/indexer/macros/to_field_test.rb
|
359
364
|
- test/indexer/map_record_test.rb
|
360
365
|
- test/indexer/read_write_test.rb
|
361
366
|
- test/indexer/settings_test.rb
|
362
367
|
- test/indexer/to_field_test.rb
|
363
368
|
- test/indexer/writer_test.rb
|
364
|
-
- test/marc21_macros_test.rb
|
365
369
|
- test/marc_extractor_test.rb
|
366
370
|
- test/marc_format_classifier_test.rb
|
367
371
|
- test/marc_reader_test.rb
|
@@ -1,219 +0,0 @@
|
|
1
|
-
require 'test_helper'
|
2
|
-
|
3
|
-
require 'traject/indexer'
|
4
|
-
require 'traject/macros/marc21'
|
5
|
-
|
6
|
-
require 'json'
|
7
|
-
require 'marc'
|
8
|
-
|
9
|
-
# See also marc_extractor_test.rb for more detailed tests on marc extraction,
|
10
|
-
# this is just a basic test to make sure our macro works passing through to there
|
11
|
-
# and other options.
|
12
|
-
describe "Traject::Macros::Marc21" do
|
13
|
-
Marc21 = Traject::Macros::Marc21 # shortcut
|
14
|
-
|
15
|
-
before do
|
16
|
-
@indexer = Traject::Indexer.new
|
17
|
-
@record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
|
18
|
-
end
|
19
|
-
|
20
|
-
describe "extract_marc" do
|
21
|
-
it "extracts marc" do
|
22
|
-
@indexer.instance_eval do
|
23
|
-
to_field "title", extract_marc("245ab")
|
24
|
-
end
|
25
|
-
|
26
|
-
output = @indexer.map_record(@record)
|
27
|
-
|
28
|
-
assert_equal ["Manufacturing consent : the political economy of the mass media /"], output["title"]
|
29
|
-
assert_equal({}, @indexer.map_record(empty_record))
|
30
|
-
|
31
|
-
end
|
32
|
-
|
33
|
-
it "respects :first=>true option" do
|
34
|
-
@indexer.instance_eval do
|
35
|
-
to_field "other_id", extract_marc("035a", :first => true)
|
36
|
-
end
|
37
|
-
|
38
|
-
output = @indexer.map_record(@record)
|
39
|
-
|
40
|
-
assert_length 1, output["other_id"]
|
41
|
-
|
42
|
-
end
|
43
|
-
|
44
|
-
it "trims punctuation with :trim_punctuation => true" do
|
45
|
-
@indexer.instance_eval do
|
46
|
-
to_field "title", extract_marc("245ab", :trim_punctuation => true)
|
47
|
-
end
|
48
|
-
|
49
|
-
output = @indexer.map_record(@record)
|
50
|
-
|
51
|
-
assert_equal ["Manufacturing consent : the political economy of the mass media"], output["title"]
|
52
|
-
assert_equal({}, @indexer.map_record(empty_record))
|
53
|
-
|
54
|
-
end
|
55
|
-
|
56
|
-
it "respects :default option" do
|
57
|
-
@indexer.instance_eval do
|
58
|
-
to_field "only_default", extract_marc("9999", :default => "DEFAULT VALUE")
|
59
|
-
end
|
60
|
-
output = @indexer.map_record(@record)
|
61
|
-
|
62
|
-
assert_equal ["DEFAULT VALUE"], output["only_default"]
|
63
|
-
end
|
64
|
-
|
65
|
-
it "de-duplicates by default, respects :allow_duplicates" do
|
66
|
-
# Add a second 008
|
67
|
-
f = @record.fields('008').first
|
68
|
-
@record.append(f)
|
69
|
-
|
70
|
-
@indexer.instance_eval do
|
71
|
-
to_field "lang1", extract_marc('008[35-37]')
|
72
|
-
to_field "lang2", extract_marc('008[35-37]', :allow_duplicates => true)
|
73
|
-
end
|
74
|
-
|
75
|
-
output = @indexer.map_record(@record)
|
76
|
-
assert_equal ["eng"], output['lang1']
|
77
|
-
assert_equal ["eng", "eng"], output['lang2']
|
78
|
-
assert_equal({}, @indexer.map_record(empty_record))
|
79
|
-
end
|
80
|
-
|
81
|
-
it "fails on an extra/misspelled argument to extract_marc" do
|
82
|
-
assert_raises(RuntimeError) do
|
83
|
-
@indexer.instance_eval do
|
84
|
-
to_field "foo", extract_marc("9999", :misspelled => "Who cares")
|
85
|
-
end
|
86
|
-
end
|
87
|
-
end
|
88
|
-
|
89
|
-
|
90
|
-
it "throws away nil values unless settings['allow_nil_values]'" do
|
91
|
-
@indexer.instance_eval do
|
92
|
-
to_field 'default_nil', extract_marc('9999', :default => nil)
|
93
|
-
end
|
94
|
-
output = @indexer.map_record(@record)
|
95
|
-
assert_nil output['default_nil']
|
96
|
-
end
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
it "allows nil values if settings['allow_nil_values]'" do
|
101
|
-
@indexer.settings do |s|
|
102
|
-
s['allow_nil_values'] = true
|
103
|
-
end
|
104
|
-
@indexer.instance_eval do
|
105
|
-
to_field 'default_nil', extract_marc('9999', :default => nil)
|
106
|
-
end
|
107
|
-
output = @indexer.map_record(@record)
|
108
|
-
assert_equal [nil], output['default_nil']
|
109
|
-
end
|
110
|
-
|
111
|
-
|
112
|
-
it "Marc21::trim_punctuation class method" do
|
113
|
-
assert_equal "one two three", Marc21.trim_punctuation("one two three")
|
114
|
-
|
115
|
-
assert_equal "one two three", Marc21.trim_punctuation("one two three,")
|
116
|
-
assert_equal "one two three", Marc21.trim_punctuation("one two three/")
|
117
|
-
assert_equal "one two three", Marc21.trim_punctuation("one two three;")
|
118
|
-
assert_equal "one two three", Marc21.trim_punctuation("one two three:")
|
119
|
-
assert_equal "one two three .", Marc21.trim_punctuation("one two three .")
|
120
|
-
assert_equal "one two three", Marc21.trim_punctuation("one two three.")
|
121
|
-
assert_equal "one two three...", Marc21.trim_punctuation("one two three...")
|
122
|
-
assert_equal "one two three", Marc21.trim_punctuation(" one two three.")
|
123
|
-
|
124
|
-
assert_equal "one two [three]", Marc21.trim_punctuation("one two [three]")
|
125
|
-
assert_equal "one two three", Marc21.trim_punctuation("one two three]")
|
126
|
-
assert_equal "one two three", Marc21.trim_punctuation("[one two three")
|
127
|
-
assert_equal "one two three", Marc21.trim_punctuation("[one two three]")
|
128
|
-
|
129
|
-
# This one was a bug before
|
130
|
-
assert_equal "Feminism and art", Marc21.trim_punctuation("Feminism and art.")
|
131
|
-
|
132
|
-
assert_equal "Le réve", Marc21.trim_punctuation("Le réve.") # this assertion currently fails
|
133
|
-
end
|
134
|
-
|
135
|
-
it "uses :translation_map" do
|
136
|
-
@indexer.instance_eval do
|
137
|
-
to_field "cataloging_agency", extract_marc("040a", :separator => nil, :translation_map => "marc_040a_translate_test")
|
138
|
-
end
|
139
|
-
output = @indexer.map_record(@record)
|
140
|
-
|
141
|
-
assert_equal ["Library of Congress"], output["cataloging_agency"]
|
142
|
-
end
|
143
|
-
end
|
144
|
-
|
145
|
-
it "supports #extract_marc_from module method" do
|
146
|
-
output_arr = ::Traject::Macros::Marc21.extract_marc_from(@record, "245ab", :trim_punctuation => true)
|
147
|
-
assert_equal ["Manufacturing consent : the political economy of the mass media"], output_arr
|
148
|
-
end
|
149
|
-
|
150
|
-
describe "serialized_marc" do
|
151
|
-
it "serializes xml" do
|
152
|
-
@indexer.instance_eval do
|
153
|
-
to_field "marc_record", serialized_marc(:format => "xml")
|
154
|
-
end
|
155
|
-
output = @indexer.map_record(@record)
|
156
|
-
|
157
|
-
assert_length 1, output["marc_record"]
|
158
|
-
assert_kind_of String, output["marc_record"].first
|
159
|
-
roundtrip_record = MARC::XMLReader.new(StringIO.new(output["marc_record"].first)).first
|
160
|
-
assert_equal @record, roundtrip_record
|
161
|
-
end
|
162
|
-
|
163
|
-
it "serializes binary UUEncoded" do
|
164
|
-
@indexer.instance_eval do
|
165
|
-
to_field "marc_record", serialized_marc(:format => "binary")
|
166
|
-
end
|
167
|
-
output = @indexer.map_record(@record)
|
168
|
-
|
169
|
-
assert_length 1, output["marc_record"]
|
170
|
-
assert_kind_of String, output["marc_record"].first
|
171
|
-
|
172
|
-
decoded = Base64.decode64(output["marc_record"].first)
|
173
|
-
|
174
|
-
# just check the marc header for now
|
175
|
-
assert_start_with "02067cam a2200469", decoded
|
176
|
-
end
|
177
|
-
|
178
|
-
it "serializes binary raw" do
|
179
|
-
@indexer.instance_eval do
|
180
|
-
to_field "marc_record", serialized_marc(:format => "binary", :binary_escape => false)
|
181
|
-
end
|
182
|
-
output = @indexer.map_record(@record)
|
183
|
-
|
184
|
-
assert_length 1, output["marc_record"]
|
185
|
-
assert_kind_of String, output["marc_record"].first
|
186
|
-
|
187
|
-
# just check the marc header for now
|
188
|
-
assert_start_with "02067cam a2200469", output["marc_record"].first
|
189
|
-
end
|
190
|
-
|
191
|
-
it "serializes json" do
|
192
|
-
@indexer.instance_eval do
|
193
|
-
to_field "marc_record", serialized_marc(:format => "json")
|
194
|
-
end
|
195
|
-
output = @indexer.map_record(@record)
|
196
|
-
|
197
|
-
assert_length 1, output["marc_record"]
|
198
|
-
|
199
|
-
# okay, let's actually deserialize it, why not
|
200
|
-
|
201
|
-
hash = JSON.parse(output["marc_record"].first)
|
202
|
-
|
203
|
-
deserialized = MARC::Record.new_from_hash(hash)
|
204
|
-
|
205
|
-
assert_equal @record, deserialized
|
206
|
-
end
|
207
|
-
end
|
208
|
-
|
209
|
-
it "#extract_all_marc_values" do
|
210
|
-
@indexer.instance_eval do
|
211
|
-
to_field "text", extract_all_marc_values
|
212
|
-
end
|
213
|
-
output = @indexer.map_record(@record)
|
214
|
-
|
215
|
-
assert_length 13, output["text"]
|
216
|
-
end
|
217
|
-
|
218
|
-
|
219
|
-
end
|