traject 3.2.0 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGES.md +10 -0
- data/lib/traject/indexer.rb +12 -4
- data/lib/traject/macros/marc21_semantics.rb +15 -12
- data/lib/traject/oai_pmh_nokogiri_reader.rb +9 -3
- data/lib/traject/version.rb +1 -1
- data/test/delimited_writer_test.rb +14 -16
- data/test/indexer/class_level_configuration_test.rb +23 -0
- data/test/indexer/macros/macros_marc21_semantics_test.rb +4 -0
- data/test/test_support/date_resort_to_264.marc +1 -0
- data/traject.gemspec +2 -2
- metadata +20 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1700077d5c2d3c667fc9520b659c3ca986b8ab34aee233f62bd7f73fdef91977
|
4
|
+
data.tar.gz: 736b217f209ed08faba9c1d20c006b29586aa3ebdf088a89e37f5f3b7400de06
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 21877d6cd5b03f7ffbbac316a6d58a3bc65b534cb7457e57d39ba470ad49d99c8677e5e6ede25c650bba5ac3f0b22f9b348ebabb36ac4047433eb8a76379ef1d
|
7
|
+
data.tar.gz: 4ec1938d2d7b60a61ebde4e9c4e763e511c2896788b56b38ad6f22615dffb57449e29c9ef40e261952e125b90f5fe491fa447b077c7dcb1c55f57d6ef603fd5b
|
data/CHANGES.md
CHANGED
@@ -8,6 +8,16 @@
|
|
8
8
|
|
9
9
|
*
|
10
10
|
|
11
|
+
## 3.3.0
|
12
|
+
|
13
|
+
* `Traject::Macros::Marc21Semantics.publication_date` now gets date from 264 before 260. https://github.com/traject/traject/pull/233
|
14
|
+
|
15
|
+
* Allow hashie 4.x in gemspec https://github.com/traject/traject/pull/234
|
16
|
+
|
17
|
+
* Allow `http` gem 4.x versions. https://github.com/traject/traject/pull/236
|
18
|
+
|
19
|
+
* Can now call class-level Indexer.configure multiple times https://github.com/sciencehistory/scihist_digicoll/pull/525
|
20
|
+
|
11
21
|
## 3.2.0
|
12
22
|
|
13
23
|
* NokogiriReader has a "nokogiri.strict_mode" setting. Set to true or string 'true' to ask Nokogiri to parse in strict mode, so it will immediately raise on ill-formed XML, instead of Nokogiri's default of doing what it can with it. https://github.com/traject/traject/pull/226
|
data/lib/traject/indexer.rb
CHANGED
@@ -190,7 +190,7 @@ class Traject::Indexer
|
|
190
190
|
instance_eval(&block)
|
191
191
|
end
|
192
192
|
|
193
|
-
## Class level configure block accepted too, and applied at instantiation
|
193
|
+
## Class level configure block(s) accepted too, and applied at instantiation
|
194
194
|
# before instance-level configuration.
|
195
195
|
#
|
196
196
|
# EXPERIMENTAL, implementation may change in ways that affect some uses.
|
@@ -199,8 +199,14 @@ class Traject::Indexer
|
|
199
199
|
# Note that settings set by 'provide' in subclass can not really be overridden
|
200
200
|
# by 'provide' in a next level subclass. Use self.default_settings instead, with
|
201
201
|
# call to super.
|
202
|
+
#
|
203
|
+
# You can call this .configure multiple times, blocks are added to a list, and
|
204
|
+
# will be used to initialize an instance in order.
|
205
|
+
#
|
206
|
+
# The main downside of this workaround implementation is performance, even though
|
207
|
+
# defined at load-time on class level, blocks are all executed on every instantiation.
|
202
208
|
def self.configure(&block)
|
203
|
-
@
|
209
|
+
(@class_configure_blocks ||= []) << block
|
204
210
|
end
|
205
211
|
|
206
212
|
def self.apply_class_configure_block(instance)
|
@@ -208,8 +214,10 @@ class Traject::Indexer
|
|
208
214
|
if self.superclass.respond_to?(:apply_class_configure_block)
|
209
215
|
self.superclass.apply_class_configure_block(instance)
|
210
216
|
end
|
211
|
-
if @
|
212
|
-
|
217
|
+
if @class_configure_blocks && !@class_configure_blocks.empty?
|
218
|
+
@class_configure_blocks.each do |block|
|
219
|
+
instance.configure(&block)
|
220
|
+
end
|
213
221
|
end
|
214
222
|
end
|
215
223
|
|
@@ -26,19 +26,19 @@ module Traject::Macros
|
|
26
26
|
accumulator.concat list.uniq if list
|
27
27
|
end
|
28
28
|
end
|
29
|
-
|
29
|
+
|
30
30
|
# If a num begins with a known OCLC prefix, return it without the prefix.
|
31
31
|
# otherwise nil.
|
32
32
|
#
|
33
|
-
# Allow (OCoLC) and/or ocn/ocm/on
|
34
|
-
|
33
|
+
# Allow (OCoLC) and/or ocn/ocm/on
|
34
|
+
|
35
35
|
OCLCPAT = /
|
36
36
|
\A\s*
|
37
37
|
(?:(?:\(OCoLC\)) |
|
38
38
|
(?:\(OCoLC\))?(?:(?:ocm)|(?:ocn)|(?:on))
|
39
39
|
)(\d+)
|
40
40
|
/x
|
41
|
-
|
41
|
+
|
42
42
|
def self.oclcnum_extract(num)
|
43
43
|
if m = OCLCPAT.match(num)
|
44
44
|
return m[1]
|
@@ -364,13 +364,16 @@ module Traject::Macros
|
|
364
364
|
end
|
365
365
|
end
|
366
366
|
end
|
367
|
-
# Okay, nothing from 008, try 260
|
367
|
+
# Okay, nothing from 008, first try 264, then try 260
|
368
368
|
if found_date.nil?
|
369
|
+
v264c = MarcExtractor.cached("264c", :separator => nil).extract(record).first
|
369
370
|
v260c = MarcExtractor.cached("260c", :separator => nil).extract(record).first
|
370
371
|
# just try to take the first four digits out of there, we're not going to try
|
371
372
|
# anything crazy.
|
372
|
-
if m = /(\d{4})/.match(
|
373
|
+
if m = /(\d{4})/.match(v264c)
|
373
374
|
found_date = m[1].to_i
|
375
|
+
elsif m = /(\d{4})/.match(v260c)
|
376
|
+
found_date = m[1].to_i
|
374
377
|
end
|
375
378
|
end
|
376
379
|
|
@@ -519,11 +522,11 @@ module Traject::Macros
|
|
519
522
|
|
520
523
|
# Extracts LCSH-carrying fields, and formatting them
|
521
524
|
# as a pre-coordinated LCSH string, for instance suitable for including
|
522
|
-
# in a facet.
|
525
|
+
# in a facet.
|
523
526
|
#
|
524
527
|
# You can supply your own list of fields as a spec, but for significant
|
525
528
|
# customization you probably just want to write your own method in
|
526
|
-
# terms of the Marc21Semantics.assemble_lcsh method.
|
529
|
+
# terms of the Marc21Semantics.assemble_lcsh method.
|
527
530
|
def marc_lcsh_formatted(options = {})
|
528
531
|
spec = options[:spec] || "600:610:611:630:648:650:651:654:662"
|
529
532
|
subd_separator = options[:subdivison_separator] || " — "
|
@@ -540,17 +543,17 @@ module Traject::Macros
|
|
540
543
|
end
|
541
544
|
|
542
545
|
# Takes a MARC::Field and formats it into a pre-coordinated LCSH string
|
543
|
-
# with subdivision seperators in the right place.
|
546
|
+
# with subdivision seperators in the right place.
|
544
547
|
#
|
545
548
|
# For 600 fields especially, need to not just join with subdivision seperator
|
546
549
|
# to take account of $a$d$t -- for other fields, might be able to just
|
547
|
-
# join subfields, not sure.
|
550
|
+
# join subfields, not sure.
|
548
551
|
#
|
549
552
|
# WILL strip trailing period from generated string, contrary to some LCSH practice.
|
550
553
|
# Our data is inconsistent on whether it has period or not, this was
|
551
|
-
# the easiest way to standardize.
|
554
|
+
# the easiest way to standardize.
|
552
555
|
#
|
553
|
-
# Default subdivision seperator is em-dash with spaces, set to '--' if you want.
|
556
|
+
# Default subdivision seperator is em-dash with spaces, set to '--' if you want.
|
554
557
|
#
|
555
558
|
# Cite: "Dash (-) that precedes a subdivision in an extended 600 subject heading
|
556
559
|
# is not carried in the MARC record. It may be system generated as a display constant
|
@@ -115,9 +115,15 @@ module Traject
|
|
115
115
|
# @return [HTTP::Client] from http.rb gem
|
116
116
|
def http_client
|
117
117
|
@http_client ||= begin
|
118
|
-
|
119
|
-
|
120
|
-
|
118
|
+
client = nil
|
119
|
+
|
120
|
+
if HTTP::VERSION.split(".").first.to_i > 3
|
121
|
+
client = HTTP.timeout(timeout)
|
122
|
+
else
|
123
|
+
# timeout settings on http.rb 3.x are a bit of a mess.
|
124
|
+
# https://github.com/httprb/http/issues/488
|
125
|
+
client = HTTP.timeout(:global, write: timeout / 3, connect: timeout / 3, read: timeout / 3)
|
126
|
+
end
|
121
127
|
|
122
128
|
if settings["oai_pmh.try_gzip"]
|
123
129
|
client = client.use(:auto_inflate).headers("accept-encoding" => "gzip;q=1.0, identity;q=0.5")
|
data/lib/traject/version.rb
CHANGED
@@ -24,40 +24,40 @@ describe "Delimited/CSV Writers" do
|
|
24
24
|
|
25
25
|
it "creates a dw with defaults" do
|
26
26
|
dw = Traject::DelimitedWriter.new(@settings)
|
27
|
-
dw.delimiter
|
28
|
-
dw.internal_delimiter
|
29
|
-
dw.edelim
|
30
|
-
dw.eidelim
|
27
|
+
assert_equal dw.delimiter, "\t"
|
28
|
+
assert_equal dw.internal_delimiter, '|'
|
29
|
+
assert_equal dw.edelim, ' '
|
30
|
+
assert_equal dw.eidelim, '\\|'
|
31
31
|
end
|
32
32
|
|
33
33
|
it "respects different delimiter" do
|
34
34
|
@settings['delimited_writer.delimiter'] = '^'
|
35
35
|
dw = Traject::DelimitedWriter.new(@settings)
|
36
|
-
dw.delimiter
|
37
|
-
dw.edelim
|
38
|
-
dw.internal_delimiter
|
36
|
+
assert_equal dw.delimiter, '^'
|
37
|
+
assert_equal dw.edelim, '\\^'
|
38
|
+
assert_equal dw.internal_delimiter, '|'
|
39
39
|
end
|
40
40
|
|
41
41
|
it "outputs a header if asked to" do
|
42
42
|
Traject::DelimitedWriter.new(@settings)
|
43
|
-
@out.string.chomp
|
43
|
+
assert_equal @out.string.chomp, %w[four one two].join("\t")
|
44
44
|
end
|
45
45
|
|
46
46
|
it "doesn't output a header if asked not to" do
|
47
47
|
@settings['delimited_writer.header'] = 'false'
|
48
48
|
Traject::DelimitedWriter.new(@settings)
|
49
|
-
@out.string
|
49
|
+
assert_empty @out.string
|
50
50
|
end
|
51
51
|
|
52
52
|
it "deals with multiple values" do
|
53
53
|
dw = Traject::DelimitedWriter.new(@settings)
|
54
54
|
dw.put @context
|
55
|
-
@out.string.split("\n").last
|
55
|
+
assert_equal @out.string.split("\n").last, ['four', 'one', 'two1|two2'].join(dw.delimiter)
|
56
56
|
end
|
57
57
|
|
58
58
|
it "bails if delimited_writer.fields isn't set" do
|
59
59
|
@settings.delete 'delimited_writer.fields'
|
60
|
-
|
60
|
+
assert_raises(ArgumentError) { Traject::DelimitedWriter.new(@settings) }
|
61
61
|
end
|
62
62
|
|
63
63
|
end
|
@@ -65,18 +65,18 @@ describe "Delimited/CSV Writers" do
|
|
65
65
|
describe "Traject::CSVWriter" do
|
66
66
|
it "unsets the delimiter" do
|
67
67
|
cw = Traject::CSVWriter.new(@settings)
|
68
|
-
cw.delimiter
|
68
|
+
assert_nil cw.delimiter
|
69
69
|
end
|
70
70
|
|
71
71
|
it "writes the header" do
|
72
72
|
Traject::CSVWriter.new(@settings)
|
73
|
-
@out.string.chomp
|
73
|
+
assert_equal @out.string.chomp, 'four,one,two'
|
74
74
|
end
|
75
75
|
|
76
76
|
it "uses the internal delimiter" do
|
77
77
|
cw = Traject::CSVWriter.new(@settings)
|
78
78
|
cw.put @context
|
79
|
-
@out.string.split("\n").last
|
79
|
+
assert_equal @out.string.split("\n").last, ['four', 'one', 'two1|two2'].join(',')
|
80
80
|
end
|
81
81
|
|
82
82
|
it "produces complex output" do
|
@@ -97,8 +97,6 @@ describe "Delimited/CSV Writers" do
|
|
97
97
|
traject_csvwriter_output = @out.string.split("\n").last.chomp
|
98
98
|
|
99
99
|
assert_equal(csv_output, traject_csvwriter_output)
|
100
|
-
|
101
100
|
end
|
102
|
-
|
103
101
|
end
|
104
102
|
end
|
@@ -25,6 +25,7 @@ describe "Class-level configuration of Indexer sub-class" do
|
|
25
25
|
end
|
26
26
|
end
|
27
27
|
|
28
|
+
|
28
29
|
before do
|
29
30
|
@indexer = TestIndexerSubclass.new
|
30
31
|
end
|
@@ -47,6 +48,28 @@ describe "Class-level configuration of Indexer sub-class" do
|
|
47
48
|
assert_equal ['from-instance-config'], result["instance_field"]
|
48
49
|
end
|
49
50
|
|
51
|
+
describe "multiple class-level configure" do
|
52
|
+
class MultipleConfigureIndexer < Traject::Indexer
|
53
|
+
configure do
|
54
|
+
to_field "field", literal("value")
|
55
|
+
end
|
56
|
+
configure do
|
57
|
+
to_field "field", literal("value from second configure")
|
58
|
+
to_field "second_call", literal("value from second configure")
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
before do
|
63
|
+
@indexer = MultipleConfigureIndexer.new
|
64
|
+
end
|
65
|
+
|
66
|
+
it "lets you call class-level configure multiple times and aggregates" do
|
67
|
+
result = @indexer.map_record(Object.new)
|
68
|
+
assert_equal ['value', 'value from second configure'], result['field']
|
69
|
+
assert_equal ['value from second configure'], result['second_call']
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
50
73
|
describe "with multi-level subclass" do
|
51
74
|
class TestIndexerSubclassSubclass < TestIndexerSubclass
|
52
75
|
configure do
|
@@ -197,6 +197,10 @@ describe "Traject::Macros::Marc21Semantics" do
|
|
197
197
|
# we take the first date. And need to deal with the u.
|
198
198
|
assert_equal 1845, Marc21Semantics.publication_date(@record)
|
199
199
|
end
|
200
|
+
it "resorts to 264c" do
|
201
|
+
@record = MARC::Reader.new(support_file_path "date_resort_to_264.marc").to_a.first
|
202
|
+
assert_equal 2015, Marc21Semantics.publication_date(@record)
|
203
|
+
end
|
200
204
|
it "resorts to 260c" do
|
201
205
|
@record = MARC::Reader.new(support_file_path "date_resort_to_260.marc").to_a.first
|
202
206
|
assert_equal 1980, Marc21Semantics.publication_date(@record)
|
@@ -0,0 +1 @@
|
|
1
|
+
01180aam a2200337 a 4500001001000000008004100010015001900051016001800070020002500088020002200113040005900135043001200194050002400206082001400230100003500244245006400279260003800343264003800381264001100419300003600430336002100466336002800487337002500515338002300540504006700563651004000630651005000670651004300720651005600763035002300819a11417842130723t20uu20uuenkb b 001 0 eng d aGBB3854302bnb7 a0164999372Uk a9781849043427 (pbk.) a1849043426 (pbk.) aUKMGBcUKMGBdOCLCOdYDXCPdOCLCOdZWZdOCLCOdCaONFJC aa-ii--- 4aDS485.K25bS64 201504a954.62231 aSnedden, Christopher,eauthor.10aUnderstanding Kashmir and Kashmiris /cChristopher Snedden. 1aLondon :bHurst & Company,c2014. 1aLondon :bHurst & Company,c2015. 4c©2015 axix, 372 pages :bmaps ;c22 cm atext2rdacontent astill image2rdacontent aunmediated2rdamedia avolume2rdacarrier aIncludes bibliographical references (pages 331-355) and index. 0aJammu and Kashmir (India)xHistory. 0aJammu and Kashmir (India)xForeign relations. 0aJammu and Kashmir (India)vBoundaries. 0aJammu and Kashmir (India)xPolitics and government. a(OCoLC-M)858826393
|
data/traject.gemspec
CHANGED
@@ -24,12 +24,12 @@ Gem::Specification.new do |spec|
|
|
24
24
|
spec.add_dependency "concurrent-ruby", ">= 0.8.0"
|
25
25
|
spec.add_dependency "marc", "~> 1.0"
|
26
26
|
|
27
|
-
spec.add_dependency "hashie", "
|
27
|
+
spec.add_dependency "hashie", ">= 3.1", "< 5" # used for Indexer#settings
|
28
28
|
spec.add_dependency "slop", ">= 3.4.5", "< 4.0" # command line parsing
|
29
29
|
spec.add_dependency "yell" # logging
|
30
30
|
spec.add_dependency "dot-properties", ">= 0.1.1" # reading java style .properties
|
31
31
|
spec.add_dependency "httpclient", "~> 2.5"
|
32
|
-
spec.add_dependency "http", "
|
32
|
+
spec.add_dependency "http", ">= 3.0", "< 5" # used in oai_pmh_reader, may use more extensively in future instead of httpclient
|
33
33
|
spec.add_dependency 'marc-fastxmlwriter', '~>1.0' # fast marc->xml
|
34
34
|
spec.add_dependency "nokogiri", "~> 1.9" # NokogiriIndexer
|
35
35
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: traject
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.
|
4
|
+
version: 3.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Rochkind
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2019-
|
12
|
+
date: 2019-12-02 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: concurrent-ruby
|
@@ -43,16 +43,22 @@ dependencies:
|
|
43
43
|
name: hashie
|
44
44
|
requirement: !ruby/object:Gem::Requirement
|
45
45
|
requirements:
|
46
|
-
- - "
|
46
|
+
- - ">="
|
47
47
|
- !ruby/object:Gem::Version
|
48
48
|
version: '3.1'
|
49
|
+
- - "<"
|
50
|
+
- !ruby/object:Gem::Version
|
51
|
+
version: '5'
|
49
52
|
type: :runtime
|
50
53
|
prerelease: false
|
51
54
|
version_requirements: !ruby/object:Gem::Requirement
|
52
55
|
requirements:
|
53
|
-
- - "
|
56
|
+
- - ">="
|
54
57
|
- !ruby/object:Gem::Version
|
55
58
|
version: '3.1'
|
59
|
+
- - "<"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '5'
|
56
62
|
- !ruby/object:Gem::Dependency
|
57
63
|
name: slop
|
58
64
|
requirement: !ruby/object:Gem::Requirement
|
@@ -119,16 +125,22 @@ dependencies:
|
|
119
125
|
name: http
|
120
126
|
requirement: !ruby/object:Gem::Requirement
|
121
127
|
requirements:
|
122
|
-
- - "
|
128
|
+
- - ">="
|
123
129
|
- !ruby/object:Gem::Version
|
124
130
|
version: '3.0'
|
131
|
+
- - "<"
|
132
|
+
- !ruby/object:Gem::Version
|
133
|
+
version: '5'
|
125
134
|
type: :runtime
|
126
135
|
prerelease: false
|
127
136
|
version_requirements: !ruby/object:Gem::Requirement
|
128
137
|
requirements:
|
129
|
-
- - "
|
138
|
+
- - ">="
|
130
139
|
- !ruby/object:Gem::Version
|
131
140
|
version: '3.0'
|
141
|
+
- - "<"
|
142
|
+
- !ruby/object:Gem::Version
|
143
|
+
version: '5'
|
132
144
|
- !ruby/object:Gem::Dependency
|
133
145
|
name: marc-fastxmlwriter
|
134
146
|
requirement: !ruby/object:Gem::Requirement
|
@@ -330,6 +342,7 @@ files:
|
|
330
342
|
- test/test_support/bad_subfield_code.marc
|
331
343
|
- test/test_support/bad_utf_byte.utf8.marc
|
332
344
|
- test/test_support/date_resort_to_260.marc
|
345
|
+
- test/test_support/date_resort_to_264.marc
|
333
346
|
- test/test_support/date_type_r_missing_date2.marc
|
334
347
|
- test/test_support/date_with_u.marc
|
335
348
|
- test/test_support/demo_config.rb
|
@@ -434,6 +447,7 @@ test_files:
|
|
434
447
|
- test/test_support/bad_subfield_code.marc
|
435
448
|
- test/test_support/bad_utf_byte.utf8.marc
|
436
449
|
- test/test_support/date_resort_to_260.marc
|
450
|
+
- test/test_support/date_resort_to_264.marc
|
437
451
|
- test/test_support/date_type_r_missing_date2.marc
|
438
452
|
- test/test_support/date_with_u.marc
|
439
453
|
- test/test_support/demo_config.rb
|