traject 3.2.0 → 3.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9a69bdb470c759d08d117f910ec1c6b16cbe93dbb3e955b653a07e3c03efec68
4
- data.tar.gz: 68016ddf17fe29348248e5f26ff5a813c1485c37a0ac014faa69803be182f7b4
3
+ metadata.gz: 1700077d5c2d3c667fc9520b659c3ca986b8ab34aee233f62bd7f73fdef91977
4
+ data.tar.gz: 736b217f209ed08faba9c1d20c006b29586aa3ebdf088a89e37f5f3b7400de06
5
5
  SHA512:
6
- metadata.gz: 8921a5d2349025291f28c7d39ebec44c0c0ebdc11a489e1d4c22913c04c840cf40163c576b14432794f1d663b9b81eb1d777b87d38af2b429f4b6ac98982ad48
7
- data.tar.gz: 8bdb95f98baa11ee60c74a7a4bbab06c44fb5da466138d098b05e5ab3fddc28a1fe8926044be5350e5e0b8a91ccd2d6e2e2ecf78424cfc0430464dd0ccf79c66
6
+ metadata.gz: 21877d6cd5b03f7ffbbac316a6d58a3bc65b534cb7457e57d39ba470ad49d99c8677e5e6ede25c650bba5ac3f0b22f9b348ebabb36ac4047433eb8a76379ef1d
7
+ data.tar.gz: 4ec1938d2d7b60a61ebde4e9c4e763e511c2896788b56b38ad6f22615dffb57449e29c9ef40e261952e125b90f5fe491fa447b077c7dcb1c55f57d6ef603fd5b
data/CHANGES.md CHANGED
@@ -8,6 +8,16 @@
8
8
 
9
9
  *
10
10
 
11
+ ## 3.3.0
12
+
13
+ * `Traject::Macros::Marc21Semantics.publication_date` now gets date from 264 before 260. https://github.com/traject/traject/pull/233
14
+
15
+ * Allow hashie 4.x in gemspec https://github.com/traject/traject/pull/234
16
+
17
+ * Allow `http` gem 4.x versions. https://github.com/traject/traject/pull/236
18
+
19
+ * Can now call class-level Indexer.configure multiple times https://github.com/sciencehistory/scihist_digicoll/pull/525
20
+
11
21
  ## 3.2.0
12
22
 
13
23
  * NokogiriReader has a "nokogiri.strict_mode" setting. Set to true or string 'true' to ask Nokogori to parse in strict mode, so it will immediately raise on ill-formed XML, instead of nokogiri's default to do what it can with it. https://github.com/traject/traject/pull/226
@@ -190,7 +190,7 @@ class Traject::Indexer
190
190
  instance_eval(&block)
191
191
  end
192
192
 
193
- ## Class level configure block accepted too, and applied at instantiation
193
+ ## Class level configure block(s) accepted too, and applied at instantiation
194
194
  # before instance-level configuration.
195
195
  #
196
196
  # EXPERIMENTAL, implementation may change in ways that effect some uses.
@@ -199,8 +199,14 @@ class Traject::Indexer
199
199
  # Note that settings set by 'provide' in subclass can not really be overridden
200
200
  # by 'provide' in a next level subclass. Use self.default_settings instead, with
201
201
  # call to super.
202
+ #
203
+ # You can call this .configure multiple times, blocks are added to a list, and
204
+ # will be used to initialize an instance in order.
205
+ #
206
+ # The main downside of this workaround implementation is performance, even though
207
+ # defined at load-time on class level, blocks are all executed on every instantiation.
202
208
  def self.configure(&block)
203
- @class_configure_block = block
209
+ (@class_configure_blocks ||= []) << block
204
210
  end
205
211
 
206
212
  def self.apply_class_configure_block(instance)
@@ -208,8 +214,10 @@ class Traject::Indexer
208
214
  if self.superclass.respond_to?(:apply_class_configure_block)
209
215
  self.superclass.apply_class_configure_block(instance)
210
216
  end
211
- if @class_configure_block
212
- instance.configure(&@class_configure_block)
217
+ if @class_configure_blocks && !@class_configure_blocks.empty?
218
+ @class_configure_blocks.each do |block|
219
+ instance.configure(&block)
220
+ end
213
221
  end
214
222
  end
215
223
 
@@ -26,19 +26,19 @@ module Traject::Macros
26
26
  accumulator.concat list.uniq if list
27
27
  end
28
28
  end
29
-
29
+
30
30
  # If a num begins with a known OCLC prefix, return it without the prefix.
31
31
  # otherwise nil.
32
32
  #
33
- # Allow (OCoLC) and/or ocn/ocm/on
34
-
33
+ # Allow (OCoLC) and/or ocn/ocm/on
34
+
35
35
  OCLCPAT = /
36
36
  \A\s*
37
37
  (?:(?:\(OCoLC\)) |
38
38
  (?:\(OCoLC\))?(?:(?:ocm)|(?:ocn)|(?:on))
39
39
  )(\d+)
40
40
  /x
41
-
41
+
42
42
  def self.oclcnum_extract(num)
43
43
  if m = OCLCPAT.match(num)
44
44
  return m[1]
@@ -364,13 +364,16 @@ module Traject::Macros
364
364
  end
365
365
  end
366
366
  end
367
- # Okay, nothing from 008, try 260
367
+ # Okay, nothing from 008, first try 264, then try 260
368
368
  if found_date.nil?
369
+ v264c = MarcExtractor.cached("264c", :separator => nil).extract(record).first
369
370
  v260c = MarcExtractor.cached("260c", :separator => nil).extract(record).first
370
371
  # just try to take the first four digits out of there, we're not going to try
371
372
  # anything crazy.
372
- if m = /(\d{4})/.match(v260c)
373
+ if m = /(\d{4})/.match(v264c)
373
374
  found_date = m[1].to_i
375
+ elsif m = /(\d{4})/.match(v260c)
376
+ found_date = m[1].to_i
374
377
  end
375
378
  end
376
379
 
@@ -519,11 +522,11 @@ module Traject::Macros
519
522
 
520
523
  # Extracts LCSH-carrying fields, and formatting them
521
524
  # as a pre-coordinated LCSH string, for instance suitable for including
522
- # in a facet.
525
+ # in a facet.
523
526
  #
524
527
  # You can supply your own list of fields as a spec, but for significant
525
528
  # customization you probably just want to write your own method in
526
- # terms of the Marc21Semantics.assemble_lcsh method.
529
+ # terms of the Marc21Semantics.assemble_lcsh method.
527
530
  def marc_lcsh_formatted(options = {})
528
531
  spec = options[:spec] || "600:610:611:630:648:650:651:654:662"
529
532
  subd_separator = options[:subdivison_separator] || " — "
@@ -540,17 +543,17 @@ module Traject::Macros
540
543
  end
541
544
 
542
545
  # Takes a MARC::Field and formats it into a pre-coordinated LCSH string
543
- # with subdivision seperators in the right place.
546
+ # with subdivision seperators in the right place.
544
547
  #
545
548
  # For 600 fields especially, need to not just join with subdivision seperator
546
549
  # to take acount of $a$d$t -- for other fields, might be able to just
547
- # join subfields, not sure.
550
+ # join subfields, not sure.
548
551
  #
549
552
  # WILL strip trailing period from generated string, contrary to some LCSH practice.
550
553
  # Our data is inconsistent on whether it has period or not, this was
551
- # the easiest way to standardize.
554
+ # the easiest way to standardize.
552
555
  #
553
- # Default subdivision seperator is em-dash with spaces, set to '--' if you want.
556
+ # Default subdivision seperator is em-dash with spaces, set to '--' if you want.
554
557
  #
555
558
  # Cite: "Dash (-) that precedes a subdivision in an extended 600 subject heading
556
559
  # is not carried in the MARC record. It may be system generated as a display constant
@@ -115,9 +115,15 @@ module Traject
115
115
  # @returns [HTTP::Client] from http.rb gem
116
116
  def http_client
117
117
  @http_client ||= begin
118
- # timeout setting on http.rb seems to be a mess.
119
- # https://github.com/httprb/http/issues/488
120
- client = HTTP.timeout(:global, write: timeout / 3, connect: timeout / 3, read: timeout / 3)
118
+ client = nil
119
+
120
+ if HTTP::VERSION.split(".").first.to_i > 3
121
+ client = HTTP.timeout(timeout)
122
+ else
123
+ # timeout setting on http.rb 3.x are a bit of a mess.
124
+ # https://github.com/httprb/http/issues/488
125
+ client = HTTP.timeout(:global, write: timeout / 3, connect: timeout / 3, read: timeout / 3)
126
+ end
121
127
 
122
128
  if settings["oai_pmh.try_gzip"]
123
129
  client = client.use(:auto_inflate).headers("accept-encoding" => "gzip;q=1.0, identity;q=0.5")
@@ -1,3 +1,3 @@
1
1
  module Traject
2
- VERSION = "3.2.0"
2
+ VERSION = "3.3.0"
3
3
  end
@@ -24,40 +24,40 @@ describe "Delimited/CSV Writers" do
24
24
 
25
25
  it "creates a dw with defaults" do
26
26
  dw = Traject::DelimitedWriter.new(@settings)
27
- dw.delimiter.must_equal "\t"
28
- dw.internal_delimiter.must_equal '|'
29
- dw.edelim.must_equal ' '
30
- dw.eidelim.must_equal '\\|'
27
+ assert_equal dw.delimiter, "\t"
28
+ assert_equal dw.internal_delimiter, '|'
29
+ assert_equal dw.edelim, ' '
30
+ assert_equal dw.eidelim, '\\|'
31
31
  end
32
32
 
33
33
  it "respects different delimiter" do
34
34
  @settings['delimited_writer.delimiter'] = '^'
35
35
  dw = Traject::DelimitedWriter.new(@settings)
36
- dw.delimiter.must_equal '^'
37
- dw.edelim.must_equal '\\^'
38
- dw.internal_delimiter.must_equal '|'
36
+ assert_equal dw.delimiter, '^'
37
+ assert_equal dw.edelim, '\\^'
38
+ assert_equal dw.internal_delimiter, '|'
39
39
  end
40
40
 
41
41
  it "outputs a header if asked to" do
42
42
  Traject::DelimitedWriter.new(@settings)
43
- @out.string.chomp.must_equal %w[four one two].join("\t")
43
+ assert_equal @out.string.chomp, %w[four one two].join("\t")
44
44
  end
45
45
 
46
46
  it "doesn't output a header if asked not to" do
47
47
  @settings['delimited_writer.header'] = 'false'
48
48
  Traject::DelimitedWriter.new(@settings)
49
- @out.string.must_be_empty
49
+ assert_empty @out.string
50
50
  end
51
51
 
52
52
  it "deals with multiple values" do
53
53
  dw = Traject::DelimitedWriter.new(@settings)
54
54
  dw.put @context
55
- @out.string.split("\n").last.must_equal ['four', 'one', 'two1|two2'].join(dw.delimiter)
55
+ assert_equal @out.string.split("\n").last, ['four', 'one', 'two1|two2'].join(dw.delimiter)
56
56
  end
57
57
 
58
58
  it "bails if delimited_writer.fields isn't set" do
59
59
  @settings.delete 'delimited_writer.fields'
60
- proc { Traject::DelimitedWriter.new(@settings) }.must_raise(ArgumentError)
60
+ assert_raises(ArgumentError) { Traject::DelimitedWriter.new(@settings) }
61
61
  end
62
62
 
63
63
  end
@@ -65,18 +65,18 @@ describe "Delimited/CSV Writers" do
65
65
  describe "Traject::CSVWriter" do
66
66
  it "unsets the delimiter" do
67
67
  cw = Traject::CSVWriter.new(@settings)
68
- cw.delimiter.must_be_nil
68
+ assert_nil cw.delimiter
69
69
  end
70
70
 
71
71
  it "writes the header" do
72
72
  Traject::CSVWriter.new(@settings)
73
- @out.string.chomp.must_equal 'four,one,two'
73
+ assert_equal @out.string.chomp, 'four,one,two'
74
74
  end
75
75
 
76
76
  it "uses the internal delimiter" do
77
77
  cw = Traject::CSVWriter.new(@settings)
78
78
  cw.put @context
79
- @out.string.split("\n").last.must_equal ['four', 'one', 'two1|two2'].join(',')
79
+ assert_equal @out.string.split("\n").last, ['four', 'one', 'two1|two2'].join(',')
80
80
  end
81
81
 
82
82
  it "produces complex output" do
@@ -97,8 +97,6 @@ describe "Delimited/CSV Writers" do
97
97
  traject_csvwriter_output = @out.string.split("\n").last.chomp
98
98
 
99
99
  assert_equal(csv_output, traject_csvwriter_output)
100
-
101
100
  end
102
-
103
101
  end
104
102
  end
@@ -25,6 +25,7 @@ describe "Class-level configuration of Indexer sub-class" do
25
25
  end
26
26
  end
27
27
 
28
+
28
29
  before do
29
30
  @indexer = TestIndexerSubclass.new
30
31
  end
@@ -47,6 +48,28 @@ describe "Class-level configuration of Indexer sub-class" do
47
48
  assert_equal ['from-instance-config'], result["instance_field"]
48
49
  end
49
50
 
51
+ describe "multiple class-level configure" do
52
+ class MultipleConfigureIndexer < Traject::Indexer
53
+ configure do
54
+ to_field "field", literal("value")
55
+ end
56
+ configure do
57
+ to_field "field", literal("value from second configure")
58
+ to_field "second_call", literal("value from second configure")
59
+ end
60
+ end
61
+
62
+ before do
63
+ @indexer = MultipleConfigureIndexer.new
64
+ end
65
+
66
+ it "lets you call class-level configure multiple times and aggregates" do
67
+ result = @indexer.map_record(Object.new)
68
+ assert_equal ['value', 'value from second configure'], result['field']
69
+ assert_equal ['value from second configure'], result['second_call']
70
+ end
71
+ end
72
+
50
73
  describe "with multi-level subclass" do
51
74
  class TestIndexerSubclassSubclass < TestIndexerSubclass
52
75
  configure do
@@ -197,6 +197,10 @@ describe "Traject::Macros::Marc21Semantics" do
197
197
  # we take the first date. And need to deal with the u.
198
198
  assert_equal 1845, Marc21Semantics.publication_date(@record)
199
199
  end
200
+ it "resorts to 264c" do
201
+ @record = MARC::Reader.new(support_file_path "date_resort_to_264.marc").to_a.first
202
+ assert_equal 2015, Marc21Semantics.publication_date(@record)
203
+ end
200
204
  it "resorts to 260c" do
201
205
  @record = MARC::Reader.new(support_file_path "date_resort_to_260.marc").to_a.first
202
206
  assert_equal 1980, Marc21Semantics.publication_date(@record)
@@ -0,0 +1 @@
1
+ 01180aam a2200337 a 4500001001000000008004100010015001900051016001800070020002500088020002200113040005900135043001200194050002400206082001400230100003500244245006400279260003800343264003800381264001100419300003600430336002100466336002800487337002500515338002300540504006700563651004000630651005000670651004300720651005600763035002300819a11417842130723t20uu20uuenkb b 001 0 eng d aGBB3854302bnb7 a0164999372Uk a9781849043427 (pbk.) a1849043426 (pbk.) aUKMGBcUKMGBdOCLCOdYDXCPdOCLCOdZWZdOCLCOdCaONFJC aa-ii--- 4aDS485.K25bS64 201504a954.62231 aSnedden, Christopher,eauthor.10aUnderstanding Kashmir and Kashmiris /cChristopher Snedden. 1aLondon :bHurst & Company,c2014. 1aLondon :bHurst & Company,c2015. 4c©2015 axix, 372 pages :bmaps ;c22 cm atext2rdacontent astill image2rdacontent aunmediated2rdamedia avolume2rdacarrier aIncludes bibliographical references (pages 331-355) and index. 0aJammu and Kashmir (India)xHistory. 0aJammu and Kashmir (India)xForeign relations. 0aJammu and Kashmir (India)vBoundaries. 0aJammu and Kashmir (India)xPolitics and government. a(OCoLC-M)858826393
@@ -24,12 +24,12 @@ Gem::Specification.new do |spec|
24
24
  spec.add_dependency "concurrent-ruby", ">= 0.8.0"
25
25
  spec.add_dependency "marc", "~> 1.0"
26
26
 
27
- spec.add_dependency "hashie", "~> 3.1" # used for Indexer#settings
27
+ spec.add_dependency "hashie", ">= 3.1", "< 5" # used for Indexer#settings
28
28
  spec.add_dependency "slop", ">= 3.4.5", "< 4.0" # command line parsing
29
29
  spec.add_dependency "yell" # logging
30
30
  spec.add_dependency "dot-properties", ">= 0.1.1" # reading java style .properties
31
31
  spec.add_dependency "httpclient", "~> 2.5"
32
- spec.add_dependency "http", "~> 3.0" # used in oai_pmh_reader, may use more extensively in future instead of httpclient
32
+ spec.add_dependency "http", ">= 3.0", "< 5" # used in oai_pmh_reader, may use more extensively in future instead of httpclient
33
33
  spec.add_dependency 'marc-fastxmlwriter', '~>1.0' # fast marc->xml
34
34
  spec.add_dependency "nokogiri", "~> 1.9" # NokogiriIndexer
35
35
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: traject
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.2.0
4
+ version: 3.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Rochkind
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2019-09-10 00:00:00.000000000 Z
12
+ date: 2019-12-02 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: concurrent-ruby
@@ -43,16 +43,22 @@ dependencies:
43
43
  name: hashie
44
44
  requirement: !ruby/object:Gem::Requirement
45
45
  requirements:
46
- - - "~>"
46
+ - - ">="
47
47
  - !ruby/object:Gem::Version
48
48
  version: '3.1'
49
+ - - "<"
50
+ - !ruby/object:Gem::Version
51
+ version: '5'
49
52
  type: :runtime
50
53
  prerelease: false
51
54
  version_requirements: !ruby/object:Gem::Requirement
52
55
  requirements:
53
- - - "~>"
56
+ - - ">="
54
57
  - !ruby/object:Gem::Version
55
58
  version: '3.1'
59
+ - - "<"
60
+ - !ruby/object:Gem::Version
61
+ version: '5'
56
62
  - !ruby/object:Gem::Dependency
57
63
  name: slop
58
64
  requirement: !ruby/object:Gem::Requirement
@@ -119,16 +125,22 @@ dependencies:
119
125
  name: http
120
126
  requirement: !ruby/object:Gem::Requirement
121
127
  requirements:
122
- - - "~>"
128
+ - - ">="
123
129
  - !ruby/object:Gem::Version
124
130
  version: '3.0'
131
+ - - "<"
132
+ - !ruby/object:Gem::Version
133
+ version: '5'
125
134
  type: :runtime
126
135
  prerelease: false
127
136
  version_requirements: !ruby/object:Gem::Requirement
128
137
  requirements:
129
- - - "~>"
138
+ - - ">="
130
139
  - !ruby/object:Gem::Version
131
140
  version: '3.0'
141
+ - - "<"
142
+ - !ruby/object:Gem::Version
143
+ version: '5'
132
144
  - !ruby/object:Gem::Dependency
133
145
  name: marc-fastxmlwriter
134
146
  requirement: !ruby/object:Gem::Requirement
@@ -330,6 +342,7 @@ files:
330
342
  - test/test_support/bad_subfield_code.marc
331
343
  - test/test_support/bad_utf_byte.utf8.marc
332
344
  - test/test_support/date_resort_to_260.marc
345
+ - test/test_support/date_resort_to_264.marc
333
346
  - test/test_support/date_type_r_missing_date2.marc
334
347
  - test/test_support/date_with_u.marc
335
348
  - test/test_support/demo_config.rb
@@ -434,6 +447,7 @@ test_files:
434
447
  - test/test_support/bad_subfield_code.marc
435
448
  - test/test_support/bad_utf_byte.utf8.marc
436
449
  - test/test_support/date_resort_to_260.marc
450
+ - test/test_support/date_resort_to_264.marc
437
451
  - test/test_support/date_type_r_missing_date2.marc
438
452
  - test/test_support/date_with_u.marc
439
453
  - test/test_support/demo_config.rb