mods 2.2.0 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c05154eec0a38fc2a0ad117d1c47fb336c116edf
4
- data.tar.gz: 9cbbfe083b89d92d0c18355be44136b82693e523
3
+ metadata.gz: b02fccf6e03afe427a8c23d821ebb2692b635f6d
4
+ data.tar.gz: d7f7fd87cd6ba14ab473dea40383f550f4cfa1e2
5
5
  SHA512:
6
- metadata.gz: 7d2ebefc59ba8cf3f6cdd816d429c45b238181158840aef30b34c806830d3c33de5dab7fcc76be3527a6f40ffe174e2087fc3e43ad17fe236a4d38e06f990c49
7
- data.tar.gz: 6ccff614c507ec6f3f541793d0407d90ef90ea8af5d38cb038cfdf560514ec4320e88ee74de60d5647db34f3e5de4b71aec071c9fcd8db7d8610a0ce22543a88
6
+ metadata.gz: e56288c902a448a532169703b9b8b49f95cc3e75a8b0351e8b893acf184caaf3090127b5f33c3c88ecaa0258afdc1d8b3312f963ea50aaaa7943bd7a97724c0a
7
+ data.tar.gz: 1aed6aa3932542f098d7a200f7c94f4210b3151f308da23eb668f01ce1f9c3a03028cb8af2206d6cef1b632149370bbdd815dfa37c7a02e8aa42ccc042c996d2
@@ -23,14 +23,21 @@ module Mods
23
23
  # when 'temper'
24
24
  # Mods::Date::TemperFormat.new(xml)
25
25
  else
26
- date_class = [
26
+ date_class = Mods::Date if xml.text =~ /\p{Hebrew}/
27
+ date_class ||= [
27
28
  MMDDYYYYFormat,
28
29
  MMDDYYFormat,
30
+ YearRangeFormat,
31
+ DecadeAsYearDashFormat,
32
+ EmbeddedBCYearFormat,
29
33
  EmbeddedYearFormat,
34
+ EmbeddedThreeDigitYearFormat,
35
+ EmbeddedYearWithBracketsFormat,
36
+ MysteryCenturyFormat,
37
+ CenturyFormat,
30
38
  RomanNumeralCenturyFormat,
31
39
  RomanNumeralYearFormat,
32
- MysteryCenturyFormat,
33
- CenturyFormat
40
+ OneOrTwoDigitYearFormat
34
41
  ].select { |klass| klass.supports? xml.text }.first
35
42
 
36
43
  (date_class || Mods::Date).new(xml)
@@ -42,7 +49,7 @@ module Mods
42
49
  # Strict ISO8601-encoded date parser
43
50
  class Iso8601Format < Date
44
51
  def self.parse_date(text)
45
- @date = ::Date.parse(cleanup(text))
52
+ @date = ::Date.parse(normalize_to_edtf(text))
46
53
  end
47
54
  end
48
55
 
@@ -54,18 +61,18 @@ module Mods
54
61
  class EdtfFormat < Date
55
62
  attr_reader :date
56
63
 
57
- def self.cleanup(text)
64
+ def self.normalize_to_edtf(text)
58
65
  text
59
66
  end
60
67
  end
61
68
 
62
69
  # MARC-formatted date parser, similar to EDTF, but with special support for
63
70
  # MARC-specific encodings
64
- class MarcFormat < EdtfFormat
65
- def self.cleanup(text)
66
- return nil if text == "9999" || text == "uuuu"
71
+ class MarcFormat < Date
72
+ def self.normalize_to_edtf(text)
73
+ return nil if text == "9999" || text == "uuuu" || text == '||||'
67
74
 
68
- text.gsub(/^[\[]+/, '').gsub(/[\.\]]+$/, '')
75
+ super
69
76
  end
70
77
 
71
78
  private
@@ -93,13 +100,13 @@ module Mods
93
100
  end
94
101
  end
95
102
 
96
- # Full text extractor for MM/DD/YYYY-formatted dates
103
+ # Full text extractor for MM/DD/YYYY and MM/DD/YYY-formatted dates
97
104
  class MMDDYYYYFormat < ExtractorDateFormat
98
- REGEX = /(?<month>\d{1,2})\/(?<day>\d{1,2})\/(?<year>\d{4})/
105
+ REGEX = /(?<month>\d{1,2})\/(?<day>\d{1,2})\/(?<year>\d{3,4})/
99
106
 
100
- def self.cleanup(text)
107
+ def self.normalize_to_edtf(text)
101
108
  matches = text.match(self::REGEX)
102
- "#{matches[:year].rjust(2, "0")}-#{matches[:month].rjust(2, "0")}-#{matches[:day].rjust(2, "0")}"
109
+ "#{matches[:year].rjust(4, "0")}-#{matches[:month].rjust(2, "0")}-#{matches[:day].rjust(2, "0")}"
103
110
  end
104
111
  end
105
112
 
@@ -107,7 +114,7 @@ module Mods
107
114
  class MMDDYYFormat < ExtractorDateFormat
108
115
  REGEX = /(?<month>\d{1,2})\/(?<day>\d{1,2})\/(?<year>\d{2})/
109
116
 
110
- def self.cleanup(text)
117
+ def self.normalize_to_edtf(text)
111
118
  matches = text.match(self::REGEX)
112
119
  year = munge_to_yyyy(matches[:year])
113
120
  "#{year}-#{matches[:month].rjust(2, "0")}-#{matches[:day].rjust(2, "0")}"
@@ -124,15 +131,15 @@ module Mods
124
131
 
125
132
  # Full-text extractor for dates encoded as Roman numerals
126
133
  class RomanNumeralYearFormat < ExtractorDateFormat
127
- REGEX = /^(?<year>[MCDLXVI]+)/
134
+ REGEX = /(?<![A-Za-z\.])(?<year>[MCDLXVI\.]+)(?![A-Za-z])/
128
135
 
129
- def self.cleanup(text)
136
+ def self.normalize_to_edtf(text)
130
137
  matches = text.match(REGEX)
131
138
  roman_to_int(matches[:year].upcase).to_s
132
139
  end
133
140
 
134
141
  def self.roman_to_int(value)
135
- value = value.dup
142
+ value = value.tr('.', '')
136
143
  map = { "M"=>1000, "CM"=>900, "D"=>500, "CD"=>400, "C"=>100, "XC"=>90, "L"=>50, "XL"=>40, "X"=>10, "IX"=>9, "V"=>5, "IV"=>4, "I"=>1 }
137
144
  result = 0
138
145
  map.each do |k,v|
@@ -145,11 +152,12 @@ module Mods
145
152
  end
146
153
  end
147
154
 
148
- # Full-text extractor for centuries encoded as Roman numerals
155
+ # Full-text extractor for centuries encoded as Roman numerals; sometimes centuries
156
+ # are given as e.g. xvith, hence the funny negative look-ahead assertion
149
157
  class RomanNumeralCenturyFormat < RomanNumeralYearFormat
150
- REGEX = /(cent. )?(?<century>[xvi]+)/
158
+ REGEX = /(?<![a-z])(?<century>[xvi]+)(?![a-su-z])/
151
159
 
152
- def self.cleanup(text)
160
+ def self.normalize_to_edtf(text)
153
161
  matches = text.match(REGEX)
154
162
  munge_to_yyyy(matches[:century])
155
163
  end
@@ -165,7 +173,7 @@ module Mods
165
173
  # of unknown origin.
166
174
  class MysteryCenturyFormat < ExtractorDateFormat
167
175
  REGEX = /(?<century>\d{2})--/
168
- def self.cleanup(text)
176
+ def self.normalize_to_edtf(text)
169
177
  matches = text.match(REGEX)
170
178
  "#{matches[:century]}XX"
171
179
  end
@@ -175,22 +183,83 @@ module Mods
175
183
  class CenturyFormat < ExtractorDateFormat
176
184
  REGEX = /(?<century>\d{2})th C(entury)?/i
177
185
 
178
- def self.cleanup(text)
186
+ def self.normalize_to_edtf(text)
179
187
  matches = text.match(REGEX)
180
188
  "#{matches[:century].to_i - 1}XX"
181
189
  end
182
190
  end
183
191
 
192
+ # Full-text extractor for data formatted as YYYY-YYYY or YYY-YYY
193
+ class YearRangeFormat < ExtractorDateFormat
194
+ REGEX = /(?<start>\d{3,4})-(?<end>\d{3,4})/
195
+
196
+ def self.normalize_to_edtf(text)
197
+ matches = text.match(REGEX)
198
+ "#{matches[:start].rjust(4, "0")}/#{matches[:end].rjust(4, "0")}"
199
+ end
200
+ end
201
+
202
+ # Full-text extractor for data formatted as YYY-
203
+ class DecadeAsYearDashFormat < ExtractorDateFormat
204
+ REGEX = /(?<!\d)(?<year>\d{3})[-_x?](?!\d)/
205
+
206
+ def self.normalize_to_edtf(text)
207
+ matches = text.match(REGEX)
208
+ "#{matches[:year]}X"
209
+ end
210
+ end
211
+
212
+ # Full-text extractor that tries hard to pick any year present in the data
213
+ class EmbeddedBCYearFormat < ExtractorDateFormat
214
+ REGEX = /(?<year>\d{3,4})\s?B\.?C\.?/i
215
+
216
+ def self.normalize_to_edtf(text)
217
+ matches = text.match(REGEX)
218
+ "-#{(matches[:year].to_i - 1).to_s.rjust(4, "0")}"
219
+ end
220
+ end
221
+
184
222
  # Full-text extractor that tries hard to pick any year present in the data
185
223
  class EmbeddedYearFormat < ExtractorDateFormat
186
- REGEX = /(?<prefix>-)?(?<year>\d{3,4})/
224
+ REGEX = /(?<prefix>-)?(?<!\d)(?<year>\d{4})(?!\d)/
187
225
 
188
- def self.cleanup(text)
226
+ def self.normalize_to_edtf(text)
189
227
  matches = text.match(REGEX)
190
228
  "#{matches[:prefix]}#{matches[:year].rjust(4, "0")}"
191
229
  end
192
230
  end
193
231
 
232
+ # Full-text extractor that tries hard to pick any year present in the data
233
+ class EmbeddedThreeDigitYearFormat < ExtractorDateFormat
234
+ REGEX = /(?<prefix>-)?(?<!\d)(?<year>\d{3})(?!\d)(?!\d)/
235
+
236
+ def self.normalize_to_edtf(text)
237
+ matches = text.match(REGEX)
238
+ "#{matches[:prefix]}#{matches[:year].rjust(4, "0")}"
239
+ end
240
+ end
241
+
242
+ # Full-text extractor that tries hard to pick any year present in the data
243
+ class OneOrTwoDigitYearFormat < ExtractorDateFormat
244
+ REGEX = /^(?<prefix>-)?(?<year>\d{1,2})$/
245
+
246
+ def self.normalize_to_edtf(text)
247
+ matches = text.match(REGEX)
248
+ "#{matches[:prefix]}#{matches[:year].rjust(4, "0")}"
249
+ end
250
+ end
251
+
252
+ # Full-text extractor that tries hard to pick any year present in the data
253
+ class EmbeddedYearWithBracketsFormat < ExtractorDateFormat
254
+ # [YYY]Y Y[YYY] [YY]YY Y[YY]Y YY[YY] YYY[Y] YY[Y]Y Y[Y]YY [Y]YYY
255
+ REGEX = /(?<prefix>-)?(?<year>[\d\[\]]{6})(?!\d)/
256
+
257
+ def self.normalize_to_edtf(text)
258
+ matches = text.match(REGEX)
259
+ "#{matches[:prefix]}#{matches[:year].gsub('[', '').gsub(']', '')}"
260
+ end
261
+ end
262
+
194
263
  attr_reader :date
195
264
 
196
265
  ##
@@ -199,20 +268,24 @@ module Mods
199
268
  # @param [String] text
200
269
  # @return [Date]
201
270
  def self.parse_date(text)
202
- ::Date.edtf(cleanup(text))
271
+ return nil if text == '0000-00-00'
272
+ ::Date.edtf(normalize_to_edtf(text))
203
273
  end
204
274
 
205
275
  ##
206
276
  # Apply any encoding-specific munging or text extraction logic
207
277
  # @param [String] text
208
278
  # @return [String]
209
- def self.cleanup(text)
210
- text.gsub(/^[\[]+/, '').gsub(/[\.\]]+$/, '')
279
+ def self.normalize_to_edtf(text)
280
+ sanitized = text.gsub(/^[\[]+/, '').gsub(/[\.\]]+$/, '')
281
+ sanitized = text.rjust(4, "0") if text =~ /^\d{3}$/
282
+
283
+ sanitized
211
284
  end
212
285
 
213
286
  def initialize(xml)
214
287
  @xml = xml
215
- @date = self.class.parse_date(xml.text)
288
+ @date = self.class.parse_date(xml.text.strip)
216
289
  end
217
290
 
218
291
  ##
@@ -265,6 +338,14 @@ module Mods
265
338
  xml.attr(:encoding)
266
339
  end
267
340
 
341
+ ##
342
+ # Is the date marked as a keyDate?
343
+ #
344
+ # @return [Boolean]
345
+ def key?
346
+ xml.attr(:keyDate) == 'yes'
347
+ end
348
+
268
349
  ##
269
350
  # Was an encoding provided?
270
351
  #
@@ -337,6 +418,25 @@ module Mods
337
418
  qualifier == 'questionable'
338
419
  end
339
420
 
421
+ def precision
422
+ if date_range.is_a? EDTF::Century
423
+ :century
424
+ elsif date_range.is_a? EDTF::Decade
425
+ :decade
426
+ else
427
+ case date.precision
428
+ when :month
429
+ date.unspecified.unspecified?(:month) ? :year : :month
430
+ when :day
431
+ d = date.unspecified.unspecified?(:day) ? :month : :day
432
+ d = date.unspecified.unspecified?(:month) ? :year : d
433
+ d
434
+ else
435
+ date.precision
436
+ end
437
+ end
438
+ end
439
+
340
440
  private
341
441
 
342
442
  def days_in_month(month, year)
@@ -356,15 +456,18 @@ module Mods
356
456
  return nil if date.nil?
357
457
 
358
458
  case date_range
459
+ when EDTF::Unknown
460
+ nil
359
461
  when EDTF::Epoch, EDTF::Interval
360
462
  date_range.min
361
463
  when EDTF::Set
362
464
  date_range.to_a.first
363
465
  else
364
466
  d = date.dup
365
- d = d.change(month: 1, day: 1) if date.unspecified.unspecified?(:year) || date.precision == :year
366
- d = d.change(month: 1) if date.unspecified.unspecified?(:month) || date.precision == :year
367
- d = d.change(day: 1) if date.unspecified.unspecified?(:day) || date.precision == :month
467
+ d = d.change(month: 1, day: 1) if date.precision == :year
468
+ d = d.change(day: 1) if date.precision == :month
469
+ d = d.change(month: 1) if date.unspecified.unspecified? :month
470
+ d = d.change(day: 1) if date.unspecified.unspecified? :day
368
471
  d
369
472
  end
370
473
  end
@@ -376,16 +479,20 @@ module Mods
376
479
  # @return [::Date]
377
480
  def latest_date
378
481
  return nil if date.nil?
482
+
379
483
  case date_range
484
+ when EDTF::Unknown
485
+ nil
380
486
  when EDTF::Epoch, EDTF::Interval
381
487
  date_range.max
382
488
  when EDTF::Set
383
489
  date_range.to_a.last.change(month: 12, day: 31)
384
490
  else
385
491
  d = date.dup
386
- d = d.change(month: 12, day: 31) if date.unspecified.unspecified?(:year) || date.precision == :year
387
- d = d.change(month: 12) if date.unspecified.unspecified?(:month) || date.precision == :year
388
- d = d.change(day: days_in_month(date.month, date.year)) if date.unspecified.unspecified?(:day) || date.precision == :month
492
+ d = d.change(month: 12, day: 31) if date.precision == :year
493
+ d = d.change(day: days_in_month(date.month, date.year)) if date.precision == :month
494
+ d = d.change(month: 12) if date.unspecified.unspecified? :month
495
+ d = d.change(day: days_in_month(date.month, date.year)) if date.unspecified.unspecified? :day
389
496
  d
390
497
  end
391
498
  end
@@ -1,4 +1,4 @@
1
1
  module Mods
2
2
  # this is the Ruby Gem version
3
- VERSION = "2.2.0"
3
+ VERSION = "2.3.0"
4
4
  end
@@ -72,6 +72,46 @@ RSpec.describe Mods::Date do
72
72
  end
73
73
  end
74
74
 
75
+ describe '#key?' do
76
+ context 'with keyDate=yes' do
77
+ let(:date_element) { "<dateCreated keyDate='yes'>1856</dateCreated>" }
78
+
79
+ it 'returns true' do
80
+ expect(date.key?).to eq true
81
+ end
82
+ end
83
+
84
+ context 'with a keyDate set to anything else' do
85
+ let(:date_element) { "<dateCreated keyDate='fictional'>1856</dateCreated>" }
86
+
87
+ it 'returns false' do
88
+ expect(date.key?).to eq false
89
+ end
90
+ end
91
+ end
92
+
93
+ describe '#precision' do
94
+ {
95
+ '1905' => :year,
96
+ '190u' => :decade,
97
+ '19uu' => :century,
98
+ '1900-uu' => :year,
99
+ '1900-uu-uu' => :year,
100
+ '1900-uu-15' => :year,
101
+ '1900-06' => :month,
102
+ '1900-06-uu' => :month,
103
+ '1900-06-15' => :day,
104
+ }.each do |data, expected|
105
+ describe "with #{data}" do
106
+ let(:date_element) { "<dateCreated encoding=\"edtf\">#{data}</dateCreated>" }
107
+
108
+ it "has the range #{expected}" do
109
+ expect(date.precision).to eq expected
110
+ end
111
+ end
112
+ end
113
+ end
114
+
75
115
  describe '#point' do
76
116
  let(:date_element) { "<dateCreated point='fictional'>1856</dateCreated>" }
77
117
 
@@ -188,7 +228,8 @@ RSpec.describe Mods::Date do
188
228
  '-1753' => Date.parse('-1753-01-01')..Date.parse('-1753-12-31'),
189
229
  '1992-05-06' => Date.parse('1992-05-06')..Date.parse('1992-05-06'),
190
230
  '1992-04' => Date.parse('1992-04-01')..Date.parse('1992-04-30'),
191
- '2004-02' => Date.parse('2004-02-01')..Date.parse('2004-02-29')
231
+ '2004-02' => Date.parse('2004-02-01')..Date.parse('2004-02-29'),
232
+ '123' => Date.parse('123-01-01')..Date.parse('123-12-31') # not technically valid, but we have lots of these
192
233
  }.each do |data, expected|
193
234
  describe "with #{data}" do
194
235
  let(:date_element) { "<dateCreated encoding=\"w3cdtf\">#{data}</dateCreated>" }
@@ -206,7 +247,8 @@ RSpec.describe Mods::Date do
206
247
  {
207
248
  '1234' => Date.parse('1234-01-01')..Date.parse('1234-12-31'),
208
249
  '9999' => nil,
209
- '1uuu' => Date.parse('1000-01-01')..Date.parse('1999-12-31')
250
+ '1uuu' => Date.parse('1000-01-01')..Date.parse('1999-12-31'),
251
+ '||||' => nil
210
252
  }.each do |data, expected|
211
253
  describe "with #{data}" do
212
254
  let(:date_element) { "<dateCreated encoding=\"marc\">#{data}</dateCreated>" }
@@ -243,7 +285,8 @@ RSpec.describe Mods::Date do
243
285
  '5/2/2017' => Date.parse('2017-05-02')..Date.parse('2017-05-02'),
244
286
  '12/1/2017' => Date.parse('2017-12-01')..Date.parse('2017-12-01'),
245
287
  '12/1/17' => Date.parse('2017-12-01')..Date.parse('2017-12-01'),
246
- '12/1/25' => Date.parse('1925-12-01')..Date.parse('1925-12-01')
288
+ '12/1/25' => Date.parse('1925-12-01')..Date.parse('1925-12-01'),
289
+ '6/18/938' => Date.parse('0938-06-18')..Date.parse('0938-06-18')
247
290
  }.each do |data, expected|
248
291
  describe "with #{data}" do
249
292
  let(:date_element) { "<dateCreated>#{data}</dateCreated>" }
@@ -269,7 +312,22 @@ RSpec.describe Mods::Date do
269
312
  'MDLXXVIII' => Date.parse('1578-01-01')..Date.parse('1578-12-31'),
270
313
  '[19--?]-' => Date.parse('1900-01-01')..Date.parse('1999-12-31'),
271
314
  '19th Century' => Date.parse('1800-01-01')..Date.parse('1899-12-31'),
272
- '19th c.' => Date.parse('1800-01-01')..Date.parse('1899-12-31')
315
+ '19th c.' => Date.parse('1800-01-01')..Date.parse('1899-12-31'),
316
+ 'mid to 2nd half of 13th century' => Date.parse('1200-01-01')..Date.parse('1299-12-31'),
317
+ '167-?]' => Date.parse('1670-01-01')..Date.parse('1679-12-31'),
318
+ '189-?' => Date.parse('1890-01-01')..Date.parse('1899-12-31'),
319
+ '193-' => Date.parse('1930-01-01')..Date.parse('1939-12-31'),
320
+ '196_' => Date.parse('1960-01-01')..Date.parse('1969-12-31'),
321
+ '196x' => Date.parse('1960-01-01')..Date.parse('1969-12-31'),
322
+ '186?' => Date.parse('1860-01-01')..Date.parse('1869-12-31'),
323
+ '1700?' => Date.parse('1700-01-01')..Date.parse('1700-12-31'),
324
+ '[1670-1684]' => Date.parse('1670-01-01')..Date.parse('1684-12-31'),
325
+ '[18]74' => Date.parse('1874-01-01')..Date.parse('1874-12-31'),
326
+ '250 B.C.' => Date.parse('-0249-01-01')..Date.parse('-249-12-31'),
327
+ 'Anno M.DC.LXXXI.' => Date.parse('1681-01-01')..Date.parse('1681-12-31'),
328
+ '624[1863 or 1864]' => Date.parse('1863-01-01')..Date.parse('1863-12-31'),
329
+ 'chez Villeneuve' => nil,
330
+ '‏4264681 או 368' => nil
273
331
  }.each do |data, expected|
274
332
  describe "with #{data}" do
275
333
  let(:date_element) { "<dateCreated>#{data}</dateCreated>" }
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mods
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.2.0
4
+ version: 2.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Naomi Dushay
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2017-12-01 00:00:00.000000000 Z
12
+ date: 2017-12-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri