mods 2.2.0 → 2.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c05154eec0a38fc2a0ad117d1c47fb336c116edf
4
- data.tar.gz: 9cbbfe083b89d92d0c18355be44136b82693e523
3
+ metadata.gz: b02fccf6e03afe427a8c23d821ebb2692b635f6d
4
+ data.tar.gz: d7f7fd87cd6ba14ab473dea40383f550f4cfa1e2
5
5
  SHA512:
6
- metadata.gz: 7d2ebefc59ba8cf3f6cdd816d429c45b238181158840aef30b34c806830d3c33de5dab7fcc76be3527a6f40ffe174e2087fc3e43ad17fe236a4d38e06f990c49
7
- data.tar.gz: 6ccff614c507ec6f3f541793d0407d90ef90ea8af5d38cb038cfdf560514ec4320e88ee74de60d5647db34f3e5de4b71aec071c9fcd8db7d8610a0ce22543a88
6
+ metadata.gz: e56288c902a448a532169703b9b8b49f95cc3e75a8b0351e8b893acf184caaf3090127b5f33c3c88ecaa0258afdc1d8b3312f963ea50aaaa7943bd7a97724c0a
7
+ data.tar.gz: 1aed6aa3932542f098d7a200f7c94f4210b3151f308da23eb668f01ce1f9c3a03028cb8af2206d6cef1b632149370bbdd815dfa37c7a02e8aa42ccc042c996d2
@@ -23,14 +23,21 @@ module Mods
23
23
  # when 'temper'
24
24
  # Mods::Date::TemperFormat.new(xml)
25
25
  else
26
- date_class = [
26
+ date_class = Mods::Date if xml.text =~ /\p{Hebrew}/
27
+ date_class ||= [
27
28
  MMDDYYYYFormat,
28
29
  MMDDYYFormat,
30
+ YearRangeFormat,
31
+ DecadeAsYearDashFormat,
32
+ EmbeddedBCYearFormat,
29
33
  EmbeddedYearFormat,
34
+ EmbeddedThreeDigitYearFormat,
35
+ EmbeddedYearWithBracketsFormat,
36
+ MysteryCenturyFormat,
37
+ CenturyFormat,
30
38
  RomanNumeralCenturyFormat,
31
39
  RomanNumeralYearFormat,
32
- MysteryCenturyFormat,
33
- CenturyFormat
40
+ OneOrTwoDigitYearFormat
34
41
  ].select { |klass| klass.supports? xml.text }.first
35
42
 
36
43
  (date_class || Mods::Date).new(xml)
@@ -42,7 +49,7 @@ module Mods
42
49
  # Strict ISO8601-encoded date parser
43
50
  class Iso8601Format < Date
44
51
  def self.parse_date(text)
45
- @date = ::Date.parse(cleanup(text))
52
+ @date = ::Date.parse(normalize_to_edtf(text))
46
53
  end
47
54
  end
48
55
 
@@ -54,18 +61,18 @@ module Mods
54
61
  class EdtfFormat < Date
55
62
  attr_reader :date
56
63
 
57
- def self.cleanup(text)
64
+ def self.normalize_to_edtf(text)
58
65
  text
59
66
  end
60
67
  end
61
68
 
62
69
  # MARC-formatted date parser, similar to EDTF, but with special support for
63
70
  # MARC-specific encodings
64
- class MarcFormat < EdtfFormat
65
- def self.cleanup(text)
66
- return nil if text == "9999" || text == "uuuu"
71
+ class MarcFormat < Date
72
+ def self.normalize_to_edtf(text)
73
+ return nil if text == "9999" || text == "uuuu" || text == '||||'
67
74
 
68
- text.gsub(/^[\[]+/, '').gsub(/[\.\]]+$/, '')
75
+ super
69
76
  end
70
77
 
71
78
  private
@@ -93,13 +100,13 @@ module Mods
93
100
  end
94
101
  end
95
102
 
96
- # Full text extractor for MM/DD/YYYY-formatted dates
103
+ # Full text extractor for MM/DD/YYYY and MM/DD/YYY-formatted dates
97
104
  class MMDDYYYYFormat < ExtractorDateFormat
98
- REGEX = /(?<month>\d{1,2})\/(?<day>\d{1,2})\/(?<year>\d{4})/
105
+ REGEX = /(?<month>\d{1,2})\/(?<day>\d{1,2})\/(?<year>\d{3,4})/
99
106
 
100
- def self.cleanup(text)
107
+ def self.normalize_to_edtf(text)
101
108
  matches = text.match(self::REGEX)
102
- "#{matches[:year].rjust(2, "0")}-#{matches[:month].rjust(2, "0")}-#{matches[:day].rjust(2, "0")}"
109
+ "#{matches[:year].rjust(4, "0")}-#{matches[:month].rjust(2, "0")}-#{matches[:day].rjust(2, "0")}"
103
110
  end
104
111
  end
105
112
 
@@ -107,7 +114,7 @@ module Mods
107
114
  class MMDDYYFormat < ExtractorDateFormat
108
115
  REGEX = /(?<month>\d{1,2})\/(?<day>\d{1,2})\/(?<year>\d{2})/
109
116
 
110
- def self.cleanup(text)
117
+ def self.normalize_to_edtf(text)
111
118
  matches = text.match(self::REGEX)
112
119
  year = munge_to_yyyy(matches[:year])
113
120
  "#{year}-#{matches[:month].rjust(2, "0")}-#{matches[:day].rjust(2, "0")}"
@@ -124,15 +131,15 @@ module Mods
124
131
 
125
132
  # Full-text extractor for dates encoded as Roman numerals
126
133
  class RomanNumeralYearFormat < ExtractorDateFormat
127
- REGEX = /^(?<year>[MCDLXVI]+)/
134
+ REGEX = /(?<![A-Za-z\.])(?<year>[MCDLXVI\.]+)(?![A-Za-z])/
128
135
 
129
- def self.cleanup(text)
136
+ def self.normalize_to_edtf(text)
130
137
  matches = text.match(REGEX)
131
138
  roman_to_int(matches[:year].upcase).to_s
132
139
  end
133
140
 
134
141
  def self.roman_to_int(value)
135
- value = value.dup
142
+ value = value.tr('.', '')
136
143
  map = { "M"=>1000, "CM"=>900, "D"=>500, "CD"=>400, "C"=>100, "XC"=>90, "L"=>50, "XL"=>40, "X"=>10, "IX"=>9, "V"=>5, "IV"=>4, "I"=>1 }
137
144
  result = 0
138
145
  map.each do |k,v|
@@ -145,11 +152,12 @@ module Mods
145
152
  end
146
153
  end
147
154
 
148
- # Full-text extractor for centuries encoded as Roman numerals
155
+ # Full-text extractor for centuries encoded as Roman numerals; sometimes centuries
156
+ # are given as e.g. xvith, hence the funny negative look-ahead assertion
149
157
  class RomanNumeralCenturyFormat < RomanNumeralYearFormat
150
- REGEX = /(cent. )?(?<century>[xvi]+)/
158
+ REGEX = /(?<![a-z])(?<century>[xvi]+)(?![a-su-z])/
151
159
 
152
- def self.cleanup(text)
160
+ def self.normalize_to_edtf(text)
153
161
  matches = text.match(REGEX)
154
162
  munge_to_yyyy(matches[:century])
155
163
  end
@@ -165,7 +173,7 @@ module Mods
165
173
  # of unknown origin.
166
174
  class MysteryCenturyFormat < ExtractorDateFormat
167
175
  REGEX = /(?<century>\d{2})--/
168
- def self.cleanup(text)
176
+ def self.normalize_to_edtf(text)
169
177
  matches = text.match(REGEX)
170
178
  "#{matches[:century]}XX"
171
179
  end
@@ -175,22 +183,83 @@ module Mods
175
183
  class CenturyFormat < ExtractorDateFormat
176
184
  REGEX = /(?<century>\d{2})th C(entury)?/i
177
185
 
178
- def self.cleanup(text)
186
+ def self.normalize_to_edtf(text)
179
187
  matches = text.match(REGEX)
180
188
  "#{matches[:century].to_i - 1}XX"
181
189
  end
182
190
  end
183
191
 
192
+ # Full-text extractor for data formatted as YYYY-YYYY or YYY-YYY
193
+ class YearRangeFormat < ExtractorDateFormat
194
+ REGEX = /(?<start>\d{3,4})-(?<end>\d{3,4})/
195
+
196
+ def self.normalize_to_edtf(text)
197
+ matches = text.match(REGEX)
198
+ "#{matches[:start].rjust(4, "0")}/#{matches[:end].rjust(4, "0")}"
199
+ end
200
+ end
201
+
202
+ # Full-text extractor for data formatted as YYY-
203
+ class DecadeAsYearDashFormat < ExtractorDateFormat
204
+ REGEX = /(?<!\d)(?<year>\d{3})[-_x?](?!\d)/
205
+
206
+ def self.normalize_to_edtf(text)
207
+ matches = text.match(REGEX)
208
+ "#{matches[:year]}X"
209
+ end
210
+ end
211
+
212
+ # Full-text extractor that tries hard to pick any year present in the data
213
+ class EmbeddedBCYearFormat < ExtractorDateFormat
214
+ REGEX = /(?<year>\d{3,4})\s?B\.?C\.?/i
215
+
216
+ def self.normalize_to_edtf(text)
217
+ matches = text.match(REGEX)
218
+ "-#{(matches[:year].to_i - 1).to_s.rjust(4, "0")}"
219
+ end
220
+ end
221
+
184
222
  # Full-text extractor that tries hard to pick any year present in the data
185
223
  class EmbeddedYearFormat < ExtractorDateFormat
186
- REGEX = /(?<prefix>-)?(?<year>\d{3,4})/
224
+ REGEX = /(?<prefix>-)?(?<!\d)(?<year>\d{4})(?!\d)/
187
225
 
188
- def self.cleanup(text)
226
+ def self.normalize_to_edtf(text)
189
227
  matches = text.match(REGEX)
190
228
  "#{matches[:prefix]}#{matches[:year].rjust(4, "0")}"
191
229
  end
192
230
  end
193
231
 
232
+ # Full-text extractor that tries hard to pick any year present in the data
233
+ class EmbeddedThreeDigitYearFormat < ExtractorDateFormat
234
+ REGEX = /(?<prefix>-)?(?<!\d)(?<year>\d{3})(?!\d)(?!\d)/
235
+
236
+ def self.normalize_to_edtf(text)
237
+ matches = text.match(REGEX)
238
+ "#{matches[:prefix]}#{matches[:year].rjust(4, "0")}"
239
+ end
240
+ end
241
+
242
+ # Full-text extractor that tries hard to pick any year present in the data
243
+ class OneOrTwoDigitYearFormat < ExtractorDateFormat
244
+ REGEX = /^(?<prefix>-)?(?<year>\d{1,2})$/
245
+
246
+ def self.normalize_to_edtf(text)
247
+ matches = text.match(REGEX)
248
+ "#{matches[:prefix]}#{matches[:year].rjust(4, "0")}"
249
+ end
250
+ end
251
+
252
+ # Full-text extractor that tries hard to pick any year present in the data
253
+ class EmbeddedYearWithBracketsFormat < ExtractorDateFormat
254
+ # [YYY]Y Y[YYY] [YY]YY Y[YY]Y YY[YY] YYY[Y] YY[Y]Y Y[Y]YY [Y]YYY
255
+ REGEX = /(?<prefix>-)?(?<year>[\d\[\]]{6})(?!\d)/
256
+
257
+ def self.normalize_to_edtf(text)
258
+ matches = text.match(REGEX)
259
+ "#{matches[:prefix]}#{matches[:year].gsub('[', '').gsub(']', '')}"
260
+ end
261
+ end
262
+
194
263
  attr_reader :date
195
264
 
196
265
  ##
@@ -199,20 +268,24 @@ module Mods
199
268
  # @param [String] text
200
269
  # @return [Date]
201
270
  def self.parse_date(text)
202
- ::Date.edtf(cleanup(text))
271
+ return nil if text == '0000-00-00'
272
+ ::Date.edtf(normalize_to_edtf(text))
203
273
  end
204
274
 
205
275
  ##
206
276
  # Apply any encoding-specific munging or text extraction logic
207
277
  # @param [String] text
208
278
  # @return [String]
209
- def self.cleanup(text)
210
- text.gsub(/^[\[]+/, '').gsub(/[\.\]]+$/, '')
279
+ def self.normalize_to_edtf(text)
280
+ sanitized = text.gsub(/^[\[]+/, '').gsub(/[\.\]]+$/, '')
281
+ sanitized = text.rjust(4, "0") if text =~ /^\d{3}$/
282
+
283
+ sanitized
211
284
  end
212
285
 
213
286
  def initialize(xml)
214
287
  @xml = xml
215
- @date = self.class.parse_date(xml.text)
288
+ @date = self.class.parse_date(xml.text.strip)
216
289
  end
217
290
 
218
291
  ##
@@ -265,6 +338,14 @@ module Mods
265
338
  xml.attr(:encoding)
266
339
  end
267
340
 
341
+ ##
342
+ # Is the date marked as a keyDate?
343
+ #
344
+ # @return [Boolean]
345
+ def key?
346
+ xml.attr(:keyDate) == 'yes'
347
+ end
348
+
268
349
  ##
269
350
  # Was an encoding provided?
270
351
  #
@@ -337,6 +418,25 @@ module Mods
337
418
  qualifier == 'questionable'
338
419
  end
339
420
 
421
+ def precision
422
+ if date_range.is_a? EDTF::Century
423
+ :century
424
+ elsif date_range.is_a? EDTF::Decade
425
+ :decade
426
+ else
427
+ case date.precision
428
+ when :month
429
+ date.unspecified.unspecified?(:month) ? :year : :month
430
+ when :day
431
+ d = date.unspecified.unspecified?(:day) ? :month : :day
432
+ d = date.unspecified.unspecified?(:month) ? :year : d
433
+ d
434
+ else
435
+ date.precision
436
+ end
437
+ end
438
+ end
439
+
340
440
  private
341
441
 
342
442
  def days_in_month(month, year)
@@ -356,15 +456,18 @@ module Mods
356
456
  return nil if date.nil?
357
457
 
358
458
  case date_range
459
+ when EDTF::Unknown
460
+ nil
359
461
  when EDTF::Epoch, EDTF::Interval
360
462
  date_range.min
361
463
  when EDTF::Set
362
464
  date_range.to_a.first
363
465
  else
364
466
  d = date.dup
365
- d = d.change(month: 1, day: 1) if date.unspecified.unspecified?(:year) || date.precision == :year
366
- d = d.change(month: 1) if date.unspecified.unspecified?(:month) || date.precision == :year
367
- d = d.change(day: 1) if date.unspecified.unspecified?(:day) || date.precision == :month
467
+ d = d.change(month: 1, day: 1) if date.precision == :year
468
+ d = d.change(day: 1) if date.precision == :month
469
+ d = d.change(month: 1) if date.unspecified.unspecified? :month
470
+ d = d.change(day: 1) if date.unspecified.unspecified? :day
368
471
  d
369
472
  end
370
473
  end
@@ -376,16 +479,20 @@ module Mods
376
479
  # @return [::Date]
377
480
  def latest_date
378
481
  return nil if date.nil?
482
+
379
483
  case date_range
484
+ when EDTF::Unknown
485
+ nil
380
486
  when EDTF::Epoch, EDTF::Interval
381
487
  date_range.max
382
488
  when EDTF::Set
383
489
  date_range.to_a.last.change(month: 12, day: 31)
384
490
  else
385
491
  d = date.dup
386
- d = d.change(month: 12, day: 31) if date.unspecified.unspecified?(:year) || date.precision == :year
387
- d = d.change(month: 12) if date.unspecified.unspecified?(:month) || date.precision == :year
388
- d = d.change(day: days_in_month(date.month, date.year)) if date.unspecified.unspecified?(:day) || date.precision == :month
492
+ d = d.change(month: 12, day: 31) if date.precision == :year
493
+ d = d.change(day: days_in_month(date.month, date.year)) if date.precision == :month
494
+ d = d.change(month: 12) if date.unspecified.unspecified? :month
495
+ d = d.change(day: days_in_month(date.month, date.year)) if date.unspecified.unspecified? :day
389
496
  d
390
497
  end
391
498
  end
@@ -1,4 +1,4 @@
1
1
  module Mods
2
2
  # this is the Ruby Gem version
3
- VERSION = "2.2.0"
3
+ VERSION = "2.3.0"
4
4
  end
@@ -72,6 +72,46 @@ RSpec.describe Mods::Date do
72
72
  end
73
73
  end
74
74
 
75
+ describe '#key?' do
76
+ context 'with keyDate=yes' do
77
+ let(:date_element) { "<dateCreated keyDate='yes'>1856</dateCreated>" }
78
+
79
+ it 'returns true' do
80
+ expect(date.key?).to eq true
81
+ end
82
+ end
83
+
84
+ context 'with a keyDate set to anything else' do
85
+ let(:date_element) { "<dateCreated keyDate='fictional'>1856</dateCreated>" }
86
+
87
+ it 'returns false' do
88
+ expect(date.key?).to eq false
89
+ end
90
+ end
91
+ end
92
+
93
+ describe '#precision' do
94
+ {
95
+ '1905' => :year,
96
+ '190u' => :decade,
97
+ '19uu' => :century,
98
+ '1900-uu' => :year,
99
+ '1900-uu-uu' => :year,
100
+ '1900-uu-15' => :year,
101
+ '1900-06' => :month,
102
+ '1900-06-uu' => :month,
103
+ '1900-06-15' => :day,
104
+ }.each do |data, expected|
105
+ describe "with #{data}" do
106
+ let(:date_element) { "<dateCreated encoding=\"edtf\">#{data}</dateCreated>" }
107
+
108
+ it "has the range #{expected}" do
109
+ expect(date.precision).to eq expected
110
+ end
111
+ end
112
+ end
113
+ end
114
+
75
115
  describe '#point' do
76
116
  let(:date_element) { "<dateCreated point='fictional'>1856</dateCreated>" }
77
117
 
@@ -188,7 +228,8 @@ RSpec.describe Mods::Date do
188
228
  '-1753' => Date.parse('-1753-01-01')..Date.parse('-1753-12-31'),
189
229
  '1992-05-06' => Date.parse('1992-05-06')..Date.parse('1992-05-06'),
190
230
  '1992-04' => Date.parse('1992-04-01')..Date.parse('1992-04-30'),
191
- '2004-02' => Date.parse('2004-02-01')..Date.parse('2004-02-29')
231
+ '2004-02' => Date.parse('2004-02-01')..Date.parse('2004-02-29'),
232
+ '123' => Date.parse('123-01-01')..Date.parse('123-12-31') # not technically valid, but we have lots of these
192
233
  }.each do |data, expected|
193
234
  describe "with #{data}" do
194
235
  let(:date_element) { "<dateCreated encoding=\"w3cdtf\">#{data}</dateCreated>" }
@@ -206,7 +247,8 @@ RSpec.describe Mods::Date do
206
247
  {
207
248
  '1234' => Date.parse('1234-01-01')..Date.parse('1234-12-31'),
208
249
  '9999' => nil,
209
- '1uuu' => Date.parse('1000-01-01')..Date.parse('1999-12-31')
250
+ '1uuu' => Date.parse('1000-01-01')..Date.parse('1999-12-31'),
251
+ '||||' => nil
210
252
  }.each do |data, expected|
211
253
  describe "with #{data}" do
212
254
  let(:date_element) { "<dateCreated encoding=\"marc\">#{data}</dateCreated>" }
@@ -243,7 +285,8 @@ RSpec.describe Mods::Date do
243
285
  '5/2/2017' => Date.parse('2017-05-02')..Date.parse('2017-05-02'),
244
286
  '12/1/2017' => Date.parse('2017-12-01')..Date.parse('2017-12-01'),
245
287
  '12/1/17' => Date.parse('2017-12-01')..Date.parse('2017-12-01'),
246
- '12/1/25' => Date.parse('1925-12-01')..Date.parse('1925-12-01')
288
+ '12/1/25' => Date.parse('1925-12-01')..Date.parse('1925-12-01'),
289
+ '6/18/938' => Date.parse('0938-06-18')..Date.parse('0938-06-18')
247
290
  }.each do |data, expected|
248
291
  describe "with #{data}" do
249
292
  let(:date_element) { "<dateCreated>#{data}</dateCreated>" }
@@ -269,7 +312,22 @@ RSpec.describe Mods::Date do
269
312
  'MDLXXVIII' => Date.parse('1578-01-01')..Date.parse('1578-12-31'),
270
313
  '[19--?]-' => Date.parse('1900-01-01')..Date.parse('1999-12-31'),
271
314
  '19th Century' => Date.parse('1800-01-01')..Date.parse('1899-12-31'),
272
- '19th c.' => Date.parse('1800-01-01')..Date.parse('1899-12-31')
315
+ '19th c.' => Date.parse('1800-01-01')..Date.parse('1899-12-31'),
316
+ 'mid to 2nd half of 13th century' => Date.parse('1200-01-01')..Date.parse('1299-12-31'),
317
+ '167-?]' => Date.parse('1670-01-01')..Date.parse('1679-12-31'),
318
+ '189-?' => Date.parse('1890-01-01')..Date.parse('1899-12-31'),
319
+ '193-' => Date.parse('1930-01-01')..Date.parse('1939-12-31'),
320
+ '196_' => Date.parse('1960-01-01')..Date.parse('1969-12-31'),
321
+ '196x' => Date.parse('1960-01-01')..Date.parse('1969-12-31'),
322
+ '186?' => Date.parse('1860-01-01')..Date.parse('1869-12-31'),
323
+ '1700?' => Date.parse('1700-01-01')..Date.parse('1700-12-31'),
324
+ '[1670-1684]' => Date.parse('1670-01-01')..Date.parse('1684-12-31'),
325
+ '[18]74' => Date.parse('1874-01-01')..Date.parse('1874-12-31'),
326
+ '250 B.C.' => Date.parse('-0249-01-01')..Date.parse('-249-12-31'),
327
+ 'Anno M.DC.LXXXI.' => Date.parse('1681-01-01')..Date.parse('1681-12-31'),
328
+ '624[1863 or 1864]' => Date.parse('1863-01-01')..Date.parse('1863-12-31'),
329
+ 'chez Villeneuve' => nil,
330
+ '‏4264681 או 368' => nil
273
331
  }.each do |data, expected|
274
332
  describe "with #{data}" do
275
333
  let(:date_element) { "<dateCreated>#{data}</dateCreated>" }
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mods
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.2.0
4
+ version: 2.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Naomi Dushay
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2017-12-01 00:00:00.000000000 Z
12
+ date: 2017-12-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri