cocina_display 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +4 -10
- data/lib/cocina_display/cocina_record.rb +14 -1
- data/lib/cocina_display/concerns/events.rb +137 -0
- data/lib/cocina_display/dates/date.rb +688 -0
- data/lib/cocina_display/dates/date_range.rb +122 -0
- data/lib/cocina_display/imprint.rb +139 -0
- data/lib/cocina_display/marc_country_codes.rb +394 -0
- data/lib/cocina_display/version.rb +1 -1
- metadata +21 -2
@@ -0,0 +1,688 @@
|
|
1
|
+
require "edtf"
|
2
|
+
|
3
|
+
require "active_support"
|
4
|
+
require "active_support/core_ext/object/blank"
|
5
|
+
require "active_support/core_ext/integer/inflections"
|
6
|
+
|
7
|
+
module CocinaDisplay
|
8
|
+
module Dates
|
9
|
+
# A date to be converted to a Date object.
|
10
|
+
class Date
|
11
|
+
# List of values that we shouldn't even attempt to parse.
|
12
|
+
UNPARSABLE_VALUES = ["0000-00-00", "9999", "uuuu", "[uuuu]"].freeze
|
13
|
+
|
14
|
+
# Construct a Date from parsed Cocina data.
# Picks the parser from the declared encoding when there is one; otherwise
# tries each regex-based extractor in order and falls back to the base class.
# @param cocina [Hash] Cocina date data
# @return [CocinaDisplay::Dates::Date, CocinaDisplay::Dates::DateRange]
def self.from_cocina(cocina)
  # Create a DateRange instead if structuredValue(s) are present
  return DateRange.from_cocina(cocina) if cocina["structuredValue"].present?

  # If an encoding was declared, use it. Cocina validates this
  case cocina.dig("encoding", "code")
  when "w3cdtf" then W3cdtfFormat.new(cocina)
  when "iso8601" then Iso8601Format.new(cocina)
  when "marc" then MarcFormat.new(cocina)
  when "edtf" then EdtfFormat.new(cocina)
  else # No declared encoding, or unknown encoding
    value = cocina["value"]

    # Don't bother with missing/blank or weird unparseable values. The blank
    # check comes first so a nil value never reaches the String-only regex
    # matching below (it used to raise NoMethodError in supports?).
    date_class = UnparseableDate if value.blank? || value.match?(/\p{Hebrew}/) || value.match?(/^-/)

    # Try to match against known date formats using their regexes
    # Order matters here; more specific formats should be checked first
    date_class ||= [
      MMDDYYYYFormat,
      MMDDYYFormat,
      YearRangeFormat,
      DecadeAsYearDashFormat,
      DecadeStringFormat,
      EmbeddedBCYearFormat,
      EmbeddedYearFormat,
      EmbeddedThreeDigitYearFormat,
      EmbeddedYearWithBracketsFormat,
      MysteryCenturyFormat,
      CenturyFormat,
      RomanNumeralCenturyFormat,
      RomanNumeralYearFormat,
      OneOrTwoDigitYearFormat
    ].find { |klass| klass.supports?(value) }

    # If no specific format matched, use the base class
    date_class ||= CocinaDisplay::Dates::Date

    date_class.new(cocina)
  end
end
|
62
|
+
|
63
|
+
# Parse a date string with the EDTF parser.
# The string is first passed through this class's {normalize_to_edtf}, so
# subclasses customize parsing by overriding that method.
# @param value [String] the date value to parse
# @return [Date]
# @return [nil] if the date is blank or invalid
def self.parse_date(value)
  edtf_string = normalize_to_edtf(value)
  ::Date.edtf(edtf_string)
end
|
71
|
+
|
72
|
+
# Apply any encoding-specific munging or text extraction logic.
# Strips leading "[" characters and trailing "." / "]" characters, then
# left-pads a bare three-digit year with a zero.
# @note This is the "fallback" version when no other parser matches.
# @param value [String] the date value to modify
# @return [String]
def self.normalize_to_edtf(value)
  sanitized = value.gsub(/^[\[]+/, "").gsub(/[\.\]]+$/, "")

  # Pad the sanitized text rather than the raw value, so a bracketed
  # three-digit year such as "[966]" is padded too.
  /^\d{3}$/.match?(sanitized) ? sanitized.rjust(4, "0") : sanitized
end
|
82
|
+
|
83
|
+
attr_reader :cocina, :date
|
84
|
+
|
85
|
+
# Keep the raw Cocina data and eagerly parse its "value" with this class's parser.
# @param cocina [Hash] Cocina date data
def initialize(cocina)
  @cocina = cocina
  raw_value = cocina["value"]
  @date = self.class.parse_date(raw_value)
end
|
89
|
+
|
90
|
+
# Order this date relative to another {Date} or {DateRange} by {sort_key}.
# @return [Integer, nil] nil when +other+ is neither a Date nor a DateRange
def <=>(other)
  return unless other.is_a?(Date) || other.is_a?(DateRange)

  sort_key <=> other.sort_key
end
|
94
|
+
|
95
|
+
# The date text exactly as stored under "value" in the Cocina data.
# @return [String]
def value
  cocina["value"]
end
|
100
|
+
|
101
|
+
# The "type" of this date from the Cocina data, e.g. "creation" or "publication".
# @return [String, nil]
def type
  cocina["type"]
end
|
106
|
+
|
107
|
+
# The "qualifier" of this date from the Cocina data, e.g. "approximate" or "inferred".
# @return [String, nil]
def qualifier
  cocina["qualifier"]
end
|
112
|
+
|
113
|
+
# Whether any qualifier (e.g. "approximate", "inferred") is set on this date.
# @return [Boolean]
def qualified?
  qualifier.present?
end
|
118
|
+
|
119
|
+
# The declared encoding code of this date, read from "encoding" => "code".
# @example
#   date.encoding #=> "iso8601"
# @return [String, nil]
def encoding
  cocina.dig("encoding", "code")
end
|
126
|
+
|
127
|
+
# Whether the Cocina data declares an encoding for this date.
# @return [Boolean]
def encoding?
  encoding.present?
end
|
132
|
+
|
133
|
+
# Whether this date is typed as the "start" of a range.
# @return [Boolean]
def start?
  type == "start"
end
|
138
|
+
|
139
|
+
# Whether this date is typed as the "end" of a range.
# @return [Boolean]
def end?
  type == "end"
end
|
144
|
+
|
145
|
+
# Whether the qualifier of this date is "approximate".
# @return [Boolean]
def approximate?
  qualifier == "approximate"
end
|
150
|
+
|
151
|
+
# Whether the qualifier of this date is "inferred".
# @return [Boolean]
def inferred?
  qualifier == "inferred"
end
|
156
|
+
|
157
|
+
# Whether the qualifier of this date is "questionable".
# @return [Boolean]
def questionable?
  qualifier == "questionable"
end
|
162
|
+
|
163
|
+
# Whether the "status" of this date in the Cocina data is "primary".
# @note In MODS XML, this corresponds to the +keyDate+ attribute.
# @return [Boolean]
def primary?
  status = cocina["status"]
  status == "primary"
end
|
169
|
+
|
170
|
+
# Whether there is a value worth parsing: it must be present and must not be
# one of the placeholder strings listed in UNPARSABLE_VALUES (e.g. "9999").
# @return [Boolean]
def parsable?
  return false unless value.present?

  !UNPARSABLE_VALUES.include?(value)
end
|
175
|
+
|
176
|
+
# Whether parsing the Cocina value produced a date object.
# @return [Boolean]
def parsed_date?
  date.present?
end
|
181
|
+
|
182
|
+
# How precise is the parsed date information?
# Century and decade come from the expanded {date_range}; seasons are
# reported as :month; intervals report their own precision. For plain dates
# an unspecified month or day lowers the reported precision accordingly.
# @return [Symbol] :year, :month, :day, :decade, :century, or :unknown
def precision
  return :unknown unless date_range || date
  return :century if date_range.is_a?(EDTF::Century)
  return :decade if date_range.is_a?(EDTF::Decade)
  return :month if date.is_a?(EDTF::Season)
  return date.precision if date.is_a?(EDTF::Interval)

  case date.precision
  when :month
    date.unspecified.unspecified?(:month) ? :year : :month
  when :day
    if date.unspecified.unspecified?(:month)
      :year
    elsif date.unspecified.unspecified?(:day)
      :month
    else
      :day
    end
  else
    date.precision
  end
end
|
207
|
+
|
208
|
+
# Digit inversion table (0<->9, 1<->8, ...) used to make BCE years sort
# correctly in lexicographic order.
BCE_CHAR_SORT_MAP = {
  "0" => "9", "1" => "8", "2" => "7", "3" => "6", "4" => "5",
  "5" => "4", "6" => "3", "7" => "2", "8" => "1", "9" => "0"
}.freeze

# Key used to sort this date. Respects BCE/CE ordering and precision.
# @return [String] empty when no date was parsed
def sort_key
  # Even if not parsed, we might need to sort it for display later
  return "" unless parsed_date?

  # Intervals sort by their start
  sortable = date.is_a?(EDTF::Interval) ? date.from : date
  year, month, day = sortable.respond_to?(:values) ? sortable.values : [sortable.year, nil, nil]

  year_part =
    if year.positive?
      # CE years sort naturally once zero-padded to four digits
      format("%04d", year)
    else
      # BCE years are sorted lexically, so every digit is inverted and the
      # result is prefixed with a hyphen (sorts before any digit) plus the
      # inverted digit count.
      inverted = year.abs.to_s.each_char.map { |digit| BCE_CHAR_SORT_MAP[digit] }.join
      "-#{BCE_CHAR_SORT_MAP[inverted.length.to_s]}#{inverted}"
    end

  # Missing month/day become "00"; present ones are padded to two digits
  month_part = (month || 0).to_s.rjust(2, "0")
  day_part = (day || 0).to_s.rjust(2, "0")

  # Hyphens replace the dropped year digits so decade/century sort first
  case precision
  when :decade then "#{year_part[0...-1]}-#{month_part}#{day_part}"
  when :century then "#{year_part[0...-2]}--#{month_part}#{day_part}"
  else "#{year_part}#{month_part}#{day_part}"
  end
end
|
254
|
+
|
255
|
+
# Value reduced to digits and hyphen. Used for comparison/deduping.
# A 1xxx year followed by a two-digit end ("1990-95", optionally bracketed
# or with "?") has its end expanded to four digits and is returned as is.
# Otherwise x/u/- placeholders that follow 1-3 digits become zeroes and
# everything except digits and hyphens is dropped.
# @note This is important for uniqueness checks in Imprint display.
# @return [String]
def base_value
  if value.match?(/^\[?1\d{3}-\d{2}\??\]?$/)
    return value.sub(/(\d{2})(\d{2})-(\d{2})/, '\1\2-\1\3')
  end

  zero_filled = value.gsub(/(?<![\d])(\d{1,3})([xu-]{1,3})/i) do
    digits, placeholders = Regexp.last_match.captures
    "#{digits}#{"0" * placeholders.length}"
  end

  zero_filled.scan(/[\d-]/).join
end
|
265
|
+
|
266
|
+
# Decoded version of the date with "BCE" or "CE". Strips leading zeroes.
# @param allowed_precisions [Array<Symbol>] List of allowed precisions for the output.
#   Defaults to [:day, :month, :year, :decade, :century].
# @param ignore_unparseable [Boolean] Return nil instead of the original value if it couldn't be parsed
# @param display_original_value [Boolean] Return the original value if it was not encoded
# @return [String, nil] nil only when ignore_unparseable is set and nothing was parsed
def decoded_value(allowed_precisions: [:day, :month, :year, :decade, :century], ignore_unparseable: false, display_original_value: true)
  unless parsed_date?
    return ignore_unparseable ? nil : value.strip
  end

  # Without a declared encoding, anything that is not a bare (possibly
  # negative) number or a four-character year pattern is shown as written.
  if display_original_value && !encoding? && !value.match?(/^-?\d+$/) && !value.match?(/^[\dXxu?-]{4}$/)
    return value.strip
  end

  unless date.is_a?(EDTF::Interval)
    return Date.format_date(date, precision, allowed_precisions) || value.strip
  end

  # Intervals show both endpoints, collapsed when they format identically
  endpoints = [date.min, date.max].map do |endpoint|
    Date.format_date(endpoint, endpoint.precision, allowed_precisions)
  end
  endpoints = endpoints.uniq.compact

  endpoints.empty? ? value.strip : endpoints.join(" - ")
end
|
295
|
+
|
296
|
+
# Decoded date with "BCE" or "CE" and qualifier markers applied:
# "[ca. X]" for approximate, "[X?]" for questionable, "[X]" for inferred,
# and the plain decoded value otherwise.
# @see decoded_value
# @see https://consul.stanford.edu/display/chimera/MODS+display+rules#MODSdisplayrules-3b.%3CoriginInfo%3E
# @return [String]
def qualified_value
  template = {
    "approximate" => "[ca. %s]",
    "questionable" => "[%s?]",
    "inferred" => "[%s]"
  }.fetch(qualifier, "%s")

  format(template, decoded_value)
end
|
313
|
+
|
314
|
+
# Range between earliest possible date and latest possible date.
# @note Some encodings support disjoint sets of ranges, so this method could be less accurate than {#to_a}.
# @return [Range, nil] nil when either bound is unavailable
def as_range
  lower = earliest_date
  upper = lower && latest_date
  return unless upper

  lower..upper
end
|
322
|
+
|
323
|
+
# Array of all dates that fall into the range of possible dates in the data.
# EDTF sets enumerate their own members; everything else enumerates {#as_range}.
# @note Some encodings support disjoint sets of ranges, so this method could be more accurate than {#as_range}.
# @return [Array]
def to_a
  source = date.is_a?(EDTF::Set) ? date : as_range
  source.to_a
end
|
334
|
+
|
335
|
+
private
|
336
|
+
|
337
|
+
class << self
  # Returns the date in the format specified by the precision.
  # Supports e.g. retrieving year precision when the actual date is more precise.
  # @param date [Date] The date to format.
  # @param precision [Symbol] The precision to format the date at, e.g. :month
  # @param allowed_precisions [Array<Symbol>] List of allowed precisions for the output.
  #   Options are [:day, :month, :year, :decade, :century].
  # @note allowed_precisions should be ordered by granularity, with most specific first.
  # @return [String, nil] nil when the effective precision is not one of the options
  def format_date(date, precision, allowed_precisions)
    # A precision that is not allowed falls back to the first allowed one
    precision = allowed_precisions.first unless allowed_precisions.include?(precision)

    case precision
    when :day
      # %-d is the unpadded day; %e space-pads single digits ("May  3, 1990")
      date.strftime("%B %-d, %Y")
    when :month
      date.strftime("%B %Y")
    when :year
      year = date.year
      if year < 1
        # Year 0 is 1 BCE, -1 is 2 BCE, and so on
        "#{year.abs + 1} BCE"
      elsif year < 1000
        # Any dates before the year 1000 are explicitly marked CE
        "#{year} CE"
      else
        year.to_s
      end
    when :decade
      "#{EDTF::Decade.new(date.year).year}s"
    when :century
      if date.year.negative?
        "#{((date.year / 100).abs + 1).ordinalize} century BCE"
      else
        "#{((date.year / 100) + 1).ordinalize} century"
      end
    end
  end
end
|
374
|
+
|
375
|
+
# Earliest possible date encoded in data, respecting unspecified/imprecise info.
# Range-like values report their own minimum; plain dates are moved to the
# first month/day wherever the month or day is imprecise or unspecified.
# @return [Date, nil]
def earliest_date
  return nil if date.nil?

  case date_range
  when EDTF::Unknown then nil
  when EDTF::Epoch, EDTF::Interval, EDTF::Season then date_range.min
  when EDTF::Set then date_range.to_a.first
  else
    floor = date.dup
    floor = floor.change(month: 1, day: 1) if date.precision == :year
    floor = floor.change(day: 1) if date.precision == :month
    floor = floor.change(month: 1) if date.unspecified.unspecified?(:month)
    floor = floor.change(day: 1) if date.unspecified.unspecified?(:day)
    floor
  end
end
|
396
|
+
|
397
|
+
# Latest possible date encoded in data, respecting unspecified/imprecise info.
# Range-like values report their own maximum; plain dates are moved to the
# last month/day wherever the month or day is imprecise or unspecified.
# @return [Date, nil]
def latest_date
  return nil if date.nil?

  case date_range
  when EDTF::Unknown then nil
  when EDTF::Epoch, EDTF::Interval, EDTF::Season then date_range.max
  when EDTF::Set then date_range.to_a.last.change(month: 12, day: 31)
  else
    ceiling = date.dup
    ceiling = ceiling.change(month: 12, day: 31) if date.precision == :year
    ceiling = ceiling.change(day: days_in_month(date.month, date.year)) if date.precision == :month
    ceiling = ceiling.change(month: 12) if date.unspecified.unspecified?(:month)
    ceiling = ceiling.change(day: days_in_month(date.month, date.year)) if date.unspecified.unspecified?(:day)
    ceiling
  end
end
|
418
|
+
|
419
|
+
# Expand placeholders into an object representing the full range.
# Only values containing a "u" are re-parsed (with every "u"/"X" turned into
# "x"); all other values, or a failed re-parse, yield the parsed {date}.
# The result is memoized.
# @note This is different from dates with an explicit start/end in the Cocina.
# @see CocinaDisplay::Dates::DateRange
# @return [Date]
def date_range
  return @date_range if @date_range

  expanded = ::Date.edtf(value.tr("uX", "x")) if /u/.match?(value)
  @date_range = expanded || date
end
|
430
|
+
|
431
|
+
# Number of days in the given month, with February counted as 29 days in
# Gregorian leap years.
# @param [Integer] month
# @param [Integer] year
# @return [Integer] Number of days in the month
def days_in_month(month, year)
  return 29 if month == 2 && ::Date.gregorian_leap?(year)

  # Index 0 is a placeholder so months can be looked up by their number
  lengths = [nil, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
  lengths[month]
end
|
442
|
+
end
|
443
|
+
|
444
|
+
# Strict ISO8601-encoded date parser.
class Iso8601Format < Date
  # Parse the normalized value with the standard library date parser.
  # @param value [String] the date value to parse
  # @return [::Date]
  # @return [nil] if the value is not a valid date
  def self.parse_date(value)
    ::Date.parse(normalize_to_edtf(value))
  rescue ArgumentError
    # ::Date.parse raises on invalid input (Date::Error is an ArgumentError),
    # whereas the documented parse_date contract is to return nil; without
    # this, one bad value aborts construction of the date object.
    nil
  end
end
|
450
|
+
|
451
|
+
# Less strict W3CDTF-encoded date parser.
class W3cdtfFormat < Date
  # Apply the base normalization, then drop every "-00" segment
  # (e.g. "2000-00-00" becomes "2000").
  # @param value [String] the date value to modify
  # @return [String]
  def self.normalize_to_edtf(value)
    normalized = super
    normalized.gsub("-00", "")
  end
end
|
457
|
+
|
458
|
+
# Strict EDTF parser.
class EdtfFormat < Date
  # Zero-pad short years so the EDTF parser sees four digits.
  # "0" becomes "0000"; 1-3 digit years are left-padded ("5" -> "0005"),
  # keeping a leading minus sign ("-5" -> "-0005"). Anything else is
  # returned unchanged.
  # @param value [String] the date value to modify
  # @return [String]
  def self.normalize_to_edtf(value)
    return "0000" if value.strip == "0"

    case value
    when /^\d{1,3}$/
      value.rjust(4, "0")
    when /^-\d{1,3}$/
      "-#{value.sub(/^-/, "").rjust(4, "0")}"
    else
      value
    end
  end
end
|
475
|
+
|
476
|
+
# MARC date parser; similar to EDTF but with some MARC-specific encodings.
class MarcFormat < Date
  # The placeholder values "9999", "uuuu" and "||||" yield nil (nothing to
  # parse); everything else goes through the base normalization.
  # @param value [String] the date value to modify
  # @return [String, nil]
  def self.normalize_to_edtf(value)
    return nil if %w[9999 uuuu ||||].include?(value)

    super
  end

  private

  # "1uuu" is pinned to the start of the year 1000; otherwise defer to the base class.
  def earliest_date
    return super unless value == "1uuu"

    ::Date.parse("1000-01-01")
  end

  # "1uuu" is pinned to the end of the year 1999; otherwise defer to the base class.
  def latest_date
    return super unless value == "1uuu"

    ::Date.parse("1999-12-31")
  end
end
|
502
|
+
|
503
|
+
# Base class for date formats that match using a regex.
# Each subclass supplies its own REGEX constant.
class ExtractorDateFormat < Date
  # Match the value against the REGEX of the class this is called on.
  # @param value [String] the date value to test
  # @return [MatchData, nil] truthy when this format recognizes the value
  def self.supports?(value)
    pattern = self::REGEX
    value.match(pattern)
  end
end
|
509
|
+
|
510
|
+
# A date format that cannot be parsed or recognized.
class UnparseableDate < ExtractorDateFormat
  # Never attempts to parse; the value is ignored.
  # @param value [String] the date value (unused)
  # @return [nil]
  def self.parse_date(value)
    nil
  end
end
|
516
|
+
|
517
|
+
# Extractor for MM/DD/YYYY and MM/DD/YYY-formatted dates
class MMDDYYYYFormat < ExtractorDateFormat
  REGEX = /(?<month>\d{1,2})\/(?<day>\d{1,2})\/(?<year>\d{3,4})/

  # Rearrange the matched parts into zero-padded "YYYY-MM-DD".
  # @param value [String] text containing the date
  # @return [String]
  def self.normalize_to_edtf(value)
    parts = value.match(self::REGEX)

    [
      parts[:year].rjust(4, "0"),
      parts[:month].rjust(2, "0"),
      parts[:day].rjust(2, "0")
    ].join("-")
  end
end
|
526
|
+
|
527
|
+
# Extractor for MM/DD/YY-formatted dates
class MMDDYYFormat < ExtractorDateFormat
  REGEX = /(?<month>\d{1,2})\/(?<day>\d{1,2})\/(?<year>\d{2})/

  # Rearrange the matched parts into "YYYY-MM-DD", expanding the
  # two-digit year with {munge_to_yyyy}.
  # @param value [String] text containing the date
  # @return [String]
  def self.normalize_to_edtf(value)
    parts = value.match(self::REGEX)
    full_year = munge_to_yyyy(parts[:year])
    month = parts[:month].rjust(2, "0")
    day = parts[:day].rjust(2, "0")

    "#{full_year}-#{month}-#{day}"
  end

  # For two-digit year, if it would be in the future, more likely to just
  # be the previous century. 12/1/99 -> 1999
  # @param year [String] the two-digit year
  # @return [String] the four-digit year
  def self.munge_to_yyyy(year)
    century = (year.to_i > (::Date.current.year - 2000)) ? "19" : "20"
    "#{century}#{year}"
  end
end
|
547
|
+
|
548
|
+
# Extractor for dates encoded as Roman numerals.
class RomanNumeralYearFormat < ExtractorDateFormat
  REGEX = /(?<![A-Za-z\.])(?<year>[MCDLXVI\.]+)(?![A-Za-z])/

  # Convert the matched Roman numeral into a decimal year string.
  # @param text [String] text containing the numeral
  # @return [String]
  def self.normalize_to_edtf(text)
    numeral = text.match(REGEX)[:year]
    roman_to_int(numeral.upcase).to_s
  end

  # Convert a Roman numeral (periods ignored) to an integer.
  # Symbols are consumed from the front of the string in one pass from the
  # largest to the smallest; whatever cannot be consumed that way is ignored.
  # @param value [String] uppercase Roman numeral
  # @return [Integer]
  def self.roman_to_int(value)
    remaining = value.tr(".", "")
    symbols = {"M" => 1000, "CM" => 900, "D" => 500, "CD" => 400, "C" => 100, "XC" => 90, "L" => 50, "XL" => 40, "X" => 10, "IX" => 9, "V" => 5, "IV" => 4, "I" => 1}

    symbols.reduce(0) do |total, (symbol, amount)|
      while remaining.start_with?(symbol)
        total += amount
        remaining = remaining.delete_prefix(symbol)
      end
      total
    end
  end
end
|
570
|
+
|
571
|
+
# Extractor for centuries encoded as Roman numerals; sometimes centuries
# are given as e.g. xvith, hence the funny negative look-ahead assertion
class RomanNumeralCenturyFormat < RomanNumeralYearFormat
  REGEX = /(?<![a-z])(?<century>[xvi]+)(?![a-su-z])/

  # Convert the matched lowercase numeral into a century pattern.
  # @param text [String] text containing the numeral
  # @return [String]
  def self.normalize_to_edtf(text)
    munge_to_yyyy(text.match(REGEX)[:century])
  end

  # Turn a Roman-numeral century into "NNxx", where NN is the century
  # number minus one, zero-padded to two digits (e.g. "xvi" -> "15xx").
  # @param text [String] Roman numeral of the century
  # @return [String]
  def self.munge_to_yyyy(text)
    leading_digits = roman_to_int(text.upcase) - 1
    "#{leading_digits.to_s.rjust(2, "0")}xx"
  end
end
|
586
|
+
|
587
|
+
# Extractor for a flavor of century encoding present in Stanford data
# of unknown origin.
class MysteryCenturyFormat < ExtractorDateFormat
  REGEX = /(?<century>\d{2})--/

  # Replace the trailing "--" with "xx" (e.g. "19--" -> "19xx").
  # @param text [String] text containing the century
  # @return [String]
  def self.normalize_to_edtf(text)
    leading_digits = text.match(REGEX)[:century]
    leading_digits + "xx"
  end
end
|
596
|
+
|
597
|
+
# Extractor for dates given as centuries
class CenturyFormat < ExtractorDateFormat
  REGEX = /(?<century>\d{2})th C(?:entury)?/i

  # Turn "NNth century" into "MMxx", where MM is the century number minus
  # one (e.g. "19th century" -> "18xx").
  # @param text [String] text containing the century
  # @return [String]
  def self.normalize_to_edtf(text)
    matches = text.match(REGEX)

    # Zero-pad to two digits so "10th century" yields "09xx" rather than the
    # three-character "9xx".
    "#{(matches[:century].to_i - 1).to_s.rjust(2, "0")}xx"
  end
end
|
606
|
+
|
607
|
+
# Extractor for data formatted as YYYY-YYYY or YYY-YYY
class YearRangeFormat < ExtractorDateFormat
  REGEX = /(?<start>\d{3,4})-(?<end>\d{3,4})/

  # Join both years, zero-padded to four digits, with "/".
  # @param text [String] text containing the year range
  # @return [String]
  def self.normalize_to_edtf(text)
    bounds = text.match(REGEX)
    [bounds[:start], bounds[:end]].map { |year| year.rjust(4, "0") }.join("/")
  end
end
|
616
|
+
|
617
|
+
# Extractor for data formatted as YYY-
class DecadeAsYearDashFormat < ExtractorDateFormat
  REGEX = /(?<!\d)(?<year>\d{3})[-_xu?](?!\d)/

  # Replace the placeholder after the three digits with "x" (e.g. "195-" -> "195x").
  # @param text [String] text containing the decade
  # @return [String]
  def self.normalize_to_edtf(text)
    decade_digits = text.match(REGEX)[:year]
    decade_digits + "x"
  end
end
|
626
|
+
|
627
|
+
# Extractor for data formatted as YYY0s
class DecadeStringFormat < ExtractorDateFormat
  REGEX = /(?<!\d)(?<year>\d{3})0s(?!\d)/

  # Replace the trailing "0s" with "x" (e.g. "1950s" -> "195x").
  # @param text [String] text containing the decade
  # @return [String]
  def self.normalize_to_edtf(text)
    decade_digits = text.match(REGEX)[:year]
    decade_digits + "x"
  end
end
|
636
|
+
|
637
|
+
# Extractor that tries hard to pick any BC year present in the data
class EmbeddedBCYearFormat < ExtractorDateFormat
  REGEX = /(?<year>\d{3,4})\s?B\.?C\.?/i

  # Produce a negative, zero-padded year that is one less than the matched
  # BC year (e.g. "300 B.C." -> "-0299").
  # @param text [String] text containing the BC year
  # @return [String]
  def self.normalize_to_edtf(text)
    bc_year = text.match(REGEX)[:year].to_i
    padded = (bc_year - 1).to_s.rjust(4, "0")
    "-" + padded
  end
end
|
646
|
+
|
647
|
+
# Extractor that tries hard to pick any year present in the data
class EmbeddedYearFormat < ExtractorDateFormat
  REGEX = /(?<!\d)(?<year>\d{4})(?!\d)/

  # Return the first standalone four-digit number found in the text.
  # @param text [String] text containing the year
  # @return [String]
  def self.normalize_to_edtf(text)
    found = text.match(REGEX)
    year = found[:year]
    year.rjust(4, "0")
  end
end
|
656
|
+
|
657
|
+
# Extractor that tries hard to pick any 3-digit year present in the data
class EmbeddedThreeDigitYearFormat < ExtractorDateFormat
  REGEX = /(?<!\d)(?<year>\d{3})(?!\d)(?!\d)/

  # Return the first standalone three-digit number, zero-padded to four digits.
  # @param text [String] text containing the year
  # @return [String]
  def self.normalize_to_edtf(text)
    found = text.match(REGEX)
    year = found[:year]
    year.rjust(4, "0")
  end
end
|
666
|
+
|
667
|
+
# Extractor that tries hard to pick any 1- or 2-digit year present in the data
class OneOrTwoDigitYearFormat < ExtractorDateFormat
  REGEX = /^(?<year>\d{1,2})$/

  # Zero-pad a value consisting only of one or two digits to four digits.
  # @param text [String] the year text
  # @return [String]
  def self.normalize_to_edtf(text)
    found = text.match(REGEX)
    year = found[:year]
    year.rjust(4, "0")
  end
end
|
676
|
+
|
677
|
+
# Full-text extractor that tries hard to pick any bracketed year present in the data
class EmbeddedYearWithBracketsFormat < ExtractorDateFormat
  # [YYY]Y Y[YYY] [YY]YY Y[YY]Y YY[YY] YYY[Y] YY[Y]Y Y[Y]YY [Y]YYY
  REGEX = /(?<year>[\d\[\]]{6})(?!\d)/

  # Return the matched six-character run with its square brackets removed.
  # @param text [String] text containing the bracketed year
  # @return [String]
  def self.normalize_to_edtf(text)
    bracketed = text.match(REGEX)[:year]
    bracketed.delete("[]")
  end
end
|
687
|
+
end
|
688
|
+
end
|