stanford-mods 2.6.4 → 3.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +1 -1
- data/lib/stanford-mods/{geo_spatial.rb → concerns/geo_spatial.rb} +3 -5
- data/lib/stanford-mods/concerns/name.rb +57 -0
- data/lib/stanford-mods/concerns/origin_info.rb +113 -0
- data/lib/stanford-mods/{physical_location.rb → concerns/physical_location.rb} +2 -2
- data/lib/stanford-mods/concerns/searchworks.rb +125 -0
- data/lib/stanford-mods/concerns/searchworks_subjects.rb +126 -0
- data/lib/stanford-mods/concerns/title.rb +87 -0
- data/lib/stanford-mods/coordinate.rb +24 -3
- data/lib/stanford-mods/date_parsing.rb +32 -289
- data/lib/stanford-mods/imprint.rb +170 -322
- data/lib/stanford-mods/record.rb +20 -0
- data/lib/stanford-mods/version.rb +1 -1
- data/lib/stanford-mods/{searchworks_languages.rb → vocabularies/searchworks_languages.rb} +0 -0
- data/lib/stanford-mods.rb +12 -11
- data/spec/fixtures/searchworks_imprint_data.rb +38 -39
- data/spec/fixtures/searchworks_pub_date_data.rb +7 -7
- data/spec/fixtures/spotlight_pub_date_data.rb +7 -7
- data/spec/geo_spatial_spec.rb +1 -6
- data/spec/imprint_spec.rb +263 -207
- data/spec/lib/stanford-mods/coordinate_spec.rb +3 -5
- data/spec/name_spec.rb +26 -230
- data/spec/origin_info_spec.rb +34 -300
- data/spec/searchworks_basic_spec.rb +1 -3
- data/spec/searchworks_pub_dates_spec.rb +0 -215
- data/spec/searchworks_spec.rb +0 -21
- data/spec/searchworks_subject_raw_spec.rb +106 -105
- data/spec/searchworks_subject_spec.rb +19 -55
- data/spec/searchworks_title_spec.rb +5 -5
- data/stanford-mods.gemspec +1 -1
- metadata +19 -15
- data/lib/marc_countries.rb +0 -387
- data/lib/stanford-mods/geo_utils.rb +0 -28
- data/lib/stanford-mods/name.rb +0 -80
- data/lib/stanford-mods/origin_info.rb +0 -489
- data/lib/stanford-mods/searchworks.rb +0 -333
- data/lib/stanford-mods/searchworks_subjects.rb +0 -196
- data/spec/date_parsing_spec.rb +0 -905
@@ -1,102 +1,50 @@
|
|
1
|
-
require 'active_support/core_ext/integer/inflections'
|
2
|
-
|
3
1
|
module Stanford
|
4
2
|
module Mods
|
5
|
-
# Parsing date strings
|
6
|
-
# TODO: this should become its own gem and/or become eclipsed by/merged with timetwister gem
|
7
|
-
# When this is "gemified":
|
8
|
-
# - we may want an integer or date sort field as well as lexical
|
9
|
-
# - we could add methods like my_date.bc?
|
10
3
|
class DateParsing
|
11
|
-
# get display value for year, generally an explicit year or "17th century" or "5 B.C." or "1950s" or '845 A.D.'
|
12
|
-
# @return [String, nil] display value for year if we could parse one, nil otherwise
|
13
|
-
def self.date_str_for_display(date_str)
|
14
|
-
DateParsing.new(date_str).date_str_for_display
|
15
|
-
end
|
16
|
-
|
17
|
-
# get year as Integer if we can parse date_str to get a year.
|
18
|
-
# @return [Integer, nil] Integer year if we could parse one, nil otherwise
|
19
|
-
def self.year_int_from_date_str(date_str)
|
20
|
-
DateParsing.new(date_str).year_int_from_date_str
|
21
|
-
end
|
22
|
-
|
23
|
-
# get String sortable value year if we can parse date_str to get a year.
|
24
|
-
# SearchWorks currently uses a string field for pub date sorting; thus so does Spotlight.
|
25
|
-
# The values returned must *lexically* sort in chronological order, so the B.C. dates are tricky
|
26
|
-
# @return [String, nil] String sortable year if we could parse one, nil otherwise
|
27
|
-
# note that these values must *lexically* sort to create a chronological sort.
|
28
|
-
def self.sortable_year_string_from_date_str(date_str)
|
29
|
-
DateParsing.new(date_str).sortable_year_string_from_date_str
|
30
|
-
end
|
31
|
-
|
32
|
-
# true if the year is between -999 and (current year + 1)
|
33
|
-
# @param [String] year_str String containing a date in format: -yyy, -yy, -y, y, yy, yyy, yyyy
|
34
|
-
# @return [Boolean] true if the year is between -999 and (current year + 1); false otherwise
|
35
|
-
def self.year_str_valid?(year_str)
|
36
|
-
return false unless year_str && (year_str.match(/^\d{1,4}$/) || year_str.match(/^-\d{1,3}$/))
|
37
|
-
|
38
|
-
(-1000 < year_str.to_i) && (year_str.to_i < Date.today.year + 2)
|
39
|
-
end
|
40
|
-
|
41
4
|
# true if the year is between -9999 and (current year + 1)
|
42
5
|
# @return [Boolean] true if the year is between -9999 and (current year + 1); false otherwise
|
43
6
|
def self.year_int_valid?(year)
|
44
7
|
return false unless year.is_a? Integer
|
45
8
|
|
46
|
-
(
|
9
|
+
(year < Date.today.year + 2)
|
47
10
|
end
|
48
11
|
|
49
|
-
attr_reader :
|
12
|
+
attr_reader :xml
|
50
13
|
|
51
|
-
def initialize(
|
52
|
-
@
|
53
|
-
@orig_date_str.freeze
|
14
|
+
def initialize(xml)
|
15
|
+
@xml = xml
|
54
16
|
end
|
55
17
|
|
56
|
-
BRACKETS_BETWEEN_DIGITS_REXEXP = Regexp.new('\d[' + Regexp.escape('[]') + ']\d')
|
57
|
-
|
58
18
|
# get display value for year, generally an explicit year or "17th century" or "5 B.C." or "1950s" or '845 A.D.'
|
59
19
|
# @return [String, nil] String value for year if we could parse one, nil otherwise
|
60
20
|
def date_str_for_display
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
21
|
+
date = xml&.as_object&.date
|
22
|
+
date = date.min || date.max if date.is_a?(EDTF::Epoch) || date.is_a?(EDTF::Interval)
|
23
|
+
|
24
|
+
return case xml.as_object.precision
|
25
|
+
when :century
|
26
|
+
return "#{(date.to_s[0..1].to_i + 1).ordinalize} century"
|
27
|
+
when :decade
|
28
|
+
return "#{date.year}s"
|
29
|
+
when :unknown
|
30
|
+
xml.text
|
31
|
+
else
|
32
|
+
if !self.class.year_int_valid? date.year
|
33
|
+
xml.text
|
34
|
+
elsif date.year < 1
|
35
|
+
"#{date.year.abs + 1} B.C."
|
36
|
+
elsif date.year < 1000
|
37
|
+
"#{date.year} A.D."
|
38
|
+
else
|
39
|
+
date.year.to_s
|
40
|
+
end
|
77
41
|
end
|
78
|
-
# remove leading 0s from early dates
|
79
|
-
result = "#{result.to_i} A.D." if result && result.match(/^0\d+$/)
|
80
|
-
result
|
81
42
|
end
|
82
43
|
|
83
44
|
# get Integer year if we can parse date_str to get a year.
|
84
45
|
# @return [Integer, nil] Integer year if we could parse one, nil otherwise
|
85
46
|
def year_int_from_date_str
|
86
|
-
|
87
|
-
# B.C. first in case there are 4 digits, e.g. 1600 B.C.
|
88
|
-
return sortable_year_int_for_bc if orig_date_str.match(BC_REGEX)
|
89
|
-
|
90
|
-
result = sortable_year_for_yyyy_or_yy
|
91
|
-
result ||= sortable_year_for_decade # 19xx or 20xx
|
92
|
-
result ||= sortable_year_for_century
|
93
|
-
result ||= sortable_year_int_for_early_numeric
|
94
|
-
unless result
|
95
|
-
# try removing brackets between digits in case we have 169[5] or [18]91
|
96
|
-
no_brackets = remove_brackets
|
97
|
-
return DateParsing.new(no_brackets).year_int_from_date_str if no_brackets
|
98
|
-
end
|
99
|
-
result.to_i if result && self.class.year_int_valid?(result.to_i)
|
47
|
+
xml&.as_object&.as_range&.first&.year
|
100
48
|
end
|
101
49
|
|
102
50
|
# get String sortable value year if we can parse date_str to get a year.
|
@@ -105,222 +53,17 @@ module Stanford
|
|
105
53
|
# @return [String, nil] String sortable year if we could parse one, nil otherwise
|
106
54
|
# note that these values must *lexically* sort to create a chronological sort.
|
107
55
|
def sortable_year_string_from_date_str
|
108
|
-
return
|
109
|
-
# B.C. first in case there are 4 digits, e.g. 1600 B.C.
|
110
|
-
return sortable_year_str_for_bc if orig_date_str.match(BC_REGEX)
|
111
|
-
|
112
|
-
result = sortable_year_for_yyyy_or_yy
|
113
|
-
result ||= sortable_year_for_decade # 19xx or 20xx
|
114
|
-
result ||= sortable_year_for_century
|
115
|
-
result ||= sortable_year_str_for_early_numeric
|
116
|
-
unless result
|
117
|
-
# try removing brackets between digits in case we have 169[5] or [18]91
|
118
|
-
no_brackets = remove_brackets
|
119
|
-
return DateParsing.new(no_brackets).sortable_year_string_from_date_str if no_brackets
|
120
|
-
end
|
121
|
-
result if self.class.year_str_valid?(result)
|
122
|
-
end
|
123
|
-
|
124
|
-
# get String sortable value year if we can parse date_str to get a year.
|
125
|
-
# @return [String, nil] String sortable year if we could parse one, nil otherwise
|
126
|
-
# note that these values must *lexically* sort to create a chronological sort.
|
127
|
-
def sortable_year_for_yyyy_or_yy
|
128
|
-
# most date strings have a four digit year
|
129
|
-
result = sortable_year_for_yyyy
|
130
|
-
result ||= sortable_year_for_yy # 19xx or 20xx
|
131
|
-
result
|
132
|
-
end
|
133
|
-
|
134
|
-
# removes brackets between digits such as 169[5] or [18]91
|
135
|
-
def remove_brackets
|
136
|
-
orig_date_str.delete('[]') if orig_date_str.match(BRACKETS_BETWEEN_DIGITS_REXEXP)
|
137
|
-
end
|
138
|
-
|
139
|
-
# looks for 4 consecutive digits in orig_date_str and returns first occurrence if found
|
140
|
-
# @return [String, nil] 4 digit year (e.g. 1865, 0950) if orig_date_str has yyyy, nil otherwise
|
141
|
-
def sortable_year_for_yyyy
|
142
|
-
matches = orig_date_str.match(/\d{4}/) if orig_date_str
|
143
|
-
matches.to_s if matches
|
144
|
-
end
|
56
|
+
return unless xml&.as_object&.date
|
145
57
|
|
146
|
-
|
147
|
-
# note that these are the only 2 digit year patterns found in our actual date strings in MODS records
|
148
|
-
# we use 20 as century digits unless it is greater than current year:
|
149
|
-
# 1/1/15 -> 2015
|
150
|
-
# 1/1/25 -> 1925
|
151
|
-
# @return [String, nil] 4 digit year (e.g. 1865, 0950) if orig_date_str matches pattern, nil otherwise
|
152
|
-
def sortable_year_for_yy
|
153
|
-
return unless orig_date_str
|
58
|
+
date = xml.as_object.date
|
154
59
|
|
155
|
-
|
156
|
-
|
157
|
-
|
60
|
+
if date.is_a?(EDTF::Interval) && date.from.year < 1
|
61
|
+
(-1 * date.from.year - 1000).to_s
|
62
|
+
elsif date.is_a?(Date) && date.year < 1
|
63
|
+
(-1 * date.year - 1000).to_s
|
158
64
|
else
|
159
|
-
|
160
|
-
date_obj = Date.strptime(orig_date_str, '%m-%d-%y') if hyphen_matches
|
65
|
+
date.to_s[0..3]&.gsub('X', '-')
|
161
66
|
end
|
162
|
-
if date_obj && date_obj > Date.today
|
163
|
-
date_obj = Date.new(date_obj.year - 100, date_obj.month, date_obj.mday)
|
164
|
-
end
|
165
|
-
date_obj.year.to_s if date_obj
|
166
|
-
rescue ArgumentError
|
167
|
-
nil # explicitly want nil if date won't parse
|
168
|
-
end
|
169
|
-
|
170
|
-
DECADE_4CHAR_REGEXP = Regexp.new('(^|\D)\d{3}[u\-?x]')
|
171
|
-
|
172
|
-
# get first year of decade (as String) if we have: yyyu, yyy-, yyy? or yyyx pattern
|
173
|
-
# note that these are the only decade patterns found in our actual date strings in MODS records
|
174
|
-
# @return [String, nil] 4 digit year (e.g. 1860, 1950) if orig_date_str matches pattern, nil otherwise
|
175
|
-
def sortable_year_for_decade
|
176
|
-
decade_matches = orig_date_str.match(DECADE_4CHAR_REGEXP) if orig_date_str
|
177
|
-
changed_to_zero = decade_matches.to_s.tr('u\-?x', '0') if decade_matches
|
178
|
-
DateParsing.new(changed_to_zero).sortable_year_for_yyyy if changed_to_zero
|
179
|
-
end
|
180
|
-
|
181
|
-
DECADE_S_REGEXP = Regexp.new('\d{3}0\'?s')
|
182
|
-
|
183
|
-
# get, e.g. 1950s, if we have: yyyu, yyy-, yyy? or yyyx pattern or yyy0s or yyy0's
|
184
|
-
# note that these are the only decade patterns found in our actual date strings in MODS records
|
185
|
-
# @return [String, nil] 4 digit year with s (e.g. 1860s, 1950s) if orig_date_str matches pattern, nil otherwise
|
186
|
-
def display_str_for_decade
|
187
|
-
decade_matches = orig_date_str.match(DECADE_4CHAR_REGEXP) if orig_date_str
|
188
|
-
if decade_matches
|
189
|
-
changed_to_zero = decade_matches.to_s.tr('u\-?x', '0') if decade_matches
|
190
|
-
zeroth_year = DateParsing.new(changed_to_zero).sortable_year_for_yyyy if changed_to_zero
|
191
|
-
return "#{zeroth_year}s" if zeroth_year
|
192
|
-
else
|
193
|
-
decade_matches = orig_date_str.match(DECADE_S_REGEXP) if orig_date_str
|
194
|
-
return decade_matches.to_s.tr("'", '') if decade_matches
|
195
|
-
end
|
196
|
-
end
|
197
|
-
|
198
|
-
CENTURY_WORD_REGEXP = Regexp.new('(\d{1,2}).*century')
|
199
|
-
CENTURY_4CHAR_REGEXP = Regexp.new('(\d{1,2})[u\-]{2}([^u\-]|$)')
|
200
|
-
|
201
|
-
# get first year of century (as String) if we have: yyuu, yy--, yy--? or xxth century pattern
|
202
|
-
# note that these are the only century patterns found in our actual date strings in MODS records
|
203
|
-
# @return [String, nil] yy00 if orig_date_str matches pattern, nil otherwise; also nil if B.C. in pattern
|
204
|
-
def sortable_year_for_century
|
205
|
-
return unless orig_date_str
|
206
|
-
return if orig_date_str =~ /B\.C\./
|
207
|
-
|
208
|
-
century_matches = orig_date_str.match(CENTURY_4CHAR_REGEXP)
|
209
|
-
if century_matches
|
210
|
-
return $1 + '00' if $1.length == 2
|
211
|
-
return '0' + $1 + '00' if $1.length == 1
|
212
|
-
end
|
213
|
-
century_str_matches = orig_date_str.match(CENTURY_WORD_REGEXP)
|
214
|
-
if century_str_matches
|
215
|
-
yy = ($1.to_i - 1).to_s
|
216
|
-
return yy + '00' if yy.length == 2
|
217
|
-
return '0' + yy + '00' if yy.length == 1
|
218
|
-
end
|
219
|
-
end
|
220
|
-
|
221
|
-
# get display value for century (17th century) if we have: yyuu, yy--, yy--? or xxth century pattern
|
222
|
-
# note that these are the only century patterns found in our actual date strings in MODS records
|
223
|
-
# @return [String, nil] yy(th) Century if orig_date_str matches pattern, nil otherwise; also nil if B.C. in pattern
|
224
|
-
def display_str_for_century
|
225
|
-
return unless orig_date_str
|
226
|
-
return if orig_date_str =~ /B\.C\./
|
227
|
-
|
228
|
-
century_str_matches = orig_date_str.match(CENTURY_WORD_REGEXP)
|
229
|
-
return century_str_matches.to_s if century_str_matches
|
230
|
-
|
231
|
-
century_matches = orig_date_str.match(CENTURY_4CHAR_REGEXP)
|
232
|
-
if century_matches
|
233
|
-
return "#{($1.to_i + 1).ordinalize} century"
|
234
|
-
end
|
235
|
-
end
|
236
|
-
|
237
|
-
BC_REGEX = Regexp.new('(\d{1,4}).*' + Regexp.escape('B.C.'))
|
238
|
-
|
239
|
-
# get String sortable value for B.C. if we have B.C. pattern
|
240
|
-
# note that these values must *lexically* sort to create a chronological sort.
|
241
|
-
# We know our data does not contain B.C. dates older than 999, so we can make them
|
242
|
-
# lexically sort by subtracting 1000. So we get:
|
243
|
-
# -700 for 300 B.C., -750 for 250 B.C., -800 for 200 B.C., -801 for 199 B.C.
|
244
|
-
# @return [String, nil] String sortable -ddd if B.C. in pattern; nil otherwise
|
245
|
-
def sortable_year_str_for_bc
|
246
|
-
bc_matches = orig_date_str.match(BC_REGEX) if orig_date_str
|
247
|
-
($1.to_i - 1000).to_s if bc_matches
|
248
|
-
end
|
249
|
-
|
250
|
-
# get Integer sortable value for B.C. if we have B.C. pattern
|
251
|
-
# @return [Integer, nil] Integer sortable -ddd if B.C. in pattern; nil otherwise
|
252
|
-
def sortable_year_int_for_bc
|
253
|
-
bc_matches = orig_date_str.match(BC_REGEX) if orig_date_str
|
254
|
-
"-#{$1}".to_i if bc_matches
|
255
|
-
end
|
256
|
-
|
257
|
-
# get display value for B.C. if we have B.C. pattern
|
258
|
-
# @return [String, nil] ddd B.C. if ddd B.C. in pattern; nil otherwise
|
259
|
-
def display_str_for_bc
|
260
|
-
bc_matches = orig_date_str.match(BC_REGEX) if orig_date_str
|
261
|
-
bc_matches.to_s if bc_matches
|
262
|
-
end
|
263
|
-
|
264
|
-
EARLY_NUMERIC = Regexp.new('^\-?\d{1,3}$')
|
265
|
-
|
266
|
-
# get String sortable value from date String containing yyy, yy, y, -y, -yy, -yyy
|
267
|
-
# note that these values must *lexically* sort to create a chronological sort.
|
268
|
-
# We know our data does not contain negative dates older than -999, so we can make them
|
269
|
-
# lexically sort by subtracting 1000. So we get:
|
270
|
-
# -983 for -17, -999 for -1, 0000 for 0, 0001 for 1, 0017 for 17
|
271
|
-
# @return [String, nil] String sortable -ddd if orig_date_str matches pattern; nil otherwise
|
272
|
-
def sortable_year_str_for_early_numeric
|
273
|
-
return unless orig_date_str.match(EARLY_NUMERIC)
|
274
|
-
|
275
|
-
if orig_date_str =~ /^\-/
|
276
|
-
# negative number becomes x - 1000 for sorting; -005 for -995
|
277
|
-
num = orig_date_str[1..-1].to_i - 1000
|
278
|
-
return '-' + num.to_s[1..-1].rjust(3, '0')
|
279
|
-
else
|
280
|
-
return orig_date_str.rjust(4, '0')
|
281
|
-
end
|
282
|
-
end
|
283
|
-
|
284
|
-
# get Integer sortable value from date String containing yyy, yy, y, -y, -yy, -yyy, -yyyy
|
285
|
-
# @return [Integer, nil] Integer sortable -ddd if orig_date_str matches pattern; nil otherwise
|
286
|
-
def sortable_year_int_for_early_numeric
|
287
|
-
return orig_date_str.to_i if orig_date_str.match(EARLY_NUMERIC)
|
288
|
-
|
289
|
-
orig_date_str.to_i if orig_date_str =~ /^-\d{4}$/
|
290
|
-
end
|
291
|
-
|
292
|
-
# get display value for date String containing yyy, yy, y, -y, -yy, -yyy
|
293
|
-
# negative number strings will be changed to B.C. strings
|
294
|
-
# note that there is no year 0: from https://en.wikipedia.org/wiki/Anno_Domini
|
295
|
-
# "AD counting years from the start of this epoch, and BC denoting years before the start of the era.
|
296
|
-
# There is no year zero in this scheme, so the year AD 1 immediately follows the year 1 BC."
|
297
|
-
# See also https://consul.stanford.edu/display/chimera/MODS+display+rules for etdf
|
298
|
-
def display_str_for_early_numeric
|
299
|
-
return unless orig_date_str.match(EARLY_NUMERIC)
|
300
|
-
# return 1 B.C. when the date is 0 since there is no 0 year
|
301
|
-
return '1 B.C.' if orig_date_str == '0'
|
302
|
-
# negative number becomes B.C.
|
303
|
-
return "#{orig_date_str[1..-1].to_i + 1} B.C." if orig_date_str =~ /^\-/
|
304
|
-
|
305
|
-
# remove leading 0s from early dates
|
306
|
-
"#{orig_date_str.to_i} A.D."
|
307
|
-
end
|
308
|
-
|
309
|
-
# NOTE: while Date.parse() works for many dates, the *sortable_year_for_yyyy
|
310
|
-
# actually works for nearly all those cases and a lot more besides. Trial and error
|
311
|
-
# with an extensive set of test data culled from actual date strings in our MODS records
|
312
|
-
# has made this method bogus.
|
313
|
-
# @return [String, nil] sortable 4 digit year (e.g. 1865, 0950) if orig_date_str is parseable via ruby Date, nil otherwise
|
314
|
-
def year_via_ruby_parsing
|
315
|
-
return unless orig_date_str =~ /\d\d/ # need at least 2 digits
|
316
|
-
# need more in string than only 2 digits
|
317
|
-
return if orig_date_str.match(/^\d\d$/) || orig_date_str.match(/^\D*\d\d\D*$/)
|
318
|
-
return if orig_date_str =~ /\d\s*B.C./ # skip B.C. dates
|
319
|
-
|
320
|
-
date_obj = Date.parse(orig_date_str)
|
321
|
-
date_obj.year.to_s
|
322
|
-
rescue ArgumentError
|
323
|
-
nil # explicitly want nil if date won't parse
|
324
67
|
end
|
325
68
|
end
|
326
69
|
end
|