stanford-mods 2.6.4 → 3.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +1 -1
  3. data/lib/stanford-mods/{geo_spatial.rb → concerns/geo_spatial.rb} +3 -5
  4. data/lib/stanford-mods/concerns/name.rb +57 -0
  5. data/lib/stanford-mods/concerns/origin_info.rb +113 -0
  6. data/lib/stanford-mods/{physical_location.rb → concerns/physical_location.rb} +2 -2
  7. data/lib/stanford-mods/concerns/searchworks.rb +125 -0
  8. data/lib/stanford-mods/concerns/searchworks_subjects.rb +126 -0
  9. data/lib/stanford-mods/concerns/title.rb +87 -0
  10. data/lib/stanford-mods/coordinate.rb +24 -3
  11. data/lib/stanford-mods/date_parsing.rb +32 -289
  12. data/lib/stanford-mods/imprint.rb +170 -322
  13. data/lib/stanford-mods/record.rb +20 -0
  14. data/lib/stanford-mods/version.rb +1 -1
  15. data/lib/stanford-mods/{searchworks_languages.rb → vocabularies/searchworks_languages.rb} +0 -0
  16. data/lib/stanford-mods.rb +12 -11
  17. data/spec/fixtures/searchworks_imprint_data.rb +38 -39
  18. data/spec/fixtures/searchworks_pub_date_data.rb +7 -7
  19. data/spec/fixtures/spotlight_pub_date_data.rb +7 -7
  20. data/spec/geo_spatial_spec.rb +1 -6
  21. data/spec/imprint_spec.rb +263 -207
  22. data/spec/lib/stanford-mods/coordinate_spec.rb +3 -5
  23. data/spec/name_spec.rb +26 -230
  24. data/spec/origin_info_spec.rb +34 -300
  25. data/spec/searchworks_basic_spec.rb +1 -3
  26. data/spec/searchworks_pub_dates_spec.rb +0 -215
  27. data/spec/searchworks_spec.rb +0 -21
  28. data/spec/searchworks_subject_raw_spec.rb +106 -105
  29. data/spec/searchworks_subject_spec.rb +19 -55
  30. data/spec/searchworks_title_spec.rb +5 -5
  31. data/stanford-mods.gemspec +1 -1
  32. metadata +19 -15
  33. data/lib/marc_countries.rb +0 -387
  34. data/lib/stanford-mods/geo_utils.rb +0 -28
  35. data/lib/stanford-mods/name.rb +0 -80
  36. data/lib/stanford-mods/origin_info.rb +0 -489
  37. data/lib/stanford-mods/searchworks.rb +0 -333
  38. data/lib/stanford-mods/searchworks_subjects.rb +0 -196
  39. data/spec/date_parsing_spec.rb +0 -905
@@ -1,102 +1,50 @@
1
- require 'active_support/core_ext/integer/inflections'
2
-
3
1
  module Stanford
4
2
  module Mods
5
- # Parsing date strings
6
- # TODO: this should become its own gem and/or become eclipsed by/merged with timetwister gem
7
- # When this is "gemified":
8
- # - we may want an integer or date sort field as well as lexical
9
- # - we could add methods like my_date.bc?
10
3
  class DateParsing
11
- # get display value for year, generally an explicit year or "17th century" or "5 B.C." or "1950s" or '845 A.D.'
12
- # @return [String, nil] display value for year if we could parse one, nil otherwise
13
- def self.date_str_for_display(date_str)
14
- DateParsing.new(date_str).date_str_for_display
15
- end
16
-
17
- # get year as Integer if we can parse date_str to get a year.
18
- # @return [Integer, nil] Integer year if we could parse one, nil otherwise
19
- def self.year_int_from_date_str(date_str)
20
- DateParsing.new(date_str).year_int_from_date_str
21
- end
22
-
23
- # get String sortable value year if we can parse date_str to get a year.
24
- # SearchWorks currently uses a string field for pub date sorting; thus so does Spotlight.
25
- # The values returned must *lexically* sort in chronological order, so the B.C. dates are tricky
26
- # @return [String, nil] String sortable year if we could parse one, nil otherwise
27
- # note that these values must *lexically* sort to create a chronological sort.
28
- def self.sortable_year_string_from_date_str(date_str)
29
- DateParsing.new(date_str).sortable_year_string_from_date_str
30
- end
31
-
32
- # true if the year is between -999 and (current year + 1)
33
- # @param [String] year_str String containing a date in format: -yyy, -yy, -y, y, yy, yyy, yyyy
34
- # @return [Boolean] true if the year is between -999 and (current year + 1); false otherwise
35
- def self.year_str_valid?(year_str)
36
- return false unless year_str && (year_str.match(/^\d{1,4}$/) || year_str.match(/^-\d{1,3}$/))
37
-
38
- (-1000 < year_str.to_i) && (year_str.to_i < Date.today.year + 2)
39
- end
40
-
41
4
  # true if the year is between -9999 and (current year + 1)
42
5
  # @return [Boolean] true if the year is between -9999 and (current year + 1); false otherwise
43
6
  def self.year_int_valid?(year)
44
7
  return false unless year.is_a? Integer
45
8
 
46
- (-1000 < year.to_i) && (year < Date.today.year + 2)
9
+ (year < Date.today.year + 2)
47
10
  end
48
11
 
49
- attr_reader :orig_date_str
12
+ attr_reader :xml
50
13
 
51
- def initialize(date_str)
52
- @orig_date_str = date_str
53
- @orig_date_str.freeze
14
+ def initialize(xml)
15
+ @xml = xml
54
16
  end
55
17
 
56
- BRACKETS_BETWEEN_DIGITS_REXEXP = Regexp.new('\d[' + Regexp.escape('[]') + ']\d')
57
-
58
18
  # get display value for year, generally an explicit year or "17th century" or "5 B.C." or "1950s" or '845 A.D.'
59
19
  # @return [String, nil] String value for year if we could parse one, nil otherwise
60
20
  def date_str_for_display
61
- return if orig_date_str == '0000-00-00' # shpc collection has these useless dates
62
- # B.C. first in case there are 4 digits, e.g. 1600 B.C.
63
- return display_str_for_bc if orig_date_str.match(BC_REGEX)
64
- # decade next in case there are 4 digits, e.g. 1950s
65
- return display_str_for_decade if orig_date_str.match(DECADE_4CHAR_REGEXP) || orig_date_str.match(DECADE_S_REGEXP)
66
-
67
- result = sortable_year_for_yyyy_or_yy
68
- unless result
69
- # try removing brackets between digits in case we have 169[5] or [18]91
70
- no_brackets = remove_brackets
71
- return DateParsing.new(no_brackets).date_str_for_display if no_brackets
72
- end
73
- # parsing below this line gives string inapprop for year_str_valid?
74
- unless self.class.year_str_valid?(result)
75
- result = display_str_for_century
76
- result ||= display_str_for_early_numeric
21
+ date = xml&.as_object&.date
22
+ date = date.min || date.max if date.is_a?(EDTF::Epoch) || date.is_a?(EDTF::Interval)
23
+
24
+ return case xml.as_object.precision
25
+ when :century
26
+ return "#{(date.to_s[0..1].to_i + 1).ordinalize} century"
27
+ when :decade
28
+ return "#{date.year}s"
29
+ when :unknown
30
+ xml.text
31
+ else
32
+ if !self.class.year_int_valid? date.year
33
+ xml.text
34
+ elsif date.year < 1
35
+ "#{date.year.abs + 1} B.C."
36
+ elsif date.year < 1000
37
+ "#{date.year} A.D."
38
+ else
39
+ date.year.to_s
40
+ end
77
41
  end
78
- # remove leading 0s from early dates
79
- result = "#{result.to_i} A.D." if result && result.match(/^0\d+$/)
80
- result
81
42
  end
82
43
 
83
44
  # get Integer year if we can parse date_str to get a year.
84
45
  # @return [Integer, nil] Integer year if we could parse one, nil otherwise
85
46
  def year_int_from_date_str
86
- return if orig_date_str == '0000-00-00' # shpc collection has these useless dates
87
- # B.C. first in case there are 4 digits, e.g. 1600 B.C.
88
- return sortable_year_int_for_bc if orig_date_str.match(BC_REGEX)
89
-
90
- result = sortable_year_for_yyyy_or_yy
91
- result ||= sortable_year_for_decade # 19xx or 20xx
92
- result ||= sortable_year_for_century
93
- result ||= sortable_year_int_for_early_numeric
94
- unless result
95
- # try removing brackets between digits in case we have 169[5] or [18]91
96
- no_brackets = remove_brackets
97
- return DateParsing.new(no_brackets).year_int_from_date_str if no_brackets
98
- end
99
- result.to_i if result && self.class.year_int_valid?(result.to_i)
47
+ xml&.as_object&.as_range&.first&.year
100
48
  end
101
49
 
102
50
  # get String sortable value year if we can parse date_str to get a year.
@@ -105,222 +53,17 @@ module Stanford
105
53
  # @return [String, nil] String sortable year if we could parse one, nil otherwise
106
54
  # note that these values must *lexically* sort to create a chronological sort.
107
55
  def sortable_year_string_from_date_str
108
- return if orig_date_str == '0000-00-00' # shpc collection has these useless dates
109
- # B.C. first in case there are 4 digits, e.g. 1600 B.C.
110
- return sortable_year_str_for_bc if orig_date_str.match(BC_REGEX)
111
-
112
- result = sortable_year_for_yyyy_or_yy
113
- result ||= sortable_year_for_decade # 19xx or 20xx
114
- result ||= sortable_year_for_century
115
- result ||= sortable_year_str_for_early_numeric
116
- unless result
117
- # try removing brackets between digits in case we have 169[5] or [18]91
118
- no_brackets = remove_brackets
119
- return DateParsing.new(no_brackets).sortable_year_string_from_date_str if no_brackets
120
- end
121
- result if self.class.year_str_valid?(result)
122
- end
123
-
124
- # get String sortable value year if we can parse date_str to get a year.
125
- # @return [String, nil] String sortable year if we could parse one, nil otherwise
126
- # note that these values must *lexically* sort to create a chronological sort.
127
- def sortable_year_for_yyyy_or_yy
128
- # most date strings have a four digit year
129
- result = sortable_year_for_yyyy
130
- result ||= sortable_year_for_yy # 19xx or 20xx
131
- result
132
- end
133
-
134
- # removes brackets between digits such as 169[5] or [18]91
135
- def remove_brackets
136
- orig_date_str.delete('[]') if orig_date_str.match(BRACKETS_BETWEEN_DIGITS_REXEXP)
137
- end
138
-
139
- # looks for 4 consecutive digits in orig_date_str and returns first occurrence if found
140
- # @return [String, nil] 4 digit year (e.g. 1865, 0950) if orig_date_str has yyyy, nil otherwise
141
- def sortable_year_for_yyyy
142
- matches = orig_date_str.match(/\d{4}/) if orig_date_str
143
- matches.to_s if matches
144
- end
56
+ return unless xml&.as_object&.date
145
57
 
146
- # returns 4 digit year as String if we have a x/x/yy or x-x-yy pattern
147
- # note that these are the only 2 digit year patterns found in our actual date strings in MODS records
148
- # we use 20 as century digits unless it is greater than current year:
149
- # 1/1/15 -> 2015
150
- # 1/1/25 -> 1925
151
- # @return [String, nil] 4 digit year (e.g. 1865, 0950) if orig_date_str matches pattern, nil otherwise
152
- def sortable_year_for_yy
153
- return unless orig_date_str
58
+ date = xml.as_object.date
154
59
 
155
- slash_matches = orig_date_str.match(/\d{1,2}\/\d{1,2}\/\d{2}/)
156
- if slash_matches
157
- date_obj = Date.strptime(orig_date_str, '%m/%d/%y')
60
+ if date.is_a?(EDTF::Interval) && date.from.year < 1
61
+ (-1 * date.from.year - 1000).to_s
62
+ elsif date.is_a?(Date) && date.year < 1
63
+ (-1 * date.year - 1000).to_s
158
64
  else
159
- hyphen_matches = orig_date_str.match(/\d{1,2}-\d{1,2}-\d{2}/)
160
- date_obj = Date.strptime(orig_date_str, '%m-%d-%y') if hyphen_matches
65
+ date.to_s[0..3]&.gsub('X', '-')
161
66
  end
162
- if date_obj && date_obj > Date.today
163
- date_obj = Date.new(date_obj.year - 100, date_obj.month, date_obj.mday)
164
- end
165
- date_obj.year.to_s if date_obj
166
- rescue ArgumentError
167
- nil # explicitly want nil if date won't parse
168
- end
169
-
170
- DECADE_4CHAR_REGEXP = Regexp.new('(^|\D)\d{3}[u\-?x]')
171
-
172
- # get first year of decade (as String) if we have: yyyu, yyy-, yyy? or yyyx pattern
173
- # note that these are the only decade patterns found in our actual date strings in MODS records
174
- # @return [String, nil] 4 digit year (e.g. 1860, 1950) if orig_date_str matches pattern, nil otherwise
175
- def sortable_year_for_decade
176
- decade_matches = orig_date_str.match(DECADE_4CHAR_REGEXP) if orig_date_str
177
- changed_to_zero = decade_matches.to_s.tr('u\-?x', '0') if decade_matches
178
- DateParsing.new(changed_to_zero).sortable_year_for_yyyy if changed_to_zero
179
- end
180
-
181
- DECADE_S_REGEXP = Regexp.new('\d{3}0\'?s')
182
-
183
- # get, e.g. 1950s, if we have: yyyu, yyy-, yyy? or yyyx pattern or yyy0s or yyy0's
184
- # note that these are the only decade patterns found in our actual date strings in MODS records
185
- # @return [String, nil] 4 digit year with s (e.g. 1860s, 1950s) if orig_date_str matches pattern, nil otherwise
186
- def display_str_for_decade
187
- decade_matches = orig_date_str.match(DECADE_4CHAR_REGEXP) if orig_date_str
188
- if decade_matches
189
- changed_to_zero = decade_matches.to_s.tr('u\-?x', '0') if decade_matches
190
- zeroth_year = DateParsing.new(changed_to_zero).sortable_year_for_yyyy if changed_to_zero
191
- return "#{zeroth_year}s" if zeroth_year
192
- else
193
- decade_matches = orig_date_str.match(DECADE_S_REGEXP) if orig_date_str
194
- return decade_matches.to_s.tr("'", '') if decade_matches
195
- end
196
- end
197
-
198
- CENTURY_WORD_REGEXP = Regexp.new('(\d{1,2}).*century')
199
- CENTURY_4CHAR_REGEXP = Regexp.new('(\d{1,2})[u\-]{2}([^u\-]|$)')
200
-
201
- # get first year of century (as String) if we have: yyuu, yy--, yy--? or xxth century pattern
202
- # note that these are the only century patterns found in our actual date strings in MODS records
203
- # @return [String, nil] yy00 if orig_date_str matches pattern, nil otherwise; also nil if B.C. in pattern
204
- def sortable_year_for_century
205
- return unless orig_date_str
206
- return if orig_date_str =~ /B\.C\./
207
-
208
- century_matches = orig_date_str.match(CENTURY_4CHAR_REGEXP)
209
- if century_matches
210
- return $1 + '00' if $1.length == 2
211
- return '0' + $1 + '00' if $1.length == 1
212
- end
213
- century_str_matches = orig_date_str.match(CENTURY_WORD_REGEXP)
214
- if century_str_matches
215
- yy = ($1.to_i - 1).to_s
216
- return yy + '00' if yy.length == 2
217
- return '0' + yy + '00' if yy.length == 1
218
- end
219
- end
220
-
221
- # get display value for century (17th century) if we have: yyuu, yy--, yy--? or xxth century pattern
222
- # note that these are the only century patterns found in our actual date strings in MODS records
223
- # @return [String, nil] yy(th) Century if orig_date_str matches pattern, nil otherwise; also nil if B.C. in pattern
224
- def display_str_for_century
225
- return unless orig_date_str
226
- return if orig_date_str =~ /B\.C\./
227
-
228
- century_str_matches = orig_date_str.match(CENTURY_WORD_REGEXP)
229
- return century_str_matches.to_s if century_str_matches
230
-
231
- century_matches = orig_date_str.match(CENTURY_4CHAR_REGEXP)
232
- if century_matches
233
- return "#{($1.to_i + 1).ordinalize} century"
234
- end
235
- end
236
-
237
- BC_REGEX = Regexp.new('(\d{1,4}).*' + Regexp.escape('B.C.'))
238
-
239
- # get String sortable value for B.C. if we have B.C. pattern
240
- # note that these values must *lexically* sort to create a chronological sort.
241
- # We know our data does not contain B.C. dates older than 999, so we can make them
242
- # lexically sort by subtracting 1000. So we get:
243
- # -700 for 300 B.C., -750 for 250 B.C., -800 for 200 B.C., -801 for 199 B.C.
244
- # @return [String, nil] String sortable -ddd if B.C. in pattern; nil otherwise
245
- def sortable_year_str_for_bc
246
- bc_matches = orig_date_str.match(BC_REGEX) if orig_date_str
247
- ($1.to_i - 1000).to_s if bc_matches
248
- end
249
-
250
- # get Integer sortable value for B.C. if we have B.C. pattern
251
- # @return [Integer, nil] Integer sortable -ddd if B.C. in pattern; nil otherwise
252
- def sortable_year_int_for_bc
253
- bc_matches = orig_date_str.match(BC_REGEX) if orig_date_str
254
- "-#{$1}".to_i if bc_matches
255
- end
256
-
257
- # get display value for B.C. if we have B.C. pattern
258
- # @return [String, nil] ddd B.C. if ddd B.C. in pattern; nil otherwise
259
- def display_str_for_bc
260
- bc_matches = orig_date_str.match(BC_REGEX) if orig_date_str
261
- bc_matches.to_s if bc_matches
262
- end
263
-
264
- EARLY_NUMERIC = Regexp.new('^\-?\d{1,3}$')
265
-
266
- # get String sortable value from date String containing yyy, yy, y, -y, -yy, -yyy
267
- # note that these values must *lexically* sort to create a chronological sort.
268
- # We know our data does not contain negative dates older than -999, so we can make them
269
- # lexically sort by subtracting 1000. So we get:
270
- # -983 for -17, -999 for -1, 0000 for 0, 0001 for 1, 0017 for 17
271
- # @return [String, nil] String sortable -ddd if orig_date_str matches pattern; nil otherwise
272
- def sortable_year_str_for_early_numeric
273
- return unless orig_date_str.match(EARLY_NUMERIC)
274
-
275
- if orig_date_str =~ /^\-/
276
- # negative number becomes x - 1000 for sorting; -005 for -995
277
- num = orig_date_str[1..-1].to_i - 1000
278
- return '-' + num.to_s[1..-1].rjust(3, '0')
279
- else
280
- return orig_date_str.rjust(4, '0')
281
- end
282
- end
283
-
284
- # get Integer sortable value from date String containing yyy, yy, y, -y, -yy, -yyy, -yyyy
285
- # @return [Integer, nil] Integer sortable -ddd if orig_date_str matches pattern; nil otherwise
286
- def sortable_year_int_for_early_numeric
287
- return orig_date_str.to_i if orig_date_str.match(EARLY_NUMERIC)
288
-
289
- orig_date_str.to_i if orig_date_str =~ /^-\d{4}$/
290
- end
291
-
292
- # get display value for date String containing yyy, yy, y, -y, -yy, -yyy
293
- # negative number strings will be changed to B.C. strings
294
- # note that there is no year 0: from https://en.wikipedia.org/wiki/Anno_Domini
295
- # "AD counting years from the start of this epoch, and BC denoting years before the start of the era.
296
- # There is no year zero in this scheme, so the year AD 1 immediately follows the year 1 BC."
297
- # See also https://consul.stanford.edu/display/chimera/MODS+display+rules for etdf
298
- def display_str_for_early_numeric
299
- return unless orig_date_str.match(EARLY_NUMERIC)
300
- # return 1 B.C. when the date is 0 since there is no 0 year
301
- return '1 B.C.' if orig_date_str == '0'
302
- # negative number becomes B.C.
303
- return "#{orig_date_str[1..-1].to_i + 1} B.C." if orig_date_str =~ /^\-/
304
-
305
- # remove leading 0s from early dates
306
- "#{orig_date_str.to_i} A.D."
307
- end
308
-
309
- # NOTE: while Date.parse() works for many dates, the *sortable_year_for_yyyy
310
- # actually works for nearly all those cases and a lot more besides. Trial and error
311
- # with an extensive set of test data culled from actual date strings in our MODS records
312
- # has made this method bogus.
313
- # @return [String, nil] sortable 4 digit year (e.g. 1865, 0950) if orig_date_str is parseable via ruby Date, nil otherwise
314
- def year_via_ruby_parsing
315
- return unless orig_date_str =~ /\d\d/ # need at least 2 digits
316
- # need more in string than only 2 digits
317
- return if orig_date_str.match(/^\d\d$/) || orig_date_str.match(/^\D*\d\d\D*$/)
318
- return if orig_date_str =~ /\d\s*B.C./ # skip B.C. dates
319
-
320
- date_obj = Date.parse(orig_date_str)
321
- date_obj.year.to_s
322
- rescue ArgumentError
323
- nil # explicitly want nil if date won't parse
324
67
  end
325
68
  end
326
69
  end