stanford-mods 2.6.4 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +1 -1
  3. data/lib/stanford-mods/{geo_spatial.rb → concerns/geo_spatial.rb} +3 -5
  4. data/lib/stanford-mods/concerns/name.rb +57 -0
  5. data/lib/stanford-mods/concerns/origin_info.rb +113 -0
  6. data/lib/stanford-mods/{physical_location.rb → concerns/physical_location.rb} +2 -2
  7. data/lib/stanford-mods/concerns/searchworks.rb +125 -0
  8. data/lib/stanford-mods/concerns/searchworks_subjects.rb +126 -0
  9. data/lib/stanford-mods/concerns/title.rb +87 -0
  10. data/lib/stanford-mods/coordinate.rb +24 -3
  11. data/lib/stanford-mods/date_parsing.rb +32 -289
  12. data/lib/stanford-mods/imprint.rb +170 -322
  13. data/lib/stanford-mods/record.rb +20 -0
  14. data/lib/stanford-mods/version.rb +1 -1
  15. data/lib/stanford-mods/{searchworks_languages.rb → vocabularies/searchworks_languages.rb} +0 -0
  16. data/lib/stanford-mods.rb +12 -11
  17. data/spec/fixtures/searchworks_imprint_data.rb +38 -39
  18. data/spec/fixtures/searchworks_pub_date_data.rb +7 -7
  19. data/spec/fixtures/spotlight_pub_date_data.rb +7 -7
  20. data/spec/geo_spatial_spec.rb +1 -6
  21. data/spec/imprint_spec.rb +263 -207
  22. data/spec/lib/stanford-mods/coordinate_spec.rb +3 -5
  23. data/spec/name_spec.rb +26 -230
  24. data/spec/origin_info_spec.rb +34 -300
  25. data/spec/searchworks_basic_spec.rb +1 -3
  26. data/spec/searchworks_pub_dates_spec.rb +0 -215
  27. data/spec/searchworks_spec.rb +0 -21
  28. data/spec/searchworks_subject_raw_spec.rb +106 -105
  29. data/spec/searchworks_subject_spec.rb +19 -55
  30. data/spec/searchworks_title_spec.rb +5 -5
  31. data/stanford-mods.gemspec +1 -1
  32. metadata +19 -15
  33. data/lib/marc_countries.rb +0 -387
  34. data/lib/stanford-mods/geo_utils.rb +0 -28
  35. data/lib/stanford-mods/name.rb +0 -80
  36. data/lib/stanford-mods/origin_info.rb +0 -489
  37. data/lib/stanford-mods/searchworks.rb +0 -333
  38. data/lib/stanford-mods/searchworks_subjects.rb +0 -196
  39. data/spec/date_parsing_spec.rb +0 -905
@@ -1,102 +1,50 @@
1
- require 'active_support/core_ext/integer/inflections'
2
-
3
1
  module Stanford
4
2
  module Mods
5
- # Parsing date strings
6
- # TODO: this should become its own gem and/or become eclipsed by/merged with timetwister gem
7
- # When this is "gemified":
8
- # - we may want an integer or date sort field as well as lexical
9
- # - we could add methods like my_date.bc?
10
3
  class DateParsing
11
- # get display value for year, generally an explicit year or "17th century" or "5 B.C." or "1950s" or '845 A.D.'
12
- # @return [String, nil] display value for year if we could parse one, nil otherwise
13
- def self.date_str_for_display(date_str)
14
- DateParsing.new(date_str).date_str_for_display
15
- end
16
-
17
- # get year as Integer if we can parse date_str to get a year.
18
- # @return [Integer, nil] Integer year if we could parse one, nil otherwise
19
- def self.year_int_from_date_str(date_str)
20
- DateParsing.new(date_str).year_int_from_date_str
21
- end
22
-
23
- # get String sortable value year if we can parse date_str to get a year.
24
- # SearchWorks currently uses a string field for pub date sorting; thus so does Spotlight.
25
- # The values returned must *lexically* sort in chronological order, so the B.C. dates are tricky
26
- # @return [String, nil] String sortable year if we could parse one, nil otherwise
27
- # note that these values must *lexically* sort to create a chronological sort.
28
- def self.sortable_year_string_from_date_str(date_str)
29
- DateParsing.new(date_str).sortable_year_string_from_date_str
30
- end
31
-
32
- # true if the year is between -999 and (current year + 1)
33
- # @param [String] year_str String containing a date in format: -yyy, -yy, -y, y, yy, yyy, yyyy
34
- # @return [Boolean] true if the year is between -999 and (current year + 1); false otherwise
35
- def self.year_str_valid?(year_str)
36
- return false unless year_str && (year_str.match(/^\d{1,4}$/) || year_str.match(/^-\d{1,3}$/))
37
-
38
- (-1000 < year_str.to_i) && (year_str.to_i < Date.today.year + 2)
39
- end
40
-
41
4
  # true if the year is between -9999 and (current year + 1)
42
5
  # @return [Boolean] true if the year is between -9999 and (current year + 1); false otherwise
43
6
  def self.year_int_valid?(year)
44
7
  return false unless year.is_a? Integer
45
8
 
46
- (-1000 < year.to_i) && (year < Date.today.year + 2)
9
+ (year < Date.today.year + 2)
47
10
  end
48
11
 
49
- attr_reader :orig_date_str
12
+ attr_reader :xml
50
13
 
51
- def initialize(date_str)
52
- @orig_date_str = date_str
53
- @orig_date_str.freeze
14
+ def initialize(xml)
15
+ @xml = xml
54
16
  end
55
17
 
56
- BRACKETS_BETWEEN_DIGITS_REXEXP = Regexp.new('\d[' + Regexp.escape('[]') + ']\d')
57
-
58
18
  # get display value for year, generally an explicit year or "17th century" or "5 B.C." or "1950s" or '845 A.D.'
59
19
  # @return [String, nil] String value for year if we could parse one, nil otherwise
60
20
  def date_str_for_display
61
- return if orig_date_str == '0000-00-00' # shpc collection has these useless dates
62
- # B.C. first in case there are 4 digits, e.g. 1600 B.C.
63
- return display_str_for_bc if orig_date_str.match(BC_REGEX)
64
- # decade next in case there are 4 digits, e.g. 1950s
65
- return display_str_for_decade if orig_date_str.match(DECADE_4CHAR_REGEXP) || orig_date_str.match(DECADE_S_REGEXP)
66
-
67
- result = sortable_year_for_yyyy_or_yy
68
- unless result
69
- # try removing brackets between digits in case we have 169[5] or [18]91
70
- no_brackets = remove_brackets
71
- return DateParsing.new(no_brackets).date_str_for_display if no_brackets
72
- end
73
- # parsing below this line gives string inapprop for year_str_valid?
74
- unless self.class.year_str_valid?(result)
75
- result = display_str_for_century
76
- result ||= display_str_for_early_numeric
21
+ date = xml&.as_object&.date
22
+ date = date.min || date.max if date.is_a?(EDTF::Epoch) || date.is_a?(EDTF::Interval)
23
+
24
+ return case xml.as_object.precision
25
+ when :century
26
+ return "#{(date.to_s[0..1].to_i + 1).ordinalize} century"
27
+ when :decade
28
+ return "#{date.year}s"
29
+ when :unknown
30
+ xml.text
31
+ else
32
+ if !self.class.year_int_valid? date.year
33
+ xml.text
34
+ elsif date.year < 1
35
+ "#{date.year.abs + 1} B.C."
36
+ elsif date.year < 1000
37
+ "#{date.year} A.D."
38
+ else
39
+ date.year.to_s
40
+ end
77
41
  end
78
- # remove leading 0s from early dates
79
- result = "#{result.to_i} A.D." if result && result.match(/^0\d+$/)
80
- result
81
42
  end
82
43
 
83
44
  # get Integer year if we can parse date_str to get a year.
84
45
  # @return [Integer, nil] Integer year if we could parse one, nil otherwise
85
46
  def year_int_from_date_str
86
- return if orig_date_str == '0000-00-00' # shpc collection has these useless dates
87
- # B.C. first in case there are 4 digits, e.g. 1600 B.C.
88
- return sortable_year_int_for_bc if orig_date_str.match(BC_REGEX)
89
-
90
- result = sortable_year_for_yyyy_or_yy
91
- result ||= sortable_year_for_decade # 19xx or 20xx
92
- result ||= sortable_year_for_century
93
- result ||= sortable_year_int_for_early_numeric
94
- unless result
95
- # try removing brackets between digits in case we have 169[5] or [18]91
96
- no_brackets = remove_brackets
97
- return DateParsing.new(no_brackets).year_int_from_date_str if no_brackets
98
- end
99
- result.to_i if result && self.class.year_int_valid?(result.to_i)
47
+ xml&.as_object&.as_range&.first&.year
100
48
  end
101
49
 
102
50
  # get String sortable value year if we can parse date_str to get a year.
@@ -105,222 +53,17 @@ module Stanford
105
53
  # @return [String, nil] String sortable year if we could parse one, nil otherwise
106
54
  # note that these values must *lexically* sort to create a chronological sort.
107
55
  def sortable_year_string_from_date_str
108
- return if orig_date_str == '0000-00-00' # shpc collection has these useless dates
109
- # B.C. first in case there are 4 digits, e.g. 1600 B.C.
110
- return sortable_year_str_for_bc if orig_date_str.match(BC_REGEX)
111
-
112
- result = sortable_year_for_yyyy_or_yy
113
- result ||= sortable_year_for_decade # 19xx or 20xx
114
- result ||= sortable_year_for_century
115
- result ||= sortable_year_str_for_early_numeric
116
- unless result
117
- # try removing brackets between digits in case we have 169[5] or [18]91
118
- no_brackets = remove_brackets
119
- return DateParsing.new(no_brackets).sortable_year_string_from_date_str if no_brackets
120
- end
121
- result if self.class.year_str_valid?(result)
122
- end
123
-
124
- # get String sortable value year if we can parse date_str to get a year.
125
- # @return [String, nil] String sortable year if we could parse one, nil otherwise
126
- # note that these values must *lexically* sort to create a chronological sort.
127
- def sortable_year_for_yyyy_or_yy
128
- # most date strings have a four digit year
129
- result = sortable_year_for_yyyy
130
- result ||= sortable_year_for_yy # 19xx or 20xx
131
- result
132
- end
133
-
134
- # removes brackets between digits such as 169[5] or [18]91
135
- def remove_brackets
136
- orig_date_str.delete('[]') if orig_date_str.match(BRACKETS_BETWEEN_DIGITS_REXEXP)
137
- end
138
-
139
- # looks for 4 consecutive digits in orig_date_str and returns first occurrence if found
140
- # @return [String, nil] 4 digit year (e.g. 1865, 0950) if orig_date_str has yyyy, nil otherwise
141
- def sortable_year_for_yyyy
142
- matches = orig_date_str.match(/\d{4}/) if orig_date_str
143
- matches.to_s if matches
144
- end
56
+ return unless xml&.as_object&.date
145
57
 
146
- # returns 4 digit year as String if we have a x/x/yy or x-x-yy pattern
147
- # note that these are the only 2 digit year patterns found in our actual date strings in MODS records
148
- # we use 20 as century digits unless it is greater than current year:
149
- # 1/1/15 -> 2015
150
- # 1/1/25 -> 1925
151
- # @return [String, nil] 4 digit year (e.g. 1865, 0950) if orig_date_str matches pattern, nil otherwise
152
- def sortable_year_for_yy
153
- return unless orig_date_str
58
+ date = xml.as_object.date
154
59
 
155
- slash_matches = orig_date_str.match(/\d{1,2}\/\d{1,2}\/\d{2}/)
156
- if slash_matches
157
- date_obj = Date.strptime(orig_date_str, '%m/%d/%y')
60
+ if date.is_a?(EDTF::Interval) && date.from.year < 1
61
+ (-1 * date.from.year - 1000).to_s
62
+ elsif date.is_a?(Date) && date.year < 1
63
+ (-1 * date.year - 1000).to_s
158
64
  else
159
- hyphen_matches = orig_date_str.match(/\d{1,2}-\d{1,2}-\d{2}/)
160
- date_obj = Date.strptime(orig_date_str, '%m-%d-%y') if hyphen_matches
65
+ date.to_s[0..3]&.gsub('X', '-')
161
66
  end
162
- if date_obj && date_obj > Date.today
163
- date_obj = Date.new(date_obj.year - 100, date_obj.month, date_obj.mday)
164
- end
165
- date_obj.year.to_s if date_obj
166
- rescue ArgumentError
167
- nil # explicitly want nil if date won't parse
168
- end
169
-
170
- DECADE_4CHAR_REGEXP = Regexp.new('(^|\D)\d{3}[u\-?x]')
171
-
172
- # get first year of decade (as String) if we have: yyyu, yyy-, yyy? or yyyx pattern
173
- # note that these are the only decade patterns found in our actual date strings in MODS records
174
- # @return [String, nil] 4 digit year (e.g. 1860, 1950) if orig_date_str matches pattern, nil otherwise
175
- def sortable_year_for_decade
176
- decade_matches = orig_date_str.match(DECADE_4CHAR_REGEXP) if orig_date_str
177
- changed_to_zero = decade_matches.to_s.tr('u\-?x', '0') if decade_matches
178
- DateParsing.new(changed_to_zero).sortable_year_for_yyyy if changed_to_zero
179
- end
180
-
181
- DECADE_S_REGEXP = Regexp.new('\d{3}0\'?s')
182
-
183
- # get, e.g. 1950s, if we have: yyyu, yyy-, yyy? or yyyx pattern or yyy0s or yyy0's
184
- # note that these are the only decade patterns found in our actual date strings in MODS records
185
- # @return [String, nil] 4 digit year with s (e.g. 1860s, 1950s) if orig_date_str matches pattern, nil otherwise
186
- def display_str_for_decade
187
- decade_matches = orig_date_str.match(DECADE_4CHAR_REGEXP) if orig_date_str
188
- if decade_matches
189
- changed_to_zero = decade_matches.to_s.tr('u\-?x', '0') if decade_matches
190
- zeroth_year = DateParsing.new(changed_to_zero).sortable_year_for_yyyy if changed_to_zero
191
- return "#{zeroth_year}s" if zeroth_year
192
- else
193
- decade_matches = orig_date_str.match(DECADE_S_REGEXP) if orig_date_str
194
- return decade_matches.to_s.tr("'", '') if decade_matches
195
- end
196
- end
197
-
198
- CENTURY_WORD_REGEXP = Regexp.new('(\d{1,2}).*century')
199
- CENTURY_4CHAR_REGEXP = Regexp.new('(\d{1,2})[u\-]{2}([^u\-]|$)')
200
-
201
- # get first year of century (as String) if we have: yyuu, yy--, yy--? or xxth century pattern
202
- # note that these are the only century patterns found in our actual date strings in MODS records
203
- # @return [String, nil] yy00 if orig_date_str matches pattern, nil otherwise; also nil if B.C. in pattern
204
- def sortable_year_for_century
205
- return unless orig_date_str
206
- return if orig_date_str =~ /B\.C\./
207
-
208
- century_matches = orig_date_str.match(CENTURY_4CHAR_REGEXP)
209
- if century_matches
210
- return $1 + '00' if $1.length == 2
211
- return '0' + $1 + '00' if $1.length == 1
212
- end
213
- century_str_matches = orig_date_str.match(CENTURY_WORD_REGEXP)
214
- if century_str_matches
215
- yy = ($1.to_i - 1).to_s
216
- return yy + '00' if yy.length == 2
217
- return '0' + yy + '00' if yy.length == 1
218
- end
219
- end
220
-
221
- # get display value for century (17th century) if we have: yyuu, yy--, yy--? or xxth century pattern
222
- # note that these are the only century patterns found in our actual date strings in MODS records
223
- # @return [String, nil] yy(th) Century if orig_date_str matches pattern, nil otherwise; also nil if B.C. in pattern
224
- def display_str_for_century
225
- return unless orig_date_str
226
- return if orig_date_str =~ /B\.C\./
227
-
228
- century_str_matches = orig_date_str.match(CENTURY_WORD_REGEXP)
229
- return century_str_matches.to_s if century_str_matches
230
-
231
- century_matches = orig_date_str.match(CENTURY_4CHAR_REGEXP)
232
- if century_matches
233
- return "#{($1.to_i + 1).ordinalize} century"
234
- end
235
- end
236
-
237
- BC_REGEX = Regexp.new('(\d{1,4}).*' + Regexp.escape('B.C.'))
238
-
239
- # get String sortable value for B.C. if we have B.C. pattern
240
- # note that these values must *lexically* sort to create a chronological sort.
241
- # We know our data does not contain B.C. dates older than 999, so we can make them
242
- # lexically sort by subtracting 1000. So we get:
243
- # -700 for 300 B.C., -750 for 250 B.C., -800 for 200 B.C., -801 for 199 B.C.
244
- # @return [String, nil] String sortable -ddd if B.C. in pattern; nil otherwise
245
- def sortable_year_str_for_bc
246
- bc_matches = orig_date_str.match(BC_REGEX) if orig_date_str
247
- ($1.to_i - 1000).to_s if bc_matches
248
- end
249
-
250
- # get Integer sortable value for B.C. if we have B.C. pattern
251
- # @return [Integer, nil] Integer sortable -ddd if B.C. in pattern; nil otherwise
252
- def sortable_year_int_for_bc
253
- bc_matches = orig_date_str.match(BC_REGEX) if orig_date_str
254
- "-#{$1}".to_i if bc_matches
255
- end
256
-
257
- # get display value for B.C. if we have B.C. pattern
258
- # @return [String, nil] ddd B.C. if ddd B.C. in pattern; nil otherwise
259
- def display_str_for_bc
260
- bc_matches = orig_date_str.match(BC_REGEX) if orig_date_str
261
- bc_matches.to_s if bc_matches
262
- end
263
-
264
- EARLY_NUMERIC = Regexp.new('^\-?\d{1,3}$')
265
-
266
- # get String sortable value from date String containing yyy, yy, y, -y, -yy, -yyy
267
- # note that these values must *lexically* sort to create a chronological sort.
268
- # We know our data does not contain negative dates older than -999, so we can make them
269
- # lexically sort by subtracting 1000. So we get:
270
- # -983 for -17, -999 for -1, 0000 for 0, 0001 for 1, 0017 for 17
271
- # @return [String, nil] String sortable -ddd if orig_date_str matches pattern; nil otherwise
272
- def sortable_year_str_for_early_numeric
273
- return unless orig_date_str.match(EARLY_NUMERIC)
274
-
275
- if orig_date_str =~ /^\-/
276
- # negative number becomes x - 1000 for sorting; -005 for -995
277
- num = orig_date_str[1..-1].to_i - 1000
278
- return '-' + num.to_s[1..-1].rjust(3, '0')
279
- else
280
- return orig_date_str.rjust(4, '0')
281
- end
282
- end
283
-
284
- # get Integer sortable value from date String containing yyy, yy, y, -y, -yy, -yyy, -yyyy
285
- # @return [Integer, nil] Integer sortable -ddd if orig_date_str matches pattern; nil otherwise
286
- def sortable_year_int_for_early_numeric
287
- return orig_date_str.to_i if orig_date_str.match(EARLY_NUMERIC)
288
-
289
- orig_date_str.to_i if orig_date_str =~ /^-\d{4}$/
290
- end
291
-
292
- # get display value for date String containing yyy, yy, y, -y, -yy, -yyy
293
- # negative number strings will be changed to B.C. strings
294
- # note that there is no year 0: from https://en.wikipedia.org/wiki/Anno_Domini
295
- # "AD counting years from the start of this epoch, and BC denoting years before the start of the era.
296
- # There is no year zero in this scheme, so the year AD 1 immediately follows the year 1 BC."
297
- # See also https://consul.stanford.edu/display/chimera/MODS+display+rules for etdf
298
- def display_str_for_early_numeric
299
- return unless orig_date_str.match(EARLY_NUMERIC)
300
- # return 1 B.C. when the date is 0 since there is no 0 year
301
- return '1 B.C.' if orig_date_str == '0'
302
- # negative number becomes B.C.
303
- return "#{orig_date_str[1..-1].to_i + 1} B.C." if orig_date_str =~ /^\-/
304
-
305
- # remove leading 0s from early dates
306
- "#{orig_date_str.to_i} A.D."
307
- end
308
-
309
- # NOTE: while Date.parse() works for many dates, the *sortable_year_for_yyyy
310
- # actually works for nearly all those cases and a lot more besides. Trial and error
311
- # with an extensive set of test data culled from actual date strings in our MODS records
312
- # has made this method bogus.
313
- # @return [String, nil] sortable 4 digit year (e.g. 1865, 0950) if orig_date_str is parseable via ruby Date, nil otherwise
314
- def year_via_ruby_parsing
315
- return unless orig_date_str =~ /\d\d/ # need at least 2 digits
316
- # need more in string than only 2 digits
317
- return if orig_date_str.match(/^\d\d$/) || orig_date_str.match(/^\D*\d\d\D*$/)
318
- return if orig_date_str =~ /\d\s*B.C./ # skip B.C. dates
319
-
320
- date_obj = Date.parse(orig_date_str)
321
- date_obj.year.to_s
322
- rescue ArgumentError
323
- nil # explicitly want nil if date won't parse
324
67
  end
325
68
  end
326
69
  end