stanford-mods 2.6.2 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +24 -0
  3. data/lib/stanford-mods/{geo_spatial.rb → concerns/geo_spatial.rb} +3 -5
  4. data/lib/stanford-mods/concerns/name.rb +57 -0
  5. data/lib/stanford-mods/concerns/origin_info.rb +113 -0
  6. data/lib/stanford-mods/{physical_location.rb → concerns/physical_location.rb} +2 -2
  7. data/lib/stanford-mods/concerns/searchworks.rb +125 -0
  8. data/lib/stanford-mods/concerns/searchworks_subjects.rb +126 -0
  9. data/lib/stanford-mods/concerns/title.rb +87 -0
  10. data/lib/stanford-mods/coordinate.rb +21 -3
  11. data/lib/stanford-mods/date_parsing.rb +32 -288
  12. data/lib/stanford-mods/imprint.rb +149 -325
  13. data/lib/stanford-mods/record.rb +20 -0
  14. data/lib/stanford-mods/version.rb +1 -1
  15. data/lib/stanford-mods/{searchworks_languages.rb → vocabularies/searchworks_languages.rb} +2 -0
  16. data/lib/stanford-mods.rb +13 -11
  17. data/spec/fixtures/searchworks_imprint_data.rb +38 -39
  18. data/spec/fixtures/searchworks_pub_date_data.rb +7 -7
  19. data/spec/fixtures/spotlight_pub_date_data.rb +7 -7
  20. data/spec/geo_spatial_spec.rb +1 -6
  21. data/spec/imprint_spec.rb +238 -207
  22. data/spec/name_spec.rb +28 -232
  23. data/spec/origin_info_spec.rb +34 -300
  24. data/spec/searchworks_basic_spec.rb +1 -3
  25. data/spec/searchworks_pub_dates_spec.rb +0 -215
  26. data/spec/searchworks_spec.rb +0 -21
  27. data/spec/searchworks_subject_raw_spec.rb +106 -105
  28. data/spec/searchworks_subject_spec.rb +19 -55
  29. data/spec/searchworks_title_spec.rb +5 -5
  30. data/stanford-mods.gemspec +1 -1
  31. metadata +24 -20
  32. data/.travis.yml +0 -17
  33. data/lib/marc_countries.rb +0 -387
  34. data/lib/stanford-mods/geo_utils.rb +0 -28
  35. data/lib/stanford-mods/name.rb +0 -80
  36. data/lib/stanford-mods/origin_info.rb +0 -489
  37. data/lib/stanford-mods/searchworks.rb +0 -333
  38. data/lib/stanford-mods/searchworks_subjects.rb +0 -196
  39. data/spec/date_parsing_spec.rb +0 -905
@@ -1,100 +1,50 @@
1
1
  module Stanford
2
2
  module Mods
3
- # Parsing date strings
4
- # TODO: this should become its own gem and/or become eclipsed by/merged with timetwister gem
5
- # When this is "gemified":
6
- # - we may want an integer or date sort field as well as lexical
7
- # - we could add methods like my_date.bc?
8
3
  class DateParsing
9
- # get display value for year, generally an explicit year or "17th century" or "5 B.C." or "1950s" or '845 A.D.'
10
- # @return [String, nil] display value for year if we could parse one, nil otherwise
11
- def self.date_str_for_display(date_str)
12
- DateParsing.new(date_str).date_str_for_display
13
- end
14
-
15
- # get year as Integer if we can parse date_str to get a year.
16
- # @return [Integer, nil] Integer year if we could parse one, nil otherwise
17
- def self.year_int_from_date_str(date_str)
18
- DateParsing.new(date_str).year_int_from_date_str
19
- end
20
-
21
- # get String sortable value year if we can parse date_str to get a year.
22
- # SearchWorks currently uses a string field for pub date sorting; thus so does Spotlight.
23
- # The values returned must *lexically* sort in chronological order, so the B.C. dates are tricky
24
- # @return [String, nil] String sortable year if we could parse one, nil otherwise
25
- # note that these values must *lexically* sort to create a chronological sort.
26
- def self.sortable_year_string_from_date_str(date_str)
27
- DateParsing.new(date_str).sortable_year_string_from_date_str
28
- end
29
-
30
- # true if the year is between -999 and (current year + 1)
31
- # @param [String] year_str String containing a date in format: -yyy, -yy, -y, y, yy, yyy, yyyy
32
- # @return [Boolean] true if the year is between -999 and (current year + 1); false otherwise
33
- def self.year_str_valid?(year_str)
34
- return false unless year_str && (year_str.match(/^\d{1,4}$/) || year_str.match(/^-\d{1,3}$/))
35
-
36
- (-1000 < year_str.to_i) && (year_str.to_i < Date.today.year + 2)
37
- end
38
-
39
4
  # true if the year is between -9999 and (current year + 1)
40
5
  # @return [Boolean] true if the year is between -9999 and (current year + 1); false otherwise
41
6
  def self.year_int_valid?(year)
42
7
  return false unless year.is_a? Integer
43
8
 
44
- (-1000 < year.to_i) && (year < Date.today.year + 2)
9
+ (year < Date.today.year + 2)
45
10
  end
46
11
 
47
- attr_reader :orig_date_str
12
+ attr_reader :xml
48
13
 
49
- def initialize(date_str)
50
- @orig_date_str = date_str
51
- @orig_date_str.freeze
14
+ def initialize(xml)
15
+ @xml = xml
52
16
  end
53
17
 
54
- BRACKETS_BETWEEN_DIGITS_REXEXP = Regexp.new('\d[' + Regexp.escape('[]') + ']\d')
55
-
56
18
  # get display value for year, generally an explicit year or "17th century" or "5 B.C." or "1950s" or '845 A.D.'
57
19
  # @return [String, nil] String value for year if we could parse one, nil otherwise
58
20
  def date_str_for_display
59
- return if orig_date_str == '0000-00-00' # shpc collection has these useless dates
60
- # B.C. first in case there are 4 digits, e.g. 1600 B.C.
61
- return display_str_for_bc if orig_date_str.match(BC_REGEX)
62
- # decade next in case there are 4 digits, e.g. 1950s
63
- return display_str_for_decade if orig_date_str.match(DECADE_4CHAR_REGEXP) || orig_date_str.match(DECADE_S_REGEXP)
64
-
65
- result = sortable_year_for_yyyy_or_yy
66
- unless result
67
- # try removing brackets between digits in case we have 169[5] or [18]91
68
- no_brackets = remove_brackets
69
- return DateParsing.new(no_brackets).date_str_for_display if no_brackets
70
- end
71
- # parsing below this line gives string inapprop for year_str_valid?
72
- unless self.class.year_str_valid?(result)
73
- result = display_str_for_century
74
- result ||= display_str_for_early_numeric
21
+ date = xml&.as_object&.date
22
+ date = date.min || date.max if date.is_a?(EDTF::Epoch) || date.is_a?(EDTF::Interval)
23
+
24
+ return case xml.as_object.precision
25
+ when :century
26
+ return "#{(date.to_s[0..1].to_i + 1).ordinalize} century"
27
+ when :decade
28
+ return "#{date.year}s"
29
+ when :unknown
30
+ xml.text
31
+ else
32
+ if !self.class.year_int_valid? date.year
33
+ xml.text
34
+ elsif date.year < 1
35
+ "#{date.year.abs + 1} B.C."
36
+ elsif date.year < 1000
37
+ "#{date.year} A.D."
38
+ else
39
+ date.year.to_s
40
+ end
75
41
  end
76
- # remove leading 0s from early dates
77
- result = "#{result.to_i} A.D." if result && result.match(/^0\d+$/)
78
- result
79
42
  end
80
43
 
81
44
  # get Integer year if we can parse date_str to get a year.
82
45
  # @return [Integer, nil] Integer year if we could parse one, nil otherwise
83
46
  def year_int_from_date_str
84
- return if orig_date_str == '0000-00-00' # shpc collection has these useless dates
85
- # B.C. first in case there are 4 digits, e.g. 1600 B.C.
86
- return sortable_year_int_for_bc if orig_date_str.match(BC_REGEX)
87
-
88
- result = sortable_year_for_yyyy_or_yy
89
- result ||= sortable_year_for_decade # 19xx or 20xx
90
- result ||= sortable_year_for_century
91
- result ||= sortable_year_int_for_early_numeric
92
- unless result
93
- # try removing brackets between digits in case we have 169[5] or [18]91
94
- no_brackets = remove_brackets
95
- return DateParsing.new(no_brackets).year_int_from_date_str if no_brackets
96
- end
97
- result.to_i if result && self.class.year_int_valid?(result.to_i)
47
+ xml&.as_object&.as_range&.first&.year
98
48
  end
99
49
 
100
50
  # get String sortable value year if we can parse date_str to get a year.
@@ -103,224 +53,18 @@ module Stanford
103
53
  # @return [String, nil] String sortable year if we could parse one, nil otherwise
104
54
  # note that these values must *lexically* sort to create a chronological sort.
105
55
  def sortable_year_string_from_date_str
106
- return if orig_date_str == '0000-00-00' # shpc collection has these useless dates
107
- # B.C. first in case there are 4 digits, e.g. 1600 B.C.
108
- return sortable_year_str_for_bc if orig_date_str.match(BC_REGEX)
109
-
110
- result = sortable_year_for_yyyy_or_yy
111
- result ||= sortable_year_for_decade # 19xx or 20xx
112
- result ||= sortable_year_for_century
113
- result ||= sortable_year_str_for_early_numeric
114
- unless result
115
- # try removing brackets between digits in case we have 169[5] or [18]91
116
- no_brackets = remove_brackets
117
- return DateParsing.new(no_brackets).sortable_year_string_from_date_str if no_brackets
118
- end
119
- result if self.class.year_str_valid?(result)
120
- end
121
-
122
- # get String sortable value year if we can parse date_str to get a year.
123
- # @return [String, nil] String sortable year if we could parse one, nil otherwise
124
- # note that these values must *lexically* sort to create a chronological sort.
125
- def sortable_year_for_yyyy_or_yy
126
- # most date strings have a four digit year
127
- result = sortable_year_for_yyyy
128
- result ||= sortable_year_for_yy # 19xx or 20xx
129
- result
130
- end
131
-
132
- # removes brackets between digits such as 169[5] or [18]91
133
- def remove_brackets
134
- orig_date_str.delete('[]') if orig_date_str.match(BRACKETS_BETWEEN_DIGITS_REXEXP)
135
- end
136
-
137
- # looks for 4 consecutive digits in orig_date_str and returns first occurrence if found
138
- # @return [String, nil] 4 digit year (e.g. 1865, 0950) if orig_date_str has yyyy, nil otherwise
139
- def sortable_year_for_yyyy
140
- matches = orig_date_str.match(/\d{4}/) if orig_date_str
141
- matches.to_s if matches
142
- end
143
-
144
- # returns 4 digit year as String if we have a x/x/yy or x-x-yy pattern
145
- # note that these are the only 2 digit year patterns found in our actual date strings in MODS records
146
- # we use 20 as century digits unless it is greater than current year:
147
- # 1/1/15 -> 2015
148
- # 1/1/25 -> 1925
149
- # @return [String, nil] 4 digit year (e.g. 1865, 0950) if orig_date_str matches pattern, nil otherwise
150
- def sortable_year_for_yy
151
- return unless orig_date_str
152
-
153
- slash_matches = orig_date_str.match(/\d{1,2}\/\d{1,2}\/\d{2}/)
154
- if slash_matches
155
- date_obj = Date.strptime(orig_date_str, '%m/%d/%y')
156
- else
157
- hyphen_matches = orig_date_str.match(/\d{1,2}-\d{1,2}-\d{2}/)
158
- date_obj = Date.strptime(orig_date_str, '%m-%d-%y') if hyphen_matches
159
- end
160
- if date_obj && date_obj > Date.today
161
- date_obj = Date.new(date_obj.year - 100, date_obj.month, date_obj.mday)
162
- end
163
- date_obj.year.to_s if date_obj
164
- rescue ArgumentError
165
- nil # explicitly want nil if date won't parse
166
- end
167
-
168
- DECADE_4CHAR_REGEXP = Regexp.new('(^|\D)\d{3}[u\-?x]')
56
+ return unless xml&.as_object&.date
169
57
 
170
- # get first year of decade (as String) if we have: yyyu, yyy-, yyy? or yyyx pattern
171
- # note that these are the only decade patterns found in our actual date strings in MODS records
172
- # @return [String, nil] 4 digit year (e.g. 1860, 1950) if orig_date_str matches pattern, nil otherwise
173
- def sortable_year_for_decade
174
- decade_matches = orig_date_str.match(DECADE_4CHAR_REGEXP) if orig_date_str
175
- changed_to_zero = decade_matches.to_s.tr('u\-?x', '0') if decade_matches
176
- DateParsing.new(changed_to_zero).sortable_year_for_yyyy if changed_to_zero
177
- end
58
+ date = xml.as_object.date
178
59
 
179
- DECADE_S_REGEXP = Regexp.new('\d{3}0\'?s')
180
-
181
- # get, e.g. 1950s, if we have: yyyu, yyy-, yyy? or yyyx pattern or yyy0s or yyy0's
182
- # note that these are the only decade patterns found in our actual date strings in MODS records
183
- # @return [String, nil] 4 digit year with s (e.g. 1860s, 1950s) if orig_date_str matches pattern, nil otherwise
184
- def display_str_for_decade
185
- decade_matches = orig_date_str.match(DECADE_4CHAR_REGEXP) if orig_date_str
186
- if decade_matches
187
- changed_to_zero = decade_matches.to_s.tr('u\-?x', '0') if decade_matches
188
- zeroth_year = DateParsing.new(changed_to_zero).sortable_year_for_yyyy if changed_to_zero
189
- return "#{zeroth_year}s" if zeroth_year
60
+ if date.is_a?(EDTF::Interval) && date.from.year < 1
61
+ (-1 * date.from.year - 1000).to_s
62
+ elsif date.is_a?(Date) && date.year < 1
63
+ (-1 * date.year - 1000).to_s
190
64
  else
191
- decade_matches = orig_date_str.match(DECADE_S_REGEXP) if orig_date_str
192
- return decade_matches.to_s.tr("'", '') if decade_matches
193
- end
194
- end
195
-
196
- CENTURY_WORD_REGEXP = Regexp.new('(\d{1,2}).*century')
197
- CENTURY_4CHAR_REGEXP = Regexp.new('(\d{1,2})[u\-]{2}([^u\-]|$)')
198
-
199
- # get first year of century (as String) if we have: yyuu, yy--, yy--? or xxth century pattern
200
- # note that these are the only century patterns found in our actual date strings in MODS records
201
- # @return [String, nil] yy00 if orig_date_str matches pattern, nil otherwise; also nil if B.C. in pattern
202
- def sortable_year_for_century
203
- return unless orig_date_str
204
- return if orig_date_str =~ /B\.C\./
205
-
206
- century_matches = orig_date_str.match(CENTURY_4CHAR_REGEXP)
207
- if century_matches
208
- return $1 + '00' if $1.length == 2
209
- return '0' + $1 + '00' if $1.length == 1
210
- end
211
- century_str_matches = orig_date_str.match(CENTURY_WORD_REGEXP)
212
- if century_str_matches
213
- yy = ($1.to_i - 1).to_s
214
- return yy + '00' if yy.length == 2
215
- return '0' + yy + '00' if yy.length == 1
65
+ date.to_s[0..3]&.gsub('X', '-')
216
66
  end
217
67
  end
218
-
219
- # get display value for century (17th century) if we have: yyuu, yy--, yy--? or xxth century pattern
220
- # note that these are the only century patterns found in our actual date strings in MODS records
221
- # @return [String, nil] yy(th) Century if orig_date_str matches pattern, nil otherwise; also nil if B.C. in pattern
222
- def display_str_for_century
223
- return unless orig_date_str
224
- return if orig_date_str =~ /B\.C\./
225
-
226
- century_str_matches = orig_date_str.match(CENTURY_WORD_REGEXP)
227
- return century_str_matches.to_s if century_str_matches
228
-
229
- century_matches = orig_date_str.match(CENTURY_4CHAR_REGEXP)
230
- if century_matches
231
- require 'active_support/core_ext/integer/inflections'
232
- return "#{($1.to_i + 1).ordinalize} century"
233
- end
234
- end
235
-
236
- BC_REGEX = Regexp.new('(\d{1,4}).*' + Regexp.escape('B.C.'))
237
-
238
- # get String sortable value for B.C. if we have B.C. pattern
239
- # note that these values must *lexically* sort to create a chronological sort.
240
- # We know our data does not contain B.C. dates older than 999, so we can make them
241
- # lexically sort by subtracting 1000. So we get:
242
- # -700 for 300 B.C., -750 for 250 B.C., -800 for 200 B.C., -801 for 199 B.C.
243
- # @return [String, nil] String sortable -ddd if B.C. in pattern; nil otherwise
244
- def sortable_year_str_for_bc
245
- bc_matches = orig_date_str.match(BC_REGEX) if orig_date_str
246
- ($1.to_i - 1000).to_s if bc_matches
247
- end
248
-
249
- # get Integer sortable value for B.C. if we have B.C. pattern
250
- # @return [Integer, nil] Integer sortable -ddd if B.C. in pattern; nil otherwise
251
- def sortable_year_int_for_bc
252
- bc_matches = orig_date_str.match(BC_REGEX) if orig_date_str
253
- "-#{$1}".to_i if bc_matches
254
- end
255
-
256
- # get display value for B.C. if we have B.C. pattern
257
- # @return [String, nil] ddd B.C. if ddd B.C. in pattern; nil otherwise
258
- def display_str_for_bc
259
- bc_matches = orig_date_str.match(BC_REGEX) if orig_date_str
260
- bc_matches.to_s if bc_matches
261
- end
262
-
263
- EARLY_NUMERIC = Regexp.new('^\-?\d{1,3}$')
264
-
265
- # get String sortable value from date String containing yyy, yy, y, -y, -yy, -yyy
266
- # note that these values must *lexically* sort to create a chronological sort.
267
- # We know our data does not contain negative dates older than -999, so we can make them
268
- # lexically sort by subtracting 1000. So we get:
269
- # -983 for -17, -999 for -1, 0000 for 0, 0001 for 1, 0017 for 17
270
- # @return [String, nil] String sortable -ddd if orig_date_str matches pattern; nil otherwise
271
- def sortable_year_str_for_early_numeric
272
- return unless orig_date_str.match(EARLY_NUMERIC)
273
-
274
- if orig_date_str =~ /^\-/
275
- # negative number becomes x - 1000 for sorting; -005 for -995
276
- num = orig_date_str[1..-1].to_i - 1000
277
- return '-' + num.to_s[1..-1].rjust(3, '0')
278
- else
279
- return orig_date_str.rjust(4, '0')
280
- end
281
- end
282
-
283
- # get Integer sortable value from date String containing yyy, yy, y, -y, -yy, -yyy, -yyyy
284
- # @return [Integer, nil] Integer sortable -ddd if orig_date_str matches pattern; nil otherwise
285
- def sortable_year_int_for_early_numeric
286
- return orig_date_str.to_i if orig_date_str.match(EARLY_NUMERIC)
287
-
288
- orig_date_str.to_i if orig_date_str =~ /^-\d{4}$/
289
- end
290
-
291
- # get display value for date String containing yyy, yy, y, -y, -yy, -yyy
292
- # negative number strings will be changed to B.C. strings
293
- # note that there is no year 0: from https://en.wikipedia.org/wiki/Anno_Domini
294
- # "AD counting years from the start of this epoch, and BC denoting years before the start of the era.
295
- # There is no year zero in this scheme, so the year AD 1 immediately follows the year 1 BC."
296
- # See also https://consul.stanford.edu/display/chimera/MODS+display+rules for etdf
297
- def display_str_for_early_numeric
298
- return unless orig_date_str.match(EARLY_NUMERIC)
299
- # return 1 B.C. when the date is 0 since there is no 0 year
300
- return '1 B.C.' if orig_date_str == '0'
301
- # negative number becomes B.C.
302
- return "#{orig_date_str[1..-1].to_i + 1} B.C." if orig_date_str =~ /^\-/
303
-
304
- # remove leading 0s from early dates
305
- "#{orig_date_str.to_i} A.D."
306
- end
307
-
308
- # NOTE: while Date.parse() works for many dates, the *sortable_year_for_yyyy
309
- # actually works for nearly all those cases and a lot more besides. Trial and error
310
- # with an extensive set of test data culled from actual date strings in our MODS records
311
- # has made this method bogus.
312
- # @return [String, nil] sortable 4 digit year (e.g. 1865, 0950) if orig_date_str is parseable via ruby Date, nil otherwise
313
- def year_via_ruby_parsing
314
- return unless orig_date_str =~ /\d\d/ # need at least 2 digits
315
- # need more in string than only 2 digits
316
- return if orig_date_str.match(/^\d\d$/) || orig_date_str.match(/^\D*\d\d\D*$/)
317
- return if orig_date_str =~ /\d\s*B.C./ # skip B.C. dates
318
-
319
- date_obj = Date.parse(orig_date_str)
320
- date_obj.year.to_s
321
- rescue ArgumentError
322
- nil # explicitly want nil if date won't parse
323
- end
324
68
  end
325
69
  end
326
70
  end