stanford-mods 2.6.2 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +24 -0
  3. data/lib/stanford-mods/{geo_spatial.rb → concerns/geo_spatial.rb} +3 -5
  4. data/lib/stanford-mods/concerns/name.rb +57 -0
  5. data/lib/stanford-mods/concerns/origin_info.rb +113 -0
  6. data/lib/stanford-mods/{physical_location.rb → concerns/physical_location.rb} +2 -2
  7. data/lib/stanford-mods/concerns/searchworks.rb +125 -0
  8. data/lib/stanford-mods/concerns/searchworks_subjects.rb +126 -0
  9. data/lib/stanford-mods/concerns/title.rb +87 -0
  10. data/lib/stanford-mods/coordinate.rb +21 -3
  11. data/lib/stanford-mods/date_parsing.rb +32 -288
  12. data/lib/stanford-mods/imprint.rb +149 -325
  13. data/lib/stanford-mods/record.rb +20 -0
  14. data/lib/stanford-mods/version.rb +1 -1
  15. data/lib/stanford-mods/{searchworks_languages.rb → vocabularies/searchworks_languages.rb} +2 -0
  16. data/lib/stanford-mods.rb +13 -11
  17. data/spec/fixtures/searchworks_imprint_data.rb +38 -39
  18. data/spec/fixtures/searchworks_pub_date_data.rb +7 -7
  19. data/spec/fixtures/spotlight_pub_date_data.rb +7 -7
  20. data/spec/geo_spatial_spec.rb +1 -6
  21. data/spec/imprint_spec.rb +238 -207
  22. data/spec/name_spec.rb +28 -232
  23. data/spec/origin_info_spec.rb +34 -300
  24. data/spec/searchworks_basic_spec.rb +1 -3
  25. data/spec/searchworks_pub_dates_spec.rb +0 -215
  26. data/spec/searchworks_spec.rb +0 -21
  27. data/spec/searchworks_subject_raw_spec.rb +106 -105
  28. data/spec/searchworks_subject_spec.rb +19 -55
  29. data/spec/searchworks_title_spec.rb +5 -5
  30. data/stanford-mods.gemspec +1 -1
  31. metadata +24 -20
  32. data/.travis.yml +0 -17
  33. data/lib/marc_countries.rb +0 -387
  34. data/lib/stanford-mods/geo_utils.rb +0 -28
  35. data/lib/stanford-mods/name.rb +0 -80
  36. data/lib/stanford-mods/origin_info.rb +0 -489
  37. data/lib/stanford-mods/searchworks.rb +0 -333
  38. data/lib/stanford-mods/searchworks_subjects.rb +0 -196
  39. data/spec/date_parsing_spec.rb +0 -905
@@ -1,100 +1,50 @@
1
1
  module Stanford
2
2
  module Mods
3
- # Parsing date strings
4
- # TODO: this should become its own gem and/or become eclipsed by/merged with timetwister gem
5
- # When this is "gemified":
6
- # - we may want an integer or date sort field as well as lexical
7
- # - we could add methods like my_date.bc?
8
3
  class DateParsing
9
- # get display value for year, generally an explicit year or "17th century" or "5 B.C." or "1950s" or '845 A.D.'
10
- # @return [String, nil] display value for year if we could parse one, nil otherwise
11
- def self.date_str_for_display(date_str)
12
- DateParsing.new(date_str).date_str_for_display
13
- end
14
-
15
- # get year as Integer if we can parse date_str to get a year.
16
- # @return [Integer, nil] Integer year if we could parse one, nil otherwise
17
- def self.year_int_from_date_str(date_str)
18
- DateParsing.new(date_str).year_int_from_date_str
19
- end
20
-
21
- # get String sortable value year if we can parse date_str to get a year.
22
- # SearchWorks currently uses a string field for pub date sorting; thus so does Spotlight.
23
- # The values returned must *lexically* sort in chronological order, so the B.C. dates are tricky
24
- # @return [String, nil] String sortable year if we could parse one, nil otherwise
25
- # note that these values must *lexically* sort to create a chronological sort.
26
- def self.sortable_year_string_from_date_str(date_str)
27
- DateParsing.new(date_str).sortable_year_string_from_date_str
28
- end
29
-
30
- # true if the year is between -999 and (current year + 1)
31
- # @param [String] year_str String containing a date in format: -yyy, -yy, -y, y, yy, yyy, yyyy
32
- # @return [Boolean] true if the year is between -999 and (current year + 1); false otherwise
33
- def self.year_str_valid?(year_str)
34
- return false unless year_str && (year_str.match(/^\d{1,4}$/) || year_str.match(/^-\d{1,3}$/))
35
-
36
- (-1000 < year_str.to_i) && (year_str.to_i < Date.today.year + 2)
37
- end
38
-
39
4
  # true if the year is between -9999 and (current year + 1)
40
5
  # @return [Boolean] true if the year is between -9999 and (current year + 1); false otherwise
41
6
  def self.year_int_valid?(year)
42
7
  return false unless year.is_a? Integer
43
8
 
44
- (-1000 < year.to_i) && (year < Date.today.year + 2)
9
+ (year < Date.today.year + 2)
45
10
  end
46
11
 
47
- attr_reader :orig_date_str
12
+ attr_reader :xml
48
13
 
49
- def initialize(date_str)
50
- @orig_date_str = date_str
51
- @orig_date_str.freeze
14
+ def initialize(xml)
15
+ @xml = xml
52
16
  end
53
17
 
54
- BRACKETS_BETWEEN_DIGITS_REXEXP = Regexp.new('\d[' + Regexp.escape('[]') + ']\d')
55
-
56
18
  # get display value for year, generally an explicit year or "17th century" or "5 B.C." or "1950s" or '845 A.D.'
57
19
  # @return [String, nil] String value for year if we could parse one, nil otherwise
58
20
  def date_str_for_display
59
- return if orig_date_str == '0000-00-00' # shpc collection has these useless dates
60
- # B.C. first in case there are 4 digits, e.g. 1600 B.C.
61
- return display_str_for_bc if orig_date_str.match(BC_REGEX)
62
- # decade next in case there are 4 digits, e.g. 1950s
63
- return display_str_for_decade if orig_date_str.match(DECADE_4CHAR_REGEXP) || orig_date_str.match(DECADE_S_REGEXP)
64
-
65
- result = sortable_year_for_yyyy_or_yy
66
- unless result
67
- # try removing brackets between digits in case we have 169[5] or [18]91
68
- no_brackets = remove_brackets
69
- return DateParsing.new(no_brackets).date_str_for_display if no_brackets
70
- end
71
- # parsing below this line gives string inapprop for year_str_valid?
72
- unless self.class.year_str_valid?(result)
73
- result = display_str_for_century
74
- result ||= display_str_for_early_numeric
21
+ date = xml&.as_object&.date
22
+ date = date.min || date.max if date.is_a?(EDTF::Epoch) || date.is_a?(EDTF::Interval)
23
+
24
+ return case xml.as_object.precision
25
+ when :century
26
+ return "#{(date.to_s[0..1].to_i + 1).ordinalize} century"
27
+ when :decade
28
+ return "#{date.year}s"
29
+ when :unknown
30
+ xml.text
31
+ else
32
+ if !self.class.year_int_valid? date.year
33
+ xml.text
34
+ elsif date.year < 1
35
+ "#{date.year.abs + 1} B.C."
36
+ elsif date.year < 1000
37
+ "#{date.year} A.D."
38
+ else
39
+ date.year.to_s
40
+ end
75
41
  end
76
- # remove leading 0s from early dates
77
- result = "#{result.to_i} A.D." if result && result.match(/^0\d+$/)
78
- result
79
42
  end
80
43
 
81
44
  # get Integer year if we can parse date_str to get a year.
82
45
  # @return [Integer, nil] Integer year if we could parse one, nil otherwise
83
46
  def year_int_from_date_str
84
- return if orig_date_str == '0000-00-00' # shpc collection has these useless dates
85
- # B.C. first in case there are 4 digits, e.g. 1600 B.C.
86
- return sortable_year_int_for_bc if orig_date_str.match(BC_REGEX)
87
-
88
- result = sortable_year_for_yyyy_or_yy
89
- result ||= sortable_year_for_decade # 19xx or 20xx
90
- result ||= sortable_year_for_century
91
- result ||= sortable_year_int_for_early_numeric
92
- unless result
93
- # try removing brackets between digits in case we have 169[5] or [18]91
94
- no_brackets = remove_brackets
95
- return DateParsing.new(no_brackets).year_int_from_date_str if no_brackets
96
- end
97
- result.to_i if result && self.class.year_int_valid?(result.to_i)
47
+ xml&.as_object&.as_range&.first&.year
98
48
  end
99
49
 
100
50
  # get String sortable value year if we can parse date_str to get a year.
@@ -103,224 +53,18 @@ module Stanford
103
53
  # @return [String, nil] String sortable year if we could parse one, nil otherwise
104
54
  # note that these values must *lexically* sort to create a chronological sort.
105
55
  def sortable_year_string_from_date_str
106
- return if orig_date_str == '0000-00-00' # shpc collection has these useless dates
107
- # B.C. first in case there are 4 digits, e.g. 1600 B.C.
108
- return sortable_year_str_for_bc if orig_date_str.match(BC_REGEX)
109
-
110
- result = sortable_year_for_yyyy_or_yy
111
- result ||= sortable_year_for_decade # 19xx or 20xx
112
- result ||= sortable_year_for_century
113
- result ||= sortable_year_str_for_early_numeric
114
- unless result
115
- # try removing brackets between digits in case we have 169[5] or [18]91
116
- no_brackets = remove_brackets
117
- return DateParsing.new(no_brackets).sortable_year_string_from_date_str if no_brackets
118
- end
119
- result if self.class.year_str_valid?(result)
120
- end
121
-
122
- # get String sortable value year if we can parse date_str to get a year.
123
- # @return [String, nil] String sortable year if we could parse one, nil otherwise
124
- # note that these values must *lexically* sort to create a chronological sort.
125
- def sortable_year_for_yyyy_or_yy
126
- # most date strings have a four digit year
127
- result = sortable_year_for_yyyy
128
- result ||= sortable_year_for_yy # 19xx or 20xx
129
- result
130
- end
131
-
132
- # removes brackets between digits such as 169[5] or [18]91
133
- def remove_brackets
134
- orig_date_str.delete('[]') if orig_date_str.match(BRACKETS_BETWEEN_DIGITS_REXEXP)
135
- end
136
-
137
- # looks for 4 consecutive digits in orig_date_str and returns first occurrence if found
138
- # @return [String, nil] 4 digit year (e.g. 1865, 0950) if orig_date_str has yyyy, nil otherwise
139
- def sortable_year_for_yyyy
140
- matches = orig_date_str.match(/\d{4}/) if orig_date_str
141
- matches.to_s if matches
142
- end
143
-
144
- # returns 4 digit year as String if we have a x/x/yy or x-x-yy pattern
145
- # note that these are the only 2 digit year patterns found in our actual date strings in MODS records
146
- # we use 20 as century digits unless it is greater than current year:
147
- # 1/1/15 -> 2015
148
- # 1/1/25 -> 1925
149
- # @return [String, nil] 4 digit year (e.g. 1865, 0950) if orig_date_str matches pattern, nil otherwise
150
- def sortable_year_for_yy
151
- return unless orig_date_str
152
-
153
- slash_matches = orig_date_str.match(/\d{1,2}\/\d{1,2}\/\d{2}/)
154
- if slash_matches
155
- date_obj = Date.strptime(orig_date_str, '%m/%d/%y')
156
- else
157
- hyphen_matches = orig_date_str.match(/\d{1,2}-\d{1,2}-\d{2}/)
158
- date_obj = Date.strptime(orig_date_str, '%m-%d-%y') if hyphen_matches
159
- end
160
- if date_obj && date_obj > Date.today
161
- date_obj = Date.new(date_obj.year - 100, date_obj.month, date_obj.mday)
162
- end
163
- date_obj.year.to_s if date_obj
164
- rescue ArgumentError
165
- nil # explicitly want nil if date won't parse
166
- end
167
-
168
- DECADE_4CHAR_REGEXP = Regexp.new('(^|\D)\d{3}[u\-?x]')
56
+ return unless xml&.as_object&.date
169
57
 
170
- # get first year of decade (as String) if we have: yyyu, yyy-, yyy? or yyyx pattern
171
- # note that these are the only decade patterns found in our actual date strings in MODS records
172
- # @return [String, nil] 4 digit year (e.g. 1860, 1950) if orig_date_str matches pattern, nil otherwise
173
- def sortable_year_for_decade
174
- decade_matches = orig_date_str.match(DECADE_4CHAR_REGEXP) if orig_date_str
175
- changed_to_zero = decade_matches.to_s.tr('u\-?x', '0') if decade_matches
176
- DateParsing.new(changed_to_zero).sortable_year_for_yyyy if changed_to_zero
177
- end
58
+ date = xml.as_object.date
178
59
 
179
- DECADE_S_REGEXP = Regexp.new('\d{3}0\'?s')
180
-
181
- # get, e.g. 1950s, if we have: yyyu, yyy-, yyy? or yyyx pattern or yyy0s or yyy0's
182
- # note that these are the only decade patterns found in our actual date strings in MODS records
183
- # @return [String, nil] 4 digit year with s (e.g. 1860s, 1950s) if orig_date_str matches pattern, nil otherwise
184
- def display_str_for_decade
185
- decade_matches = orig_date_str.match(DECADE_4CHAR_REGEXP) if orig_date_str
186
- if decade_matches
187
- changed_to_zero = decade_matches.to_s.tr('u\-?x', '0') if decade_matches
188
- zeroth_year = DateParsing.new(changed_to_zero).sortable_year_for_yyyy if changed_to_zero
189
- return "#{zeroth_year}s" if zeroth_year
60
+ if date.is_a?(EDTF::Interval) && date.from.year < 1
61
+ (-1 * date.from.year - 1000).to_s
62
+ elsif date.is_a?(Date) && date.year < 1
63
+ (-1 * date.year - 1000).to_s
190
64
  else
191
- decade_matches = orig_date_str.match(DECADE_S_REGEXP) if orig_date_str
192
- return decade_matches.to_s.tr("'", '') if decade_matches
193
- end
194
- end
195
-
196
- CENTURY_WORD_REGEXP = Regexp.new('(\d{1,2}).*century')
197
- CENTURY_4CHAR_REGEXP = Regexp.new('(\d{1,2})[u\-]{2}([^u\-]|$)')
198
-
199
- # get first year of century (as String) if we have: yyuu, yy--, yy--? or xxth century pattern
200
- # note that these are the only century patterns found in our actual date strings in MODS records
201
- # @return [String, nil] yy00 if orig_date_str matches pattern, nil otherwise; also nil if B.C. in pattern
202
- def sortable_year_for_century
203
- return unless orig_date_str
204
- return if orig_date_str =~ /B\.C\./
205
-
206
- century_matches = orig_date_str.match(CENTURY_4CHAR_REGEXP)
207
- if century_matches
208
- return $1 + '00' if $1.length == 2
209
- return '0' + $1 + '00' if $1.length == 1
210
- end
211
- century_str_matches = orig_date_str.match(CENTURY_WORD_REGEXP)
212
- if century_str_matches
213
- yy = ($1.to_i - 1).to_s
214
- return yy + '00' if yy.length == 2
215
- return '0' + yy + '00' if yy.length == 1
65
+ date.to_s[0..3]&.gsub('X', '-')
216
66
  end
217
67
  end
218
-
219
- # get display value for century (17th century) if we have: yyuu, yy--, yy--? or xxth century pattern
220
- # note that these are the only century patterns found in our actual date strings in MODS records
221
- # @return [String, nil] yy(th) Century if orig_date_str matches pattern, nil otherwise; also nil if B.C. in pattern
222
- def display_str_for_century
223
- return unless orig_date_str
224
- return if orig_date_str =~ /B\.C\./
225
-
226
- century_str_matches = orig_date_str.match(CENTURY_WORD_REGEXP)
227
- return century_str_matches.to_s if century_str_matches
228
-
229
- century_matches = orig_date_str.match(CENTURY_4CHAR_REGEXP)
230
- if century_matches
231
- require 'active_support/core_ext/integer/inflections'
232
- return "#{($1.to_i + 1).ordinalize} century"
233
- end
234
- end
235
-
236
- BC_REGEX = Regexp.new('(\d{1,4}).*' + Regexp.escape('B.C.'))
237
-
238
- # get String sortable value for B.C. if we have B.C. pattern
239
- # note that these values must *lexically* sort to create a chronological sort.
240
- # We know our data does not contain B.C. dates older than 999, so we can make them
241
- # lexically sort by subtracting 1000. So we get:
242
- # -700 for 300 B.C., -750 for 250 B.C., -800 for 200 B.C., -801 for 199 B.C.
243
- # @return [String, nil] String sortable -ddd if B.C. in pattern; nil otherwise
244
- def sortable_year_str_for_bc
245
- bc_matches = orig_date_str.match(BC_REGEX) if orig_date_str
246
- ($1.to_i - 1000).to_s if bc_matches
247
- end
248
-
249
- # get Integer sortable value for B.C. if we have B.C. pattern
250
- # @return [Integer, nil] Integer sortable -ddd if B.C. in pattern; nil otherwise
251
- def sortable_year_int_for_bc
252
- bc_matches = orig_date_str.match(BC_REGEX) if orig_date_str
253
- "-#{$1}".to_i if bc_matches
254
- end
255
-
256
- # get display value for B.C. if we have B.C. pattern
257
- # @return [String, nil] ddd B.C. if ddd B.C. in pattern; nil otherwise
258
- def display_str_for_bc
259
- bc_matches = orig_date_str.match(BC_REGEX) if orig_date_str
260
- bc_matches.to_s if bc_matches
261
- end
262
-
263
- EARLY_NUMERIC = Regexp.new('^\-?\d{1,3}$')
264
-
265
- # get String sortable value from date String containing yyy, yy, y, -y, -yy, -yyy
266
- # note that these values must *lexically* sort to create a chronological sort.
267
- # We know our data does not contain negative dates older than -999, so we can make them
268
- # lexically sort by subtracting 1000. So we get:
269
- # -983 for -17, -999 for -1, 0000 for 0, 0001 for 1, 0017 for 17
270
- # @return [String, nil] String sortable -ddd if orig_date_str matches pattern; nil otherwise
271
- def sortable_year_str_for_early_numeric
272
- return unless orig_date_str.match(EARLY_NUMERIC)
273
-
274
- if orig_date_str =~ /^\-/
275
- # negative number becomes x - 1000 for sorting; -005 for -995
276
- num = orig_date_str[1..-1].to_i - 1000
277
- return '-' + num.to_s[1..-1].rjust(3, '0')
278
- else
279
- return orig_date_str.rjust(4, '0')
280
- end
281
- end
282
-
283
- # get Integer sortable value from date String containing yyy, yy, y, -y, -yy, -yyy, -yyyy
284
- # @return [Integer, nil] Integer sortable -ddd if orig_date_str matches pattern; nil otherwise
285
- def sortable_year_int_for_early_numeric
286
- return orig_date_str.to_i if orig_date_str.match(EARLY_NUMERIC)
287
-
288
- orig_date_str.to_i if orig_date_str =~ /^-\d{4}$/
289
- end
290
-
291
- # get display value for date String containing yyy, yy, y, -y, -yy, -yyy
292
- # negative number strings will be changed to B.C. strings
293
- # note that there is no year 0: from https://en.wikipedia.org/wiki/Anno_Domini
294
- # "AD counting years from the start of this epoch, and BC denoting years before the start of the era.
295
- # There is no year zero in this scheme, so the year AD 1 immediately follows the year 1 BC."
296
- # See also https://consul.stanford.edu/display/chimera/MODS+display+rules for etdf
297
- def display_str_for_early_numeric
298
- return unless orig_date_str.match(EARLY_NUMERIC)
299
- # return 1 B.C. when the date is 0 since there is no 0 year
300
- return '1 B.C.' if orig_date_str == '0'
301
- # negative number becomes B.C.
302
- return "#{orig_date_str[1..-1].to_i + 1} B.C." if orig_date_str =~ /^\-/
303
-
304
- # remove leading 0s from early dates
305
- "#{orig_date_str.to_i} A.D."
306
- end
307
-
308
- # NOTE: while Date.parse() works for many dates, the *sortable_year_for_yyyy
309
- # actually works for nearly all those cases and a lot more besides. Trial and error
310
- # with an extensive set of test data culled from actual date strings in our MODS records
311
- # has made this method bogus.
312
- # @return [String, nil] sortable 4 digit year (e.g. 1865, 0950) if orig_date_str is parseable via ruby Date, nil otherwise
313
- def year_via_ruby_parsing
314
- return unless orig_date_str =~ /\d\d/ # need at least 2 digits
315
- # need more in string than only 2 digits
316
- return if orig_date_str.match(/^\d\d$/) || orig_date_str.match(/^\D*\d\d\D*$/)
317
- return if orig_date_str =~ /\d\s*B.C./ # skip B.C. dates
318
-
319
- date_obj = Date.parse(orig_date_str)
320
- date_obj.year.to_s
321
- rescue ArgumentError
322
- nil # explicitly want nil if date won't parse
323
- end
324
68
  end
325
69
  end
326
70
  end