stanford-mods 2.6.4 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +1 -1
  3. data/lib/stanford-mods/{geo_spatial.rb → concerns/geo_spatial.rb} +3 -5
  4. data/lib/stanford-mods/concerns/name.rb +57 -0
  5. data/lib/stanford-mods/concerns/origin_info.rb +113 -0
  6. data/lib/stanford-mods/{physical_location.rb → concerns/physical_location.rb} +2 -2
  7. data/lib/stanford-mods/concerns/searchworks.rb +125 -0
  8. data/lib/stanford-mods/concerns/searchworks_subjects.rb +126 -0
  9. data/lib/stanford-mods/concerns/title.rb +87 -0
  10. data/lib/stanford-mods/coordinate.rb +24 -3
  11. data/lib/stanford-mods/date_parsing.rb +32 -289
  12. data/lib/stanford-mods/imprint.rb +170 -322
  13. data/lib/stanford-mods/record.rb +20 -0
  14. data/lib/stanford-mods/version.rb +1 -1
  15. data/lib/stanford-mods/{searchworks_languages.rb → vocabularies/searchworks_languages.rb} +0 -0
  16. data/lib/stanford-mods.rb +12 -11
  17. data/spec/fixtures/searchworks_imprint_data.rb +38 -39
  18. data/spec/fixtures/searchworks_pub_date_data.rb +7 -7
  19. data/spec/fixtures/spotlight_pub_date_data.rb +7 -7
  20. data/spec/geo_spatial_spec.rb +1 -6
  21. data/spec/imprint_spec.rb +263 -207
  22. data/spec/lib/stanford-mods/coordinate_spec.rb +3 -5
  23. data/spec/name_spec.rb +26 -230
  24. data/spec/origin_info_spec.rb +34 -300
  25. data/spec/searchworks_basic_spec.rb +1 -3
  26. data/spec/searchworks_pub_dates_spec.rb +0 -215
  27. data/spec/searchworks_spec.rb +0 -21
  28. data/spec/searchworks_subject_raw_spec.rb +106 -105
  29. data/spec/searchworks_subject_spec.rb +19 -55
  30. data/spec/searchworks_title_spec.rb +5 -5
  31. data/stanford-mods.gemspec +1 -1
  32. metadata +19 -15
  33. data/lib/marc_countries.rb +0 -387
  34. data/lib/stanford-mods/geo_utils.rb +0 -28
  35. data/lib/stanford-mods/name.rb +0 -80
  36. data/lib/stanford-mods/origin_info.rb +0 -489
  37. data/lib/stanford-mods/searchworks.rb +0 -333
  38. data/lib/stanford-mods/searchworks_subjects.rb +0 -196
  39. data/spec/date_parsing_spec.rb +0 -905
@@ -1,489 +0,0 @@
1
- require 'mods'
2
-
3
- # Parsing MODS /originInfo for Publication/Imprint data:
4
- # * pub year for date slider facet
5
- # * pub year for sorting
6
- # * pub year for single display value
7
- # * imprint info for display
8
- # *
9
- # These methods may be used by searchworks.rb file or by downstream apps
10
- module Stanford
11
- module Mods
12
- class Record < ::Mods::Record
13
# Publication year as an Integer, intended for numeric sorting.
# Delegates to single_pub_year with the :year_int strategy, so dateIssued elements are
# consulted first, then dateCreated, then dateCaptured.
# @param [Boolean] ignore_approximate true if approximate dates (per qualifier attribute)
#   should be ignored; false if approximate dates should be included
# @return [Integer, nil] publication year, or nil when no date element yields one
# @note for sorting: 5 B.C. => -5; 666 B.C. => -666
def pub_year_int(ignore_approximate = false)
  single_pub_year(ignore_approximate, :year_int)
end
22
-
23
# Publication year as a single String intended for lexical sorting.
# Delegates to single_pub_year with the :year_sort_str strategy, so dateIssued elements are
# consulted first, then dateCreated, then dateCaptured.
# @param [Boolean] ignore_approximate true if approximate dates (per qualifier attribute)
#   should be ignored; false if approximate dates should be included
# @return [String, nil] sortable publication year, or nil when no date element yields one
# @note for string sorting 5 B.C. = -5 => -995; 6 B.C. => -994, so 6 B.C. sorts before 5 B.C.
# @deprecated use pub_year_int
def pub_year_sort_str(ignore_approximate = false)
  single_pub_year(ignore_approximate, :year_sort_str)
end
33
-
34
# Publication year as a single String intended for display.
# Delegates to single_pub_year with the :year_display_str strategy, so dateIssued elements are
# consulted first, then dateCreated, then dateCaptured.
#
# Intended display conventions (carried over from the original notes):
#   0 < year < 1000: add A.D. suffix
#   year < 0: add B.C. suffix ('-5' => '5 B.C.', '700 B.C.' => '700 B.C.')
#   195u => 195x
#   19uu => 19xx
#   '7th century' => '7th century'
#   date ranges: still an open question
# @param [Boolean] ignore_approximate true if approximate dates (per qualifier attribute)
#   should be ignored; false if approximate dates should be included
# @return [String, nil] publication year for display, or nil when no date element yields one
def pub_year_display_str(ignore_approximate = false)
  # TODO: want range displayed when start and end points
  # TODO: also want best year in year_isi fields
  #   get_main_title_date
  #   https://github.com/sul-dlss/SearchWorks/blob/7d4d870a9d450fed8b081c38dc3dbd590f0b706e/app/helpers/results_document_helper.rb#L8-L46
  #
  # Solr year fields and their display labels:
  #   "publication_year_isi"   => "Publication date",  <-- do it already
  #   "beginning_year_isi"     => "Beginning date",
  #   "earliest_year_isi"      => "Earliest date",
  #   "earliest_poss_year_isi" => "Earliest possible date",
  #   "ending_year_isi"        => "Ending date",
  #   "latest_year_isi"        => "Latest date",
  #   "latest_poss_year_isi"   => "Latest possible date",
  #   "production_year_isi"    => "Production date",
  #   "original_year_isi"      => "Original date",
  #   "copyright_year_isi"     => "Copyright date"
  #
  #   "creation_year_isi"      => "Creation date",  <-- do it already
  #   "release_year_isi"       => "Release date",
  #   "reprint_year_isi"       => "Reprint/reissue date",
  #   "other_year_isi"         => "Date",
  single_pub_year(ignore_approximate, :year_display_str)
end
71
-
72
# Imprint information for display, built by wrapping this record's origin_info
# in a Stanford::Mods::Imprint and asking it for its display string.
# @return [String] single String containing imprint information for display
def imprint_display_str
  Stanford::Mods::Imprint.new(origin_info).display_str
end
77
-
78
# Display form of the publication year chosen from the passed date elements.
# A single keyDate element wins (via date_parsing_result); otherwise the element with the
# earliest parseable year is formatted with DateParsing.date_str_for_display.
# @param [Array<Nokogiri::XML::Element>] date_el_array the elements from which to select a pub date
# @return [String, nil] publication year for display, or nil if nothing could be parsed
def year_display_str(date_el_array)
  from_key_date = date_parsing_result(date_el_array, :date_str_for_display)
  return from_key_date if from_key_date

  # earliest_year_str yields [sortable year, original string]; only the original is formatted
  _sortable, earliest_original = self.class.earliest_year_str(date_el_array)
  return unless earliest_original

  DateParsing.date_str_for_display(earliest_original)
end
89
-
90
# Integer publication year chosen from the passed date elements.
# A single keyDate element wins (via date_parsing_result); otherwise the earliest
# parseable year among the elements is used.
# @param [Array<Nokogiri::XML::Element>] date_el_array the elements from which to select a pub date
# @return [Integer, nil] publication year as an Integer, or nil if nothing could be parsed
def year_int(date_el_array)
  date_parsing_result(date_el_array, :year_int_from_date_str) || begin
    # earliest_year_int yields [year, original string]; only the year is needed here
    earliest, _original = self.class.earliest_year_int(date_el_array)
    earliest if earliest
  end
end
101
-
102
# Lexically sortable publication year chosen from the passed date elements.
# A single keyDate element wins (via date_parsing_result); otherwise the earliest
# sortable year string among the elements is used.
# @param [Array<Nokogiri::XML::Element>] date_el_array the elements from which to select a pub date
# @return [String, nil] publication year for lexical sorting, or nil if nothing could be parsed
def year_sort_str(date_el_array)
  date_parsing_result(date_el_array, :sortable_year_string_from_date_str) || begin
    # earliest_year_str yields [sortable year, original string]; only the sortable form is needed
    earliest_sortable, _original = self.class.earliest_year_str(date_el_array)
    earliest_sortable if earliest_sortable
  end
end
113
-
114
# The /originInfo/dateCreated elements of this MODS record.
# @param [Boolean] ignore_approximate true if approximate dates (per qualifier attribute)
#   should be excluded; false if approximate dates should be included
# @return [Array<Nokogiri::XML::Element>]
def date_created_elements(ignore_approximate = false)
  created = mods_ng_xml.origin_info.dateCreated
  ignore_approximate ? self.class.remove_approximate(created) : created.to_a
end
124
-
125
# The /originInfo/dateIssued elements of this MODS record.
# @param [Boolean] ignore_approximate true if approximate dates (per qualifier attribute)
#   should be excluded; false if approximate dates should be included
# @return [Array<Nokogiri::XML::Element>]
def date_issued_elements(ignore_approximate = false)
  issued = mods_ng_xml.origin_info.dateIssued
  ignore_approximate ? self.class.remove_approximate(issued) : issued.to_a
end
135
-
136
# Pick the one element flagged keyDate="yes" from a set of date elements.
# @param [Array<Nokogiri::XML::Element>] elements date elements to inspect
# @return [Nokogiri::XML::Element, nil] the element whose keyDate attribute is "yes";
#   nil when no element is flagged, or when more than one is (the choice would be ambiguous)
def self.keyDate(elements)
  flagged = elements.select { |el| el['keyDate'] == 'yes' }
  flagged.size == 1 ? flagged.first : nil
end
144
-
145
# Drop the date elements whose qualifier attribute is 'approximate' or 'questionable'
# (as decided by date_is_approximate?).
# @param [Nokogiri::XML::NodeSet<Nokogiri::XML::Element>] nodeset set of date elements
# @return [Array<Nokogiri::XML::Element>] the date elements that are not approximate
def self.remove_approximate(nodeset)
  nodeset.select { |date_el| date_el && !date_is_approximate?(date_el) }
end
152
-
153
# Whether a MODS date element is flagged as an approximate or questionable date.
# NOTE: legal values for the qualifier attribute of MODS date elements are
#   'approximate', 'inferred' or 'questionable'
# @param [Nokogiri::XML::Element] date_element MODS date element
# @return [Boolean] true if the qualifier attribute is "approximate" or "questionable";
#   false if there is no qualifier, if it is 'inferred' or some other value, or if
#   date_element does not support attribute lookup via []
def self.date_is_approximate?(date_element)
  return false unless date_element.respond_to?('[]')

  %w[approximate questionable].include?(date_element['qualifier'])
end
162
-
163
# Earliest parseable year, as an Integer, among the passed date elements.
# Thin wrapper around earliest_year using DateParsing's :year_int_from_date_str.
# @param [Array<Nokogiri::XML::Element>] date_el_array the elements from which to select a pub date
# @return [Array, nil] two values, or nil if no element could be parsed:
#   the first is the Integer value of the earliest year;
#   the second is the original String value of the chosen element
def self.earliest_year_int(date_el_array)
  earliest_year(date_el_array, :year_int_from_date_str)
end
171
-
172
# Earliest parseable year, as a lexically sortable String, among the passed date elements.
# Thin wrapper around earliest_year using DateParsing's :sortable_year_string_from_date_str.
# @param [Array<Nokogiri::XML::Element>] date_el_array the elements from which to select a pub date
# @return [Array<String>, nil] two String values, or nil if no element could be parsed:
#   the first is the lexically sortable String value of the earliest year;
#   the second is the original String value of the chosen element
def self.earliest_year_str(date_el_array)
  earliest_year(date_el_array, :sortable_year_string_from_date_str)
end
180
-
181
# Single pub date value in the flavor selected by method_sym.
# Sources are tried in order and the first one producing a value wins:
# dateIssued, then dateCreated, then dateCaptured. Within each source the method named by
# method_sym decides which element to use.
# @param [Boolean] ignore_approximate true if approximate dates (per qualifier attribute)
#   should be ignored; false if approximate dates should be included.
#   Not applied to dateCaptured, which is always passed unfiltered.
# @param [Symbol] method_sym name of the instance method (e.g. :year_int) that turns
#   an Array of date elements into a year
# @return [String, Integer, nil] publication year as String or Integer, nil if none found
def single_pub_year(ignore_approximate, method_sym)
  # the last fallback, dateCaptured, is for web archive seed records
  send(method_sym, date_issued_elements(ignore_approximate)) ||
    send(method_sym, date_created_elements(ignore_approximate)) ||
    send(method_sym, mods_ng_xml.origin_info.dateCaptured.to_a)
end
194
-
195
# Parse the content of the single keyDate element, if there is exactly one.
# Only the keyDate case is handled here; callers fall back to the earliest parseable
# date themselves when this returns nil.
# @param [Array<Nokogiri::XML::Element>] date_el_array the elements from which to select a pub date
# @param [Symbol] method_sym method name in DateParsing, as a symbol
# @return [Integer, String, nil] year as a String or Integer, depending on method_sym;
#   nil when the array is empty or has no single keyDate element
def date_parsing_result(date_el_array, method_sym)
  key_date_el = date_el_array.empty? ? nil : self.class.keyDate(date_el_array)
  return unless key_date_el

  DateParsing.send(method_sym, key_date_el.content)
end
207
- # temporarily use this technique to mark methods private until we get rid of old date parsing methods below
208
- private :single_pub_year, :date_parsing_result
209
-
210
class << self
  private

  # Find the earliest parseable year among the passed date elements.
  # Each element's content is run through the DateParsing method named by method_sym;
  # elements that yield nothing are skipped. When several elements parse to the same
  # year, the content of the last such element is the one reported.
  # @param [Array<Nokogiri::XML::Element>] date_el_array the elements from which to select a pub date
  # @param [Symbol] method_sym method name in DateParsing, as a symbol
  # @return [Array<String,Integer>, nil] two values, or nil if no element could be parsed:
  #   - first is earliest date either as lexically sortable String value or the Integer, depending on method_sym
  #   - second is the original String value of the chosen element
  def earliest_year(date_el_array, method_sym)
    originals_by_year = {}
    date_el_array.each do |el|
      year = DateParsing.send(method_sym, el.content)
      originals_by_year[year] = el.content if year
    end

    # min is nil for an empty collection, so no emptiness check (and no ActiveSupport) is needed
    earliest = originals_by_year.keys.min
    return unless earliest

    [earliest, originals_by_year[earliest]]
  end
end
229
-
230
- # ---- old date parsing methods used downstream of gem; will be deprecated/replaced with new date parsing methods
231
-
232
# Values found at originInfo/place/placeTerm, as returned by term_values.
def place
  term_values(%i[origin_info place placeTerm])
end
235
-
236
# Values for the pub date facet. This is less strict than the 4 year date requirements for pub_date
#   Jan 2016: used to populate Solr pub_date field for Spotlight and SearchWorks
#     Spotlight: pub_date field should be replaced by pub_year_w_approx_isi and pub_year_no_approx_isi
#     SearchWorks: pub_date field used for display in search results and show view; for sorting nearby-on-shelf
#       these could be done with more approp fields/methods (pub_year_int for sorting; new pub year methods to populate field)
# TODO: prob should deprecate this in favor of pub_year_display_str;
#   need head-to-head testing with pub_year_display_str
#
# Translates the internal pub_date encodings into display text:
#   '-700' (offset B.C. year)  => '300 B.C.'
#   '19--' / '8--' (century)   => '20th century' / '9th century'
#   anything else              => returned unchanged
# @return [String, nil] value for the pub date facet, nil when there is no pub_date
def pub_date_facet
  year = pub_date
  return nil unless year
  # B.C. years are stored as (year - 1000); undo the offset for display
  return "#{year.to_i + 1000} B.C." if year.start_with?('-')
  return year unless year.include?('--')

  century = year[0, 2].to_i + 1
  # English ordinals: 11th-13th are irregular, otherwise the last digit decides (21st, 22nd, 23rd)
  suffix =
    if (11..13).cover?(century % 100)
      'th'
    else
      { 1 => 'st', 2 => 'nd', 3 => 'rd' }.fetch(century % 10, 'th')
    end
  "#{century}#{suffix} century"
end
251
-
252
# Creates a date suitable for sorting. Guaranteed to be 4 characters or nil.
# Three-character values get a leading '0' and any '--' century placeholder becomes '00'
# (e.g. '965' => '0965', '8--' => '0800', '19--' => '1900').
# @return [String, nil] 4 character sortable value, nil when there is no pub_date
# @raise [RuntimeError] if the computed value is not exactly 4 characters long
# @deprecated use pub_year_int, or pub_year_sort_str if you must have a string (why?)
def pub_date_sort
  year = pub_date
  return nil unless year

  padded = year.length == 3 ? "0#{year}" : year
  sortable = padded.gsub('--', '00')
  raise "pub_date_sort was about to return a non 4 digit value #{sortable}!" if sortable.length != 4

  sortable
end
264
-
265
- # For the date display only, the first place to look is in the dates without encoding=marc array.
266
- # If no such dates, select the first date in the dates_marc_encoding array. Otherwise return nil
267
- # @return [String] value for the pub_date_display Solr field for this document or nil if none
268
- # @deprecated DO NOT USE: this is no longer used in SW, Revs or Spotlight Jan 2016
269
- def pub_date_display
270
- return dates_no_marc_encoding.first unless dates_no_marc_encoding.empty?
271
- return dates_marc_encoding.first unless dates_marc_encoding.empty?
272
-
273
- nil
274
- end
275
-
276
- # old date parsing protected methods to be deprecated/replaced with new methods (see also DateParsing)
277
-
278
- protected
279
-
280
# The year the object was published.
# The result is cached in @pub_year; an empty String is cached to record
# "already looked, nothing found" so the parsing is not repeated.
# @return [String, nil] 4 character year or nil if no valid date was found
def pub_year
  # use the cached year if there is one ('' means a previous lookup found nothing)
  return(@pub_year == '' ? nil : @pub_year) if @pub_year

  # pub_dates is nil when the record has no dateIssued/dateCreated values
  dates = (pub_dates || []).map do |f_date|
    # remove ? and []
    if f_date.length == 4 && f_date.end_with?('?')
      f_date.tr('?', '0') # e.g. '186?' => '1860'
    else
      f_date.delete('?[]')
    end
  end

  # try to find a date starting with the most normal date formats and progressing to more wonky ones
  @pub_year = get_plain_four_digit_year(dates) ||
              get_u_year(dates) || # Check for years in u notation, e.g., 198u
              get_double_digit_century(dates) ||
              get_bc_year(dates) ||
              get_three_digit_year(dates) ||
              get_single_digit_century(dates)
  return @pub_year if @pub_year

  @pub_year = ''
  nil
end
312
- alias_method :pub_date, :pub_year
313
-
314
- # For the date indexing, sorting and faceting, the first place to look is in the dates with encoding=marc array.
315
- # If that doesn't exist, look in the dates without encoding=marc array. Otherwise return nil
316
- # @return [Array<String>] values for the date Solr field for this document or nil if none
317
- def pub_dates
318
- return dates_marc_encoding unless dates_marc_encoding.empty?
319
- return dates_no_marc_encoding unless dates_no_marc_encoding.empty?
320
-
321
- nil
322
- end
323
-
324
# Lazily populated by parse_dates_from_originInfo on first use.
# @return [Array<String>] dates from dateIssued and dateCreated tags from origin_info with encoding="marc"
def dates_marc_encoding
  parse_dates_from_originInfo unless @dates_marc_encoding
  @dates_marc_encoding
end
331
-
332
# Lazily populated by parse_dates_from_originInfo on first use.
# @return [Array<String>] dates from dateIssued and dateCreated tags from origin_info with encoding not "marc"
def dates_no_marc_encoding
  parse_dates_from_originInfo unless @dates_no_marc_encoding
  @dates_no_marc_encoding
end
339
-
340
# Populate @dates_marc_encoding and @dates_no_marc_encoding from the dateIssued and
# dateCreated tags of origin_info: the text of each date goes into the first list when its
# encoding is "marc" and into the second otherwise. dateIssued values come before
# dateCreated values within each list. Both lists are reset on every call.
def parse_dates_from_originInfo
  @dates_marc_encoding = []
  @dates_no_marc_encoding = []

  file_by_encoding = lambda do |date_node|
    bucket = date_node.encoding == "marc" ? @dates_marc_encoding : @dates_no_marc_encoding
    bucket << date_node.text
  end

  origin_info.dateIssued.each(&file_by_encoding)
  origin_info.dateCreated.each(&file_by_encoding)
end
360
-
361
# Whether Kernel#Integer accepts the object.
# @param [Object] object value to test
# @return [Boolean] true if Integer(object) succeeds; false if it raises any StandardError
def is_number?(object)
  Integer(object)
  true
rescue StandardError
  false
end
364
-
365
# Whether Date.parse accepts the object.
# @param [Object] object value to test
# @return [Boolean] true if Date.parse(object) succeeds; false if it raises any StandardError
def is_date?(object)
  Date.parse(object)
  true
rescue StandardError
  false
end
368
-
369
# Get a 4 digit year like 1865 from an array of date strings.
# The dates are examined in order and the first string containing any 4 digit run decides:
#   - exactly one run: that run is the year
#   - several runs: prefer the first one marked CE (e.g. '1865 CE' or '1865-6 CE'),
#     otherwise take the first run
# Strings without a 4 digit run are skipped so later dates still get a chance.
# @param [Array<String>] dates an array of potential year strings
# @return [String, nil] the 4 digit year, or nil if no string contains one
def get_plain_four_digit_year(dates)
  dates.each do |f_date|
    years = f_date.scan(/\d{4}/)
    next if years.empty?

    ce_year = nil
    if years.length == 1
      @pub_year = years.first
    else
      # when there are multiple matches, check for ones with CE after them.
      # =~ is tested for truthiness so that a match at index 0 still counts.
      ce_year = years.find { |year| f_date =~ /#{year}(?: |...)CE/ }
      @pub_year = ce_year if ce_year
    end
    return ce_year || years.first
  end
  nil
end
392
-
393
# Get a 3 digit year like 965 from the date array: the first run of three digits
# in the first string that has one.
# @param [Array<String>] dates an array of potential year strings
# @return [String, nil] the three digits found, or nil if no string contains any
def get_three_digit_year(dates)
  dates.each do |f_date|
    digits = f_date[/\d{3}/]
    return digits if digits
  end
  nil
end
402
-
403
# Get the 3 digit B.C. year from the date array and return it offset by -1000,
# so '300 B.C.' becomes '-700'. Other methods translate it back for display;
# the offset form is good for sorting.
# @param [Array<String>] dates an array of potential year strings
# @return [String, nil] the offset year as a String, or nil if no B.C. year is found
def get_bc_year(dates)
  dates.each do |f_date|
    bc_text = f_date[/\d{3} B.C./]
    return (bc_text[0..2].to_i - 1000).to_s if bc_text
  end
  nil
end
416
-
417
- # get a single digit century like '9th century' from the date array
418
- # @param [Array<String>] dates an array of potential year strings
419
- # @return [String] y-- if we identify century digit in string
420
- def get_single_digit_century(dates)
421
- dates.each do |f_date|
422
- matches = f_date.scan(/\d{1}th/)
423
- next if matches.empty?
424
-
425
- if matches.length == 1
426
- @pub_year = (matches.first[0, 2].to_i - 1).to_s + '--'
427
- return @pub_year
428
- else
429
- # when there are multiple matches, check for ones with CE after them
430
- matches.each do |match|
431
- pos = f_date.index(Regexp.new(match + '...CE'))
432
- pos = pos ? pos.to_i : f_date.index(Regexp.new(match + ' century CE'))
433
- pos = pos ? pos.to_i : 0
434
- if f_date.include?(match + ' CE') || pos > 0
435
- @pub_year = (match[0, 1].to_i - 1).to_s + '--'
436
- return @pub_year
437
- end
438
- end
439
- end
440
- end
441
- nil
442
- end
443
-
444
- # get a double digit century like '12th century' from the date array
445
- # @param [Array<String>] dates an array of potential year strings
446
- # @return [String] yy-- if we identify century digits in string
447
- def get_double_digit_century(dates)
448
- dates.each do |f_date|
449
- matches = f_date.scan(/\d{2}th/)
450
- next if matches.empty?
451
-
452
- if matches.length == 1
453
- @pub_year = (matches.first[0, 2].to_i - 1).to_s + '--'
454
- return @pub_year
455
- else
456
- # when there are multiple matches, check for ones with CE after them
457
- matches.each do |match|
458
- pos = f_date.index(Regexp.new(match + '...CE'))
459
- pos = pos ? pos.to_i : f_date.index(Regexp.new(match + ' century CE'))
460
- pos = pos ? pos.to_i : 0
461
- if f_date.include?(match + ' CE') || pos > 0
462
- @pub_year = (match[0, 2].to_i - 1).to_s + '--'
463
- return @pub_year
464
- end
465
- end
466
- end
467
- end
468
- nil
469
- end
470
-
471
# Handle years written in "u" notation: for yyyu the u is replaced with 0 (yyy0),
# and for yyuu each u is replaced with '-' (yy--).
# A pattern only counts when it occurs exactly once in a string; yyyu is tried before yyuu,
# and the first string that qualifies decides.
# @param [Array<String>] dates looking for matches on yyyu or yyuu in these strings
# @return [String, nil] String of format yyy0 or yy--, or nil
def get_u_year(dates)
  dates.each do |f_date|
    # single digit u notation first, then double digit u notation
    [[/\d{3}u/, '0'], [/\d{2}u{2}/, '-']].each do |pattern, filler|
      hits = f_date.scan(pattern)
      return hits.first.tr('u', filler) if hits.length == 1
    end
  end
  nil
end
487
- end # class Record
488
- end
489
- end