stanford-mods 2.6.2 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +24 -0
  3. data/lib/stanford-mods/{geo_spatial.rb → concerns/geo_spatial.rb} +3 -5
  4. data/lib/stanford-mods/concerns/name.rb +57 -0
  5. data/lib/stanford-mods/concerns/origin_info.rb +113 -0
  6. data/lib/stanford-mods/{physical_location.rb → concerns/physical_location.rb} +2 -2
  7. data/lib/stanford-mods/concerns/searchworks.rb +125 -0
  8. data/lib/stanford-mods/concerns/searchworks_subjects.rb +126 -0
  9. data/lib/stanford-mods/concerns/title.rb +87 -0
  10. data/lib/stanford-mods/coordinate.rb +21 -3
  11. data/lib/stanford-mods/date_parsing.rb +32 -288
  12. data/lib/stanford-mods/imprint.rb +149 -325
  13. data/lib/stanford-mods/record.rb +20 -0
  14. data/lib/stanford-mods/version.rb +1 -1
  15. data/lib/stanford-mods/{searchworks_languages.rb → vocabularies/searchworks_languages.rb} +2 -0
  16. data/lib/stanford-mods.rb +13 -11
  17. data/spec/fixtures/searchworks_imprint_data.rb +38 -39
  18. data/spec/fixtures/searchworks_pub_date_data.rb +7 -7
  19. data/spec/fixtures/spotlight_pub_date_data.rb +7 -7
  20. data/spec/geo_spatial_spec.rb +1 -6
  21. data/spec/imprint_spec.rb +238 -207
  22. data/spec/name_spec.rb +28 -232
  23. data/spec/origin_info_spec.rb +34 -300
  24. data/spec/searchworks_basic_spec.rb +1 -3
  25. data/spec/searchworks_pub_dates_spec.rb +0 -215
  26. data/spec/searchworks_spec.rb +0 -21
  27. data/spec/searchworks_subject_raw_spec.rb +106 -105
  28. data/spec/searchworks_subject_spec.rb +19 -55
  29. data/spec/searchworks_title_spec.rb +5 -5
  30. data/stanford-mods.gemspec +1 -1
  31. metadata +24 -20
  32. data/.travis.yml +0 -17
  33. data/lib/marc_countries.rb +0 -387
  34. data/lib/stanford-mods/geo_utils.rb +0 -28
  35. data/lib/stanford-mods/name.rb +0 -80
  36. data/lib/stanford-mods/origin_info.rb +0 -489
  37. data/lib/stanford-mods/searchworks.rb +0 -333
  38. data/lib/stanford-mods/searchworks_subjects.rb +0 -196
  39. data/spec/date_parsing_spec.rb +0 -905
@@ -1,489 +0,0 @@
1
- require 'mods'
2
-
3
- # Parsing MODS /originInfo for Publication/Imprint data:
4
- # * pub year for date slider facet
5
- # * pub year for sorting
6
- # * pub year for single display value
7
- # * imprint info for display
8
- # *
9
- # These methods may be used by searchworks.rb file or by downstream apps
10
- module Stanford
11
- module Mods
12
- class Record < ::Mods::Record
13
- # return pub year as an Integer
14
- # prefer dateIssued (any) before dateCreated (any) before dateCaptured (any)
15
- # look for a keyDate and use it if there is one; otherwise pick earliest date
16
- # @param [Boolean] ignore_approximate true if approximate dates (per qualifier attribute) should be ignored; false if approximate dates should be included
17
- # @return [Integer] publication year as an Integer
18
- # @note for sorting: 5 B.C. => -5; 666 B.C. => -666
19
- def pub_year_int(ignore_approximate = false)
20
- single_pub_year(ignore_approximate, :year_int)
21
- end
22
-
23
- # return a single string intended for lexical sorting for pub date
24
- # prefer dateIssued (any) before dateCreated (any) before dateCaptured (any)
25
- # look for a keyDate and use it if there is one; otherwise pick earliest date
26
- # @param [Boolean] ignore_approximate true if approximate dates (per qualifier attribute) should be ignored; false if approximate dates should be included
27
- # @return [String] single String containing publication year for lexical sorting
28
- # @note for string sorting 5 B.C. = -5 => -995; 6 B.C. => -994, so 6 B.C. sorts before 5 B.C.
29
- # @deprecated use pub_year_int
30
- def pub_year_sort_str(ignore_approximate = false)
31
- single_pub_year(ignore_approximate, :year_sort_str)
32
- end
33
-
34
- # return a single string intended for display of pub year
35
- # 0 < year < 1000: add A.D. suffix
36
- # year < 0: add B.C. suffix. ('-5' => '5 B.C.', '700 B.C.' => '700 B.C.')
37
- # 195u => 195x
38
- # 19uu => 19xx
39
- # '-5' => '5 B.C.'
40
- # '700 B.C.' => '700 B.C.'
41
- # '7th century' => '7th century'
42
- # date ranges?
43
- # prefer dateIssued (any) before dateCreated (any) before dateCaptured (any)
44
- # look for a keyDate and use it if there is one; otherwise pick earliest date
45
- # @param [Boolean] ignore_approximate true if approximate dates (per qualifier attribute)
46
- # should be ignored; false if approximate dates should be included
47
- def pub_year_display_str(ignore_approximate = false)
48
- single_pub_year(ignore_approximate, :year_display_str)
49
-
50
- # TODO: want range displayed when start and end points
51
- # TODO: also want best year in year_isi fields
52
- # get_main_title_date
53
- # https://github.com/sul-dlss/SearchWorks/blob/7d4d870a9d450fed8b081c38dc3dbd590f0b706e/app/helpers/results_document_helper.rb#L8-L46
54
-
55
- # "publication_year_isi" => "Publication date", <-- do it already
56
- # "beginning_year_isi" => "Beginning date",
57
- # "earliest_year_isi" => "Earliest date",
58
- # "earliest_poss_year_isi" => "Earliest possible date",
59
- # "ending_year_isi" => "Ending date",
60
- # "latest_year_isi" => "Latest date",
61
- # "latest_poss_year_isi" => "Latest possible date",
62
- # "production_year_isi" => "Production date",
63
- # "original_year_isi" => "Original date",
64
- # "copyright_year_isi" => "Copyright date"} %>
65
-
66
- # "creation_year_isi" => "Creation date", <-- do it already
67
- # {}"release_year_isi" => "Release date",
68
- # {}"reprint_year_isi" => "Reprint/reissue date",
69
- # {}"other_year_isi" => "Date",
70
- end
71
-
72
- # @return [String] single String containing imprint information for display
73
- def imprint_display_str
74
- imp = Stanford::Mods::Imprint.new(origin_info)
75
- imp.display_str
76
- end
77
-
78
- # given the passed date elements, look for a single keyDate and use it if there is one;
79
- # otherwise pick earliest parseable date
80
- # @param [Array<Nokogiri::XML::Element>] date_el_array the elements from which to select a pub date
81
- # @return [String] single String containing publication year for display
82
- def year_display_str(date_el_array)
83
- result = date_parsing_result(date_el_array, :date_str_for_display)
84
- return result if result
85
-
86
- _ignore, orig_str_to_parse = self.class.earliest_year_str(date_el_array)
87
- DateParsing.date_str_for_display(orig_str_to_parse) if orig_str_to_parse
88
- end
89
-
90
- # given the passed date elements, look for a single keyDate and use it if there is one;
91
- # otherwise pick earliest parseable date
92
- # @param [Array<Nokogiri::XML::Element>] date_el_array the elements from which to select a pub date
93
- # @return [Integer] publication year as an Integer
94
- def year_int(date_el_array)
95
- result = date_parsing_result(date_el_array, :year_int_from_date_str)
96
- return result if result
97
-
98
- year_int, _ignore = self.class.earliest_year_int(date_el_array)
99
- year_int if year_int
100
- end
101
-
102
- # given the passed date elements, look for a single keyDate and use it if there is one;
103
- # otherwise pick earliest parseable date
104
- # @param [Array<Nokogiri::XML::Element>] date_el_array the elements from which to select a pub date
105
- # @return [String] single String containing publication year for lexical sorting
106
- def year_sort_str(date_el_array)
107
- result = date_parsing_result(date_el_array, :sortable_year_string_from_date_str)
108
- return result if result
109
-
110
- sortable_str, _ignore = self.class.earliest_year_str(date_el_array)
111
- sortable_str if sortable_str
112
- end
113
-
114
- # return /originInfo/dateCreated elements in MODS records
115
- # @param [Boolean] ignore_approximate true if approximate dates (per qualifier attribute)
116
- # should be excluded; false if approximate dates should be included
117
- # @return [Array<Nokogiri::XML::Element>]
118
- def date_created_elements(ignore_approximate = false)
119
- date_created_nodeset = mods_ng_xml.origin_info.dateCreated
120
- return self.class.remove_approximate(date_created_nodeset) if ignore_approximate
121
-
122
- date_created_nodeset.to_a
123
- end
124
-
125
- # return /originInfo/dateIssued elements in MODS records
126
- # @param [Boolean] ignore_approximate true if approximate dates (per qualifier attribute)
127
- # should be excluded; false if approximate dates should be included
128
- # @return [Array<Nokogiri::XML::Element>]
129
- def date_issued_elements(ignore_approximate = false)
130
- date_issued_nodeset = mods_ng_xml.origin_info.dateIssued
131
- return self.class.remove_approximate(date_issued_nodeset) if ignore_approximate
132
-
133
- date_issued_nodeset.to_a
134
- end
135
-
136
- # given a set of date elements, return the single element with attribute keyDate="yes"
137
- # or return nil if no elements have attribute keyDate="yes", or if multiple elements have keyDate="yes"
138
- # @param [Array<Nokogiri::XML::Element>] elements Array of date elements
139
- # @return [Nokogiri::XML::Element, nil] single date element with attribute keyDate="yes", or nil
140
- def self.keyDate(elements)
141
- keyDates = elements.select { |node| node["keyDate"] == 'yes' }
142
- keyDates.first if keyDates.size == 1
143
- end
144
-
145
- # remove Elements from NodeSet if they have a qualifier attribute of 'approximate' or 'questionable'
146
- # @param [Nokogiri::XML::NodeSet<Nokogiri::XML::Element>] nodeset set of date elements
147
- # @return [Array<Nokogiri::XML::Element>] the set of date elements minus any that
148
- # had a qualifier attribute of 'approximate' or 'questionable'
149
- def self.remove_approximate(nodeset)
150
- nodeset.select { |node| node unless date_is_approximate?(node) }
151
- end
152
-
153
- # NOTE: legal values for MODS date elements with attribute qualifier are
154
- # 'approximate', 'inferred' or 'questionable'
155
- # @param [Nokogiri::XML::Element] date_element MODS date element
156
- # @return [Boolean] true if date_element has a qualifier attribute of "approximate" or "questionable",
157
- # false if no qualifier attribute, or if attribute is 'inferred' or some other value
158
- def self.date_is_approximate?(date_element)
159
- qualifier = date_element["qualifier"] if date_element.respond_to?('[]')
160
- qualifier == 'approximate' || qualifier == 'questionable'
161
- end
162
-
163
- # get earliest parseable year (as an Integer) from the passed date elements
164
- # @param [Array<Nokogiri::XML::Element>] date_el_array the elements from which to select a pub date
165
- # @return two values:
166
- # the first is the Integer value of the earliest year;
167
- # the second is the original String value of the chosen element
168
- def self.earliest_year_int(date_el_array)
169
- earliest_year(date_el_array, :year_int_from_date_str)
170
- end
171
-
172
- # get earliest parseable year (as a String) from the passed date elements
173
- # @param [Array<Nokogiri::XML::Element>] date_el_array the elements from which to select a pub date
174
- # @return two String values:
175
- # the first is the lexically sortable String value of the earliest year;
176
- # the second is the original String value of the chosen element
177
- def self.earliest_year_str(date_el_array)
178
- earliest_year(date_el_array, :sortable_year_string_from_date_str)
179
- end
180
-
181
- # return a single value intended for pub date flavor indicated by method_sym
182
- # prefer dateIssued (any) before dateCreated (any) before dateCaptured (any)
183
- # look for a keyDate and use it if there is one; otherwise pick earliest date
184
- # @param [Boolean] ignore_approximate true if approximate dates (per qualifier attribute)
185
- # should be ignored; false if approximate dates should be included
186
- # @param [Symbol] method_sym method name in DateParsing, as a symbol
187
- # @return [String, Integer] publication year as String or Integer
188
- def single_pub_year(ignore_approximate, method_sym)
189
- result = send(method_sym, date_issued_elements(ignore_approximate))
190
- result ||= send(method_sym, date_created_elements(ignore_approximate))
191
- # dateCaptured for web archive seed records
192
- result || send(method_sym, mods_ng_xml.origin_info.dateCaptured.to_a)
193
- end
194
-
195
- # given the passed date elements, look for a single keyDate and use it if there is one;
196
- # otherwise pick earliest parseable date
197
- # @param [Array<Nokogiri::XML::Element>] date_el_array the elements from which to select a pub date
198
- # @param [Symbol] method_sym method name in DateParsing, as a symbol
199
- # @return [Integer, String] year as a String or Integer, depending on method_sym
200
- def date_parsing_result(date_el_array, method_sym)
201
- return if date_el_array.empty?
202
-
203
- # prefer keyDate
204
- key_date_el = self.class.keyDate(date_el_array)
205
- DateParsing.send(method_sym, key_date_el.content) if key_date_el
206
- end
207
- # temporarily use this technique to mark methods private until we get rid of old date parsing methods below
208
- private :single_pub_year, :date_parsing_result
209
-
210
- class << self
211
- private
212
-
213
- # get earliest parseable year from the passed date elements
214
- # @param [Array<Nokogiri::XML::Element>] date_el_array the elements from which to select a pub date
215
- # @param [Symbol] method_sym method name in DateParsing, as a symbol
216
- # @return [Array<String,Integer>] two values: earliest date and the original element string
217
- # - first is earliest date either as lexically sortable String value or the Integer, depending on method_sym
218
- # - second is the original String value of the chosen element
219
- def earliest_year(date_el_array, method_sym)
220
- poss_results = {}
221
- date_el_array.each { |el|
222
- result = DateParsing.send(method_sym, el.content)
223
- poss_results[result] = el.content if result
224
- }
225
- earliest = poss_results.keys.sort.first if poss_results.present?
226
- return earliest, poss_results[earliest] if earliest
227
- end
228
- end
229
-
230
- # ---- old date parsing methods used downstream of gem; will be deprecated/replaced with new date parsing methods
231
-
232
- def place
233
- term_values([:origin_info, :place, :placeTerm])
234
- end
235
-
236
- # Values for the pub date facet. This is less strict than the 4 year date requirements for pub_date
237
- # Jan 2016: used to populate Solr pub_date field for Spotlight and SearchWorks
238
- # Spotlight: pub_date field should be replaced by pub_year_w_approx_isi and pub_year_no_approx_isi
239
- # SearchWorks: pub_date field used for display in search results and show view; for sorting nearby-on-shelf
240
- # these could be done with more appropriate fields/methods (pub_year_int for sorting; new pub year methods to populate field)
241
- # TODO: probably should deprecate this in favor of pub_year_display_str;
242
- # need head-to-head testing with pub_year_display_str
243
- # @return [String] value for the pub date facet
244
- def pub_date_facet
245
- return nil unless pub_date
246
- return "#{pub_date.to_i + 1000} B.C." if pub_date.start_with?('-')
247
- return pub_date unless pub_date.include? '--'
248
-
249
- "#{pub_date[0, 2].to_i + 1}th century"
250
- end
251
-
252
- # creates a date suitable for sorting. Guaranteed to be 4 digits or nil
253
- # @deprecated use pub_year_int, or pub_year_sort_str if you must have a string (why?)
254
- def pub_date_sort
255
- if pub_date
256
- pd = pub_date
257
- pd = '0' + pd if pd.length == 3
258
- pd = pd.gsub('--', '00')
259
- end
260
- fail "pub_date_sort was about to return a non 4 digit value #{pd}!" if pd && pd.length != 4
261
-
262
- pd
263
- end
264
-
265
- # For the date display only, the first place to look is in the dates without encoding=marc array.
266
- # If no such dates, select the first date in the dates_marc_encoding array. Otherwise return nil
267
- # @return [String] value for the pub_date_display Solr field for this document or nil if none
268
- # @deprecated DO NOT USE: this is no longer used in SW, Revs or Spotlight Jan 2016
269
- def pub_date_display
270
- return dates_no_marc_encoding.first unless dates_no_marc_encoding.empty?
271
- return dates_marc_encoding.first unless dates_marc_encoding.empty?
272
-
273
- nil
274
- end
275
-
276
- # old date parsing protected methods to be deprecated/replaced with new methods (see also DateParsing)
277
-
278
- protected
279
-
280
- # The year the object was published
281
- # @return [String] 4 character year or nil if no valid date was found
282
- def pub_year
283
- # use the cached year if there is one
284
- if @pub_year
285
- return nil if @pub_year == ''
286
-
287
- return @pub_year
288
- end
289
-
290
- dates = pub_dates.map do |f_date|
291
- # remove ? and []
292
- if f_date.length == 4 && f_date.end_with?('?')
293
- f_date.tr('?', '0')
294
- else
295
- f_date.delete('?[]')
296
- end
297
- end
298
-
299
- if dates
300
- # try to find a date starting with the most normal date formats and progressing to more wonky ones
301
- @pub_year = get_plain_four_digit_year(dates) ||
302
- get_u_year(dates) || # Check for years in u notation, e.g., 198u
303
- get_double_digit_century(dates) ||
304
- get_bc_year(dates) ||
305
- get_three_digit_year(dates) ||
306
- get_single_digit_century(dates)
307
- return @pub_year if @pub_year
308
- end
309
- @pub_year = ''
310
- nil
311
- end
312
- alias_method :pub_date, :pub_year
313
-
314
- # For the date indexing, sorting and faceting, the first place to look is in the dates with encoding=marc array.
315
- # If that doesn't exist, look in the dates without encoding=marc array. Otherwise return nil
316
- # @return [Array<String>] values for the date Solr field for this document or nil if none
317
- def pub_dates
318
- return dates_marc_encoding unless dates_marc_encoding.empty?
319
- return dates_no_marc_encoding unless dates_no_marc_encoding.empty?
320
-
321
- nil
322
- end
323
-
324
- # @return [Array<String>] dates from dateIssued and dateCreated tags from origin_info with encoding="marc"
325
- def dates_marc_encoding
326
- @dates_marc_encoding ||= begin
327
- parse_dates_from_originInfo
328
- @dates_marc_encoding
329
- end
330
- end
331
-
332
- # @return [Array<String>] dates from dateIssued and dateCreated tags from origin_info with encoding not "marc"
333
- def dates_no_marc_encoding
334
- @dates_no_marc_encoding ||= begin
335
- parse_dates_from_originInfo
336
- @dates_no_marc_encoding
337
- end
338
- end
339
-
340
- # Populate @dates_marc_encoding and @dates_no_marc_encoding from dateIssued and dateCreated tags from origin_info
341
- # with and without encoding=marc
342
- def parse_dates_from_originInfo
343
- @dates_marc_encoding = []
344
- @dates_no_marc_encoding = []
345
- origin_info.dateIssued.each { |di|
346
- if di.encoding == "marc"
347
- @dates_marc_encoding << di.text
348
- else
349
- @dates_no_marc_encoding << di.text
350
- end
351
- }
352
- origin_info.dateCreated.each { |dc|
353
- if dc.encoding == "marc"
354
- @dates_marc_encoding << dc.text
355
- else
356
- @dates_no_marc_encoding << dc.text
357
- end
358
- }
359
- end
360
-
361
- def is_number?(object)
362
- true if Integer(object) rescue false
363
- end
364
-
365
- def is_date?(object)
366
- true if Date.parse(object) rescue false
367
- end
368
-
369
- # get a 4 digit year like 1865 from array of dates
370
- # @param [Array<String>] dates an array of potential year strings
371
- def get_plain_four_digit_year(dates)
372
- dates.each do |f_date|
373
- matches = f_date.scan(/\d{4}/)
374
- if matches.length == 1
375
- @pub_year = matches.first
376
- else
377
- # when there are multiple matches, check for ones with CE after them
378
- matches.each do |match|
379
- # look for things like '1865-6 CE'
380
- pos = f_date.index(Regexp.new(match + '...CE'))
381
- pos = pos ? pos.to_i : 0
382
- if f_date.include?(match + ' CE') || pos > 0
383
- @pub_year = match
384
- return match
385
- end
386
- end
387
- end
388
- return matches.first
389
- end
390
- nil
391
- end
392
-
393
- # get a 3 digit year like 965 from the date array
394
- # @param [Array<String>] dates an array of potential year strings
395
- def get_three_digit_year(dates)
396
- dates.each do |f_date|
397
- matches = f_date.scan(/\d{3}/)
398
- return matches.first unless matches.empty?
399
- end
400
- nil
401
- end
402
-
403
- # get the 3 digit BC year, return it as a negative, so -700 for 300 BC.
404
- # Other methods will translate it to proper display, this is good for sorting.
405
- # @param [Array<String>] dates an array of potential year strings
406
- def get_bc_year(dates)
407
- dates.each do |f_date|
408
- matches = f_date.scan(/\d{3} B.C./)
409
- unless matches.empty?
410
- bc_year = matches.first[0..2]
411
- return (bc_year.to_i - 1000).to_s
412
- end
413
- end
414
- nil
415
- end
416
-
417
- # get a single digit century like '9th century' from the date array
418
- # @param [Array<String>] dates an array of potential year strings
419
- # @return [String] y-- if we identify century digit in string
420
- def get_single_digit_century(dates)
421
- dates.each do |f_date|
422
- matches = f_date.scan(/\d{1}th/)
423
- next if matches.empty?
424
-
425
- if matches.length == 1
426
- @pub_year = (matches.first[0, 2].to_i - 1).to_s + '--'
427
- return @pub_year
428
- else
429
- # when there are multiple matches, check for ones with CE after them
430
- matches.each do |match|
431
- pos = f_date.index(Regexp.new(match + '...CE'))
432
- pos = pos ? pos.to_i : f_date.index(Regexp.new(match + ' century CE'))
433
- pos = pos ? pos.to_i : 0
434
- if f_date.include?(match + ' CE') || pos > 0
435
- @pub_year = (match[0, 1].to_i - 1).to_s + '--'
436
- return @pub_year
437
- end
438
- end
439
- end
440
- end
441
- nil
442
- end
443
-
444
- # get a double digit century like '12th century' from the date array
445
- # @param [Array<String>] dates an array of potential year strings
446
- # @return [String] yy-- if we identify century digits in string
447
- def get_double_digit_century(dates)
448
- dates.each do |f_date|
449
- matches = f_date.scan(/\d{2}th/)
450
- next if matches.empty?
451
-
452
- if matches.length == 1
453
- @pub_year = (matches.first[0, 2].to_i - 1).to_s + '--'
454
- return @pub_year
455
- else
456
- # when there are multiple matches, check for ones with CE after them
457
- matches.each do |match|
458
- pos = f_date.index(Regexp.new(match + '...CE'))
459
- pos = pos ? pos.to_i : f_date.index(Regexp.new(match + ' century CE'))
460
- pos = pos ? pos.to_i : 0
461
- if f_date.include?(match + ' CE') || pos > 0
462
- @pub_year = (match[0, 2].to_i - 1).to_s + '--'
463
- return @pub_year
464
- end
465
- end
466
- end
467
- end
468
- nil
469
- end
470
-
471
- # If a year has a "u" in it, replace u with 0 for yyyu (becomes yyy0)
472
- # and replace u with '-' for yyuu (becomes yy--)
473
- # @param [Array<String>] dates looking for matches on yyyu or yyuu in these strings
474
- # @return [String, nil] String of format yyy0 or yy--, or nil
475
- def get_u_year(dates)
476
- dates.each do |f_date|
477
- # Single digit u notation
478
- matches = f_date.scan(/\d{3}u/)
479
- return matches.first.tr('u', '0') if matches.length == 1
480
-
481
- # Double digit u notation
482
- matches = f_date.scan(/\d{2}u{2}/)
483
- return matches.first.tr('u', '-') if matches.length == 1
484
- end
485
- nil
486
- end
487
- end # class Record
488
- end
489
- end