stanford-mods 1.3.3 → 1.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.rspec +1 -0
- data/.rubocop.yml +4 -0
- data/Gemfile +1 -0
- data/lib/stanford-mods.rb +5 -5
- data/lib/stanford-mods/date_parsing.rb +245 -0
- data/lib/stanford-mods/origin_info.rb +411 -0
- data/lib/stanford-mods/searchworks.rb +23 -474
- data/lib/stanford-mods/searchworks_subjects.rb +208 -0
- data/lib/stanford-mods/version.rb +1 -1
- data/spec/date_parsing_spec.rb +746 -0
- data/spec/fixtures/spotlight_pub_date_data.rb +316 -0
- data/spec/origin_info_spec.rb +449 -0
- data/spec/searchworks_pub_dates_spec.rb +166 -163
- data/spec/spec_helper.rb +16 -5
- data/stanford-mods.gemspec +2 -0
- metadata +25 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8ea1bae95a44c285bf8594fa40a73c2aa0b328d1
|
4
|
+
data.tar.gz: 479f7c52ae3c7c29592a870a819e10e6e1abb692
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b6f535642244577ecfb2f7cc2d4b4291c9a7cb052869543cf989f6cce72cb5b8609e668e4ace238940c2c252b8613e63908b6627aef608b5e716068b52de23f5
|
7
|
+
data.tar.gz: 97db3a6affbc9e74b62432961d712ace6f860cbf77b36137f22287f2e3c4f491d5cbfeeb24a540bd7c48ed21e38e91663d0dd9436e2cc62e7835bc50bd319a11
|
data/.gitignore
CHANGED
data/.rspec
CHANGED
data/.rubocop.yml
CHANGED
data/Gemfile
CHANGED
data/lib/stanford-mods.rb
CHANGED
@@ -1,16 +1,16 @@
|
|
1
|
-
require 'stanford-mods/version'
|
2
1
|
require 'mods'
|
2
|
+
require 'stanford-mods/date_parsing'
|
3
|
+
require 'stanford-mods/geo_spatial'
|
3
4
|
require 'stanford-mods/name'
|
4
|
-
require 'stanford-mods/
|
5
|
+
require 'stanford-mods/origin_info'
|
5
6
|
require 'stanford-mods/physical_location'
|
6
|
-
require 'stanford-mods/
|
7
|
+
require 'stanford-mods/searchworks'
|
8
|
+
require 'stanford-mods/version'
|
7
9
|
|
8
10
|
# Stanford specific wranglings of MODS metadata as an extension of the Mods::Record object
|
9
11
|
module Stanford
|
10
12
|
module Mods
|
11
|
-
|
12
13
|
class Record < ::Mods::Record
|
13
|
-
|
14
14
|
end # Record class
|
15
15
|
end # Mods module
|
16
16
|
end # Stanford module
|
@@ -0,0 +1,245 @@
|
|
1
|
+
require 'date'

module Stanford
  module Mods
    # Parses free-form date strings (as found in MODS records) into either
    #  - a single facet value: an explicit year, "17th century" or "5 B.C."
    #  - a String year whose *lexical* sort order is chronological
    #
    # TODO: this should become its own gem and/or become eclipsed by/merged with timetwister gem
    #   When this is "gemified":
    #   - we may want an integer or date sort field as well as lexical
    #   - we could add methods like my_date.bc?
    class DateParsing
      # a square bracket sitting between two digits, e.g. 169[5] or [18]91
      # (the misspelling in the constant name is kept because the name is public)
      BRACKETS_BETWEEN_DIGITS_REXEXP = Regexp.new('\d[' + Regexp.escape('[]') + ']\d')
      # "17th century" and friends; capture 1 is the century number
      CENTURY_WORD_REGEXP = Regexp.new('(\d{1,2}).*century')
      # yyuu or yy--; capture 1 is the leading digit(s)
      CENTURY_4CHAR_REGEXP = Regexp.new('(\d{1,2})[u\-]{2}')
      # up to four digits followed (anywhere later) by the literal "B.C."; capture 1 is the digits
      BC_REGEX = Regexp.new('(\d{1,4}).*' + Regexp.escape('B.C.'))
      # the whole string is y, yy, yyy, -y, -yy or -yyy
      # (anchored with \A and \z so one numeric line inside a multi-line string does not qualify)
      EARLY_NUMERIC = Regexp.new('\A\-?\d{1,3}\z')

      # get single facet value for date, generally an explicit year or "17th century" or "5 B.C."
      # returns '845', not 0845
      # @param [String] date_str String containing a date (we hope)
      # @return [String, nil] String facet value for year if we could parse one, nil otherwise
      def self.facet_string_from_date_str(date_str)
        new(date_str).facet_string_from_date_str
      end

      # get String sortable value year if we can parse date_str to get a year.
      # SearchWorks currently uses a string field for pub date sorting; thus so does Spotlight.
      # The values returned must *lexically* sort in chronological order, so the B.C. dates are tricky
      # @param [String] date_str String containing a date (we hope)
      # @return [String, nil] String sortable year if we could parse one, nil otherwise
      def self.sortable_year_string_from_date_str(date_str)
        new(date_str).sortable_year_string_from_date_str
      end

      # true if the year is between -999 and (current year + 1)
      # @param [String] year_str String containing a date in format: -yyy, -yy, -y, y, yy, yyy, yyyy
      # @return [Boolean] true if the year is between -999 and (current year + 1); false otherwise
      def self.year_str_valid?(year_str)
        # \A and \z: the *whole* string must be the year, not merely one line of it
        return false unless year_str && (year_str =~ /\A\d{1,4}\z/ || year_str =~ /\A-\d{1,3}\z/)
        year = year_str.to_i
        year > -1000 && year < Date.today.year + 2
      end

      attr_reader :orig_date_str

      # @param [String, nil] date_str String containing a date (we hope)
      def initialize(date_str)
        # keep a frozen private copy so the caller's own String is not frozen as a side effect
        @orig_date_str = date_str && date_str.dup.freeze
      end

      # get single facet value for date, generally an explicit year or "17th century" or "5 B.C."
      # @return [String, nil] String facet value for year if we could parse one, nil otherwise
      def facet_string_from_date_str
        return unless orig_date_str
        return if orig_date_str == '0000-00-00' # shpc collection has these useless dates
        # B.C. first in case there are 4 digits, e.g. 1600 B.C.
        return facet_string_for_bc if orig_date_str =~ BC_REGEX
        # most date strings have a four digit year;
        # 2 digit years and decades are always 19xx or 20xx, so the sortable version makes a good facet string
        result = sortable_year_for_yyyy || sortable_year_for_yy || sortable_year_for_decade
        # try removing brackets between digits in case we have 169[5] or [18]91
        if result.nil? && orig_date_str =~ BRACKETS_BETWEEN_DIGITS_REXEXP
          return DateParsing.new(orig_date_str.delete('[]')).facet_string_from_date_str
        end
        # parsing below this line gives string inapprop for year_str_valid?
        unless self.class.year_str_valid?(result)
          result = facet_string_for_century || facet_string_for_early_numeric
        end
        # remove leading 0s from early dates
        result = result.to_i.to_s if result && result =~ /\A\d+\z/
        result
      end

      # get String sortable value year if we can parse date_str to get a year.
      # SearchWorks currently uses a string field for pub date sorting; thus so does Spotlight.
      # The values returned must *lexically* sort in chronological order, so the B.C. dates are tricky
      # @return [String, nil] String sortable year if we could parse one, nil otherwise
      def sortable_year_string_from_date_str
        return unless orig_date_str
        return if orig_date_str == '0000-00-00' # shpc collection has these useless dates
        # B.C. first in case there are 4 digits, e.g. 1600 B.C.
        return sortable_year_for_bc if orig_date_str =~ BC_REGEX
        # most date strings have a four digit year, so that is tried first
        result = sortable_year_for_yyyy ||
                 sortable_year_for_yy ||
                 sortable_year_for_decade ||
                 sortable_year_for_century ||
                 sortable_year_for_early_numeric
        # try removing brackets between digits in case we have 169[5] or [18]91
        if result.nil? && orig_date_str =~ BRACKETS_BETWEEN_DIGITS_REXEXP
          return DateParsing.new(orig_date_str.delete('[]')).sortable_year_string_from_date_str
        end
        result if self.class.year_str_valid?(result)
      end

      # looks for 4 consecutive digits in orig_date_str and returns first occurrence if found
      # @return [String, nil] 4 digit year (e.g. 1865, 0950) if orig_date_str has yyyy, nil otherwise
      def sortable_year_for_yyyy
        four_digits = orig_date_str.match(/\d{4}/) if orig_date_str
        four_digits.to_s if four_digits
      end

      # returns 4 digit year as String if we have a x/x/yy or x-x-yy pattern
      #   note that these are the only 2 digit year patterns found in our actual date strings in MODS records
      # a parsed date that lies in the future is moved back one century:
      #   1/1/15  ->  2015
      #   1/1/25  ->  1925   (for as long as 2025-01-01 is still in the future)
      # @return [String, nil] 4 digit year (e.g. 1865, 0950) if orig_date_str matches pattern, nil otherwise
      def sortable_year_for_yy
        return unless orig_date_str
        pattern =
          if orig_date_str =~ %r{\d{1,2}/\d{1,2}/\d{2}}
            '%m/%d/%y'
          elsif orig_date_str =~ /\d{1,2}-\d{1,2}-\d{2}/
            '%m-%d-%y'
          end
        return unless pattern
        # strptime must consume the whole string; anything extra raises ArgumentError (=> nil below)
        parsed = Date.strptime(orig_date_str, pattern)
        year = parsed.year
        year -= 100 if parsed > Date.today
        year.to_s
      rescue ArgumentError
        nil # explicitly want nil if date won't parse
      end

      # get first year of decade (as String) if we have:  yyyu, yyy-, yyy? or yyyx pattern
      #   note that these are the only decade patterns found in our actual date strings in MODS records
      # @return [String, nil] 4 digit year (e.g. 1860, 1950) if orig_date_str matches pattern, nil otherwise
      def sortable_year_for_decade
        decade = orig_date_str.match(/\d{3}[u\-?x]/) if orig_date_str
        # the match is three digits plus one placeholder, so swapping the placeholder gives yyy0
        decade.to_s.tr('u\-?x', '0') if decade
      end

      # get first year of century (as String) if we have:  yyuu, yy--, yy--? or xxth century pattern
      #   note that these are the only century patterns found in our actual date strings in MODS records
      # @return [String, nil] yy00 if orig_date_str matches pattern, nil otherwise; also nil if B.C. in pattern
      def sortable_year_for_century
        return unless orig_date_str
        return if orig_date_str =~ /B\.C\./
        digits = orig_date_str.match(CENTURY_4CHAR_REGEXP)
        return digits[1].rjust(2, '0') + '00' if digits
        words = orig_date_str.match(CENTURY_WORD_REGEXP)
        # the 17th century starts in 1600, hence the - 1
        (words[1].to_i - 1).to_s.rjust(2, '0') + '00' if words
      end

      # get single facet value for century (17th century) if we have:  yyuu, yy--, yy--? or xxth century pattern
      #   note that these are the only century patterns found in our actual date strings in MODS records
      # @return [String, nil] yy(th) Century if orig_date_str matches pattern, nil otherwise; also nil if B.C. in pattern
      def facet_string_for_century
        return unless orig_date_str
        return if orig_date_str =~ /B\.C\./
        words = orig_date_str.match(CENTURY_WORD_REGEXP)
        return words.to_s if words

        digits = orig_date_str.match(CENTURY_4CHAR_REGEXP)
        return unless digits
        # loaded lazily: only this yyuu / yy-- branch needs ActiveSupport's Integer#ordinalize
        require 'active_support/core_ext/integer/inflections'
        "#{(digits[1].to_i + 1).ordinalize} century"
      end

      # get String sortable value for B.C. if we have B.C. pattern
      #  note that these values must *lexically* sort to create a chronological sort.
      #  We know our data does not contain B.C. dates older than 999, so we can make them
      #  lexically sort by subtracting 1000.  So we get:
      #    -700 for 300 B.C., -750 for 250 B.C., -800 for 200 B.C., -801 for 199 B.C.
      #  The result is zero padded to three digits (-005 for 995 B.C.), matching
      #  sortable_year_for_early_numeric, so that 901-999 B.C. also sort correctly.
      # NOTE(review): 1000 B.C. and older produce a non-negative String here (e.g. "600" for 1600 B.C.);
      #   that is left as it was -- confirm whether such dates should yield nil instead
      # @return [String, nil] String sortable -ddd if B.C. in pattern; nil otherwise
      def sortable_year_for_bc
        bc = orig_date_str.match(BC_REGEX) if orig_date_str
        return unless bc
        shifted = bc[1].to_i - 1000
        return shifted.to_s unless shifted < 0
        format('-%03d', -shifted)
      end

      # get single facet value for B.C. if we have B.C. pattern
      # @return [String, nil] ddd B.C. if ddd B.C. in pattern; nil otherwise
      def facet_string_for_bc
        bc = orig_date_str.match(BC_REGEX) if orig_date_str
        bc.to_s if bc
      end

      # get String sortable value from date String containing yyy, yy, y, -y, -yy, -yyy
      #  note that these values must *lexically* sort to create a chronological sort.
      #  We know our data does not contain negative dates older than -999, so we can make them
      #  lexically sort by subtracting 1000.  So we get:
      #    -983 for -17, -999 for -1, 0000 for 0, 0001 for 1, 0017 for 17
      # @return [String, nil] String sortable -ddd if orig_date_str matches pattern; nil otherwise
      def sortable_year_for_early_numeric
        return unless orig_date_str && orig_date_str =~ EARLY_NUMERIC
        return orig_date_str.rjust(4, '0') unless orig_date_str.start_with?('-')
        # negative number becomes x - 1000 for sorting; -005 for -995
        format('-%03d', 1000 - orig_date_str[1..-1].to_i)
      end

      # get single facet value for date String containing yyy, yy, y, -y, -yy, -yyy
      #   negative number strings will be changed to B.C. strings
      # @return [String, nil] facet value if orig_date_str matches pattern; nil otherwise
      def facet_string_for_early_numeric
        return unless orig_date_str && orig_date_str =~ EARLY_NUMERIC
        # negative number becomes B.C.
        return orig_date_str[1..-1] + " B.C." if orig_date_str.start_with?('-')
        # remove leading 0s from early dates
        orig_date_str.to_i.to_s
      end

      # NOTE:  while Date.parse() works for many dates, the *sortable_year_for_yyyy
      #   actually works for nearly all those cases and a lot more besides.  Trial and error
      #   with an extensive set of test data culled from actual date strings in our MODS records
      #   has made this method bogus.
      # @return [String, nil] sortable 4 digit year (e.g. 1865, 0950) if orig_date_str is parseable via ruby Date, nil otherwise
      def year_via_ruby_parsing
        return unless orig_date_str && orig_date_str =~ /\d\d/ # need at least 2 digits
        # need more in string than only 2 digits
        return if orig_date_str =~ /^\d\d$/ || orig_date_str =~ /^\D*\d\d\D*$/
        return if orig_date_str =~ /\d\s*B\.C\./ # skip B.C. dates
        Date.parse(orig_date_str).year.to_s
      rescue ArgumentError
        nil # explicitly want nil if date won't parse
      end
    end
  end
end
|
@@ -0,0 +1,411 @@
|
|
1
|
+
require 'logger'
|
2
|
+
require 'mods'
|
3
|
+
|
4
|
+
# Parsing MODS /originInfo for Publication/Imprint data:
|
5
|
+
# * pub year for date slider facet
|
6
|
+
# * pub year for sorting
|
7
|
+
# * pub year for single facet value
|
8
|
+
# * imprint info for display
|
9
|
+
# *
|
10
|
+
# These methods may be used by searchworks.rb file or by downstream apps
|
11
|
+
module Stanford
  module Mods
    # MODS /originInfo handling for publication / imprint data, added to the Record class:
    # pub year values for faceting and for lexical sorting, plus the older
    # pub_date* methods still used downstream.
    class Record < ::Mods::Record
      # return a single string intended for facet use for pub date
      # prefer dateIssued (any) before dateCreated (any) before dateCaptured (any)
      #  look for a keyDate and use it if there is one;  otherwise pick earliest date
      # @param [Boolean] ignore_approximate true if approximate dates (per qualifier attribute)
      #   should be ignored; false if approximate dates should be included
      # @return [String] single String containing publication year for facet use
      def pub_date_facet_single_value(ignore_approximate = false)
        pub_date_best_single_facet_value(date_issued_elements(ignore_approximate)) ||
          pub_date_best_single_facet_value(date_created_elements(ignore_approximate)) ||
          # dateCaptured for web archive seed records
          pub_date_best_single_facet_value(@mods_ng_xml.origin_info.dateCaptured.to_a)
      end

      # return a single string intended for lexical sorting for pub date
      # prefer dateIssued (any) before dateCreated (any) before dateCaptured (any)
      #  look for a keyDate and use it if there is one;  otherwise pick earliest date
      # @param [Boolean] ignore_approximate true if approximate dates (per qualifier attribute)
      #   should be ignored; false if approximate dates should be included
      # @return [String] single String containing publication year for lexical sorting
      #   note that for string sorting  5 B.C. = -5  => -995;  6 B.C. => -994  so 6 B.C. sorts before 5 B.C.
      def pub_date_sortable_string(ignore_approximate = false)
        pub_date_best_sort_str_value(date_issued_elements(ignore_approximate)) ||
          pub_date_best_sort_str_value(date_created_elements(ignore_approximate)) ||
          # dateCaptured for web archive seed records
          pub_date_best_sort_str_value(@mods_ng_xml.origin_info.dateCaptured.to_a)
      end

      # given the passed date elements, look for a single keyDate and use it if there is one;
      #    otherwise pick earliest parseable date
      # @param [Array<Nokogiri::XML::Element>] date_el_array the elements from which to select a pub date
      # @return [String] single String containing publication year for facet use
      def pub_date_best_single_facet_value(date_el_array)
        return if date_el_array.empty?
        # prefer keyDate
        key_date_el = self.class.keyDate(date_el_array)
        from_key_date = DateParsing.facet_string_from_date_str(key_date_el.content) if key_date_el
        return from_key_date if from_key_date
        # settle for earliest parseable date
        _ignore, orig_str_to_parse = self.class.earliest_date(date_el_array)
        DateParsing.facet_string_from_date_str(orig_str_to_parse) if orig_str_to_parse
      end

      # given the passed date elements, look for a single keyDate and use it if there is one;
      #    otherwise pick earliest parseable date
      # @param [Array<Nokogiri::XML::Element>] date_el_array the elements from which to select a pub date
      # @return [String] single String containing publication year for lexical sorting
      def pub_date_best_sort_str_value(date_el_array)
        return if date_el_array.empty?
        # prefer keyDate
        key_date_el = self.class.keyDate(date_el_array)
        from_key_date = DateParsing.sortable_year_string_from_date_str(key_date_el.content) if key_date_el
        return from_key_date if from_key_date
        # settle for earliest parseable date
        sortable_str, _ignore = self.class.earliest_date(date_el_array)
        sortable_str
      end

      protected :pub_date_best_single_facet_value, :pub_date_best_sort_str_value

      # return /originInfo/dateCreated elements in MODS records
      # @param [Boolean] ignore_approximate true if approximate dates (per qualifier attribute)
      #   should be excluded; false if approximate dates should be included
      # @return [Array<Nokogiri::XML::Element>]
      def date_created_elements(ignore_approximate = false)
        date_created_nodeset = @mods_ng_xml.origin_info.dateCreated
        return self.class.remove_approximate(date_created_nodeset) if ignore_approximate
        date_created_nodeset.to_a
      end

      # return /originInfo/dateIssued elements in MODS records
      # @param [Boolean] ignore_approximate true if approximate dates (per qualifier attribute)
      #   should be excluded; false if approximate dates should be included
      # @return [Array<Nokogiri::XML::Element>]
      def date_issued_elements(ignore_approximate = false)
        date_issued_nodeset = @mods_ng_xml.origin_info.dateIssued
        return self.class.remove_approximate(date_issued_nodeset) if ignore_approximate
        date_issued_nodeset.to_a
      end

      # given a set of date elements, return the single element with attribute keyDate="yes"
      #  or return nil if no elements have attribute keyDate="yes", or if multiple elements have keyDate="yes"
      # @param [Array<Nokogiri::XML::Element>] elements Array of date elements
      # @return [Nokogiri::XML::Element, nil] single date element with attribute keyDate="yes", or nil
      def self.keyDate(elements)
        key_dates = elements.select { |node| node["keyDate"] == 'yes' }
        key_dates.first if key_dates.size == 1
      end

      # remove Elements from NodeSet if they have a qualifier attribute of 'approximate' or 'questionable'
      # @param [Nokogiri::XML::NodeSet<Nokogiri::XML::Element>] nodeset set of date elements
      # @return [Array<Nokogiri::XML::Element>] the set of date elements minus any that
      #   had a qualifier attribute of 'approximate' or 'questionable'
      def self.remove_approximate(nodeset)
        nodeset.reject { |node| date_is_approximate?(node) }
      end

      # NOTE: legal values for MODS date elements with attribute qualifier are
      #   'approximate', 'inferred' or 'questionable'
      # @param [Nokogiri::XML::Element] date_element MODS date element
      # @return [Boolean] true if date_element has a qualifier attribute of "approximate" or "questionable",
      #   false if no qualifier attribute, or if attribute is 'inferred' or some other value
      def self.date_is_approximate?(date_element)
        qualifier = date_element["qualifier"] if date_element.respond_to?('[]')
        qualifier == 'approximate' || qualifier == 'questionable'
      end

      # get earliest parseable date from the passed date elements
      # @param [Array<Nokogiri::XML::Element>] date_el_array the elements from which to select a pub date
      # @return [Array<String>, nil] two String values, or nil if no element could be parsed:
      #   the first is the lexically sortable String value of the earliest date;
      #   the second is the original String value of the chosen element
      def self.earliest_date(date_el_array)
        # sortable year => original string; a later element with the same year replaces an earlier one
        candidates = {}
        date_el_array.each do |el|
          sortable = DateParsing.sortable_year_string_from_date_str(el.content)
          candidates[sortable] = el.content if sortable
        end
        # the sortable strings are built to sort lexically, so the smallest key is the earliest date
        earliest = candidates.keys.min
        [earliest, candidates[earliest]] if earliest
      end

      # ---- old date parsing methods used downstream of gem;  will be deprecated/replaced with new date parsing methods

      # @return the term values for originInfo/place/placeTerm, as returned by term_values
      def place
        term_values([:origin_info, :place, :placeTerm])
      end

      # For the date display only, the first place to look is in the dates without encoding=marc array.
      # If no such dates, select the first date in the dates_marc_encoding array.  Otherwise return nil
      # @return [String] value for the pub_date_display Solr field for this document or nil if none
      def pub_date_display
        return dates_no_marc_encoding.first unless dates_no_marc_encoding.empty?
        return dates_marc_encoding.first unless dates_marc_encoding.empty?
        nil
      end

      # For the date indexing, sorting and faceting, the first place to look is in the dates with encoding=marc array.
      # If that doesn't exist, look in the dates without encoding=marc array.  Otherwise return nil
      # @return [Array<String>] values for the date Solr field for this document or nil if none
      def pub_dates
        return dates_marc_encoding unless dates_marc_encoding.empty?
        return dates_no_marc_encoding unless dates_no_marc_encoding.empty?
        nil
      end

      # Get the publish year from mods
      # @return [String] 4 character year or nil if no valid date was found
      def pub_year
        # use the cached year if there is one; '' is the cached marker for "looked, found nothing"
        if @pub_year
          return nil if @pub_year == ''
          return @pub_year
        end

        dates = pub_dates
        if dates
          # remove ? and []  (a trailing ? on a 4 character value marks a decade:  195? -> 1950)
          pruned_dates = dates.map do |f_date|
            if f_date.length == 4 && f_date.end_with?('?')
              f_date.tr('?', '0')
            else
              f_date.delete('?[]')
            end
          end
          # try to find a date starting with the most normal date formats and progressing to more wonky ones
          @pub_year = get_plain_four_digit_year(pruned_dates) ||
                      get_u_year(pruned_dates) || # years in u notation, e.g., 198u
                      get_double_digit_century(pruned_dates) ||
                      get_bc_year(pruned_dates) ||
                      get_three_digit_year(pruned_dates) ||
                      get_single_digit_century(pruned_dates)
          return @pub_year if @pub_year
        end
        @pub_year = ''
        nil
      end

      # creates a date suitable for sorting. Guaranteed to be 4 digits or nil
      # @return [String, nil] 4 character String, or nil if there is no pub_date
      # @raise [RuntimeError] if the computed value is not 4 characters long
      def pub_date_sort
        pd = pub_date
        if pd
          pd = '0' + pd if pd.length == 3
          pd = pd.gsub('--', '00')
        end
        fail "pub_date_sort was about to return a non 4 digit value #{pd}!" if pd && pd.length != 4
        pd
      end

      # The year the object was published; currently simply the value of pub_year
      # @return [String] 4 character year or nil
      def pub_date
        pub_year
      end

      # Values for the pub date facet. This is less strict than the 4 year date requirements for pub_date
      # @return [String, nil] value for the pub date facet: a year, "ddd B.C." or "ddth century"
      def pub_date_facet
        year = pub_date
        return unless year
        # negative values are B.C. years shifted by 1000 for sorting (-700 is 300 B.C.)
        return (year.to_i + 1000).to_s + ' B.C.' if year.start_with?('-')
        return year unless year.include?('--')
        # yy-- is a century: 18-- is the 19th century
        (year[0, 2].to_i + 1).to_s + 'th century'
      end

      # ----   old date parsing methods will be deprecated/replaced with new date parsing methods

      protected

      # @return [Array<String>] dates from dateIssued and dateCreated tags from origin_info with encoding="marc"
      def dates_marc_encoding
        parse_dates_from_originInfo unless @dates_marc_encoding
        @dates_marc_encoding
      end

      # @return [Array<String>] dates from dateIssued and dateCreated tags from origin_info with encoding not "marc"
      def dates_no_marc_encoding
        parse_dates_from_originInfo unless @dates_no_marc_encoding
        @dates_no_marc_encoding
      end

      # Populate @dates_marc_encoding and @dates_no_marc_encoding from dateIssued and dateCreated tags from origin_info
      #  with and without encoding=marc
      def parse_dates_from_originInfo
        @dates_marc_encoding = []
        @dates_no_marc_encoding = []
        # dateIssued values are collected before dateCreated values
        [self.origin_info.dateIssued, self.origin_info.dateCreated].each do |date_nodes|
          date_nodes.each do |date_node|
            bucket = date_node.encoding == "marc" ? @dates_marc_encoding : @dates_no_marc_encoding
            bucket << date_node.text
          end
        end
      end

      # @return [Boolean] true if Integer() accepts the object, false if it raises
      def is_number?(object)
        Integer(object)
        true
      rescue StandardError
        false
      end

      # @return [Boolean] true if Date.parse accepts the object, false if it raises
      def is_date?(object)
        Date.parse(object)
        true
      rescue StandardError
        false
      end

      # true if match is flagged as a CE value within f_date: either directly followed by ' CE'
      # or followed by one of the passed regexp suffixes anywhere in the string (index 0 included)
      # @param [String] f_date the date string being examined
      # @param [String] match the piece of f_date (year or century) to check
      # @param [Array<String>] suffix_patterns regexp source appended to match, e.g. '...CE'
      # @return [Boolean]
      def followed_by_ce?(f_date, match, suffix_patterns)
        return true if f_date.include?(match + ' CE')
        suffix_patterns.any? { |suffix| f_date =~ Regexp.new(match + suffix) }
      end

      # TODO:  need tests for these methods

      # get a 4 digit year like 1865 from array of dates
      # @param [Array<String>] dates an array of potential year strings
      # @return [String, nil] the year from the first date that contains 4 consecutive digits, or nil
      def get_plain_four_digit_year(dates)
        dates.each do |f_date|
          matches = f_date.scan(/\d{4}/)
          # no year in this date; keep looking at the remaining dates
          next if matches.empty?
          if matches.length == 1
            @pub_year = matches.first
            return matches.first
          end
          # when there are multiple matches, check for ones with CE after them
          matches.each do |match|
            # look for things like '1865-6 CE'
            next unless followed_by_ce?(f_date, match, ['...CE'])
            @pub_year = match
            return match
          end
          return matches.first
        end
        nil
      end

      # get a 3 digit year like 965 from the date array
      # @param [Array<String>] dates an array of potential year strings
      # @return [String, nil] the first run of 3 digits found, or nil
      def get_three_digit_year(dates)
        dates.each do |f_date|
          matches = f_date.scan(/\d{3}/)
          return matches.first unless matches.empty?
        end
        nil
      end

      # get the 3 digit BC year, return it as a negative, so -700 for 300 BC.
      #  Other methods will translate it to proper display, this is good for sorting.
      # @param [Array<String>] dates an array of potential year strings
      # @return [String, nil] negative sortable year, or nil if no "ddd B.C." was found
      def get_bc_year(dates)
        dates.each do |f_date|
          matches = f_date.scan(/\d{3} B\.C\./)
          next if matches.empty?
          bc_year = matches.first[0..2]
          return (bc_year.to_i - 1000).to_s
        end
        nil
      end

      # get a single digit century like '9th century' from the date array
      # @param [Array<String>] dates an array of potential year strings
      # @return [String] y--  if we identify century digit in string
      def get_single_digit_century(dates)
        dates.each do |f_date|
          matches = f_date.scan(/\d{1}th/)
          next if matches.empty?
          if matches.length == 1
            @pub_year = (matches.first[0, 2].to_i - 1).to_s + '--'
            return @pub_year
          end
          # when there are multiple matches, check for ones with CE after them
          matches.each do |match|
            next unless followed_by_ce?(f_date, match, ['...CE', ' century CE'])
            @pub_year = (match[0, 1].to_i - 1).to_s + '--'
            return @pub_year
          end
        end
        nil
      end

      # get a double digit century like '12th century' from the date array
      # @param [Array<String>] dates an array of potential year strings
      # @return [String] yy--  if we identify century digits in string
      def get_double_digit_century(dates)
        dates.each do |f_date|
          matches = f_date.scan(/\d{2}th/)
          next if matches.empty?
          if matches.length == 1
            @pub_year = (matches.first[0, 2].to_i - 1).to_s + '--'
            return @pub_year
          end
          # when there are multiple matches, check for ones with CE after them
          matches.each do |match|
            next unless followed_by_ce?(f_date, match, ['...CE', ' century CE'])
            @pub_year = (match[0, 2].to_i - 1).to_s + '--'
            return @pub_year
          end
        end
        nil
      end

      # If a year has a "u" in it, replace u with 0 for yyyu (becomes yyy0)
      #   and replace u with '-' for yyuu  (becomes yy--)
      # @param [Array<String>] dates looking for matches on yyyu or yyuu in these strings
      # @return [String, nil] String of format yyy0 or yy--, or nil
      def get_u_year(dates)
        dates.each do |f_date|
          # Single digit u notation
          matches = f_date.scan(/\d{3}u/)
          return matches.first.tr('u', '0') if matches.length == 1
          # Double digit u notation
          matches = f_date.scan(/\d{2}u{2}/)
          return matches.first.tr('u', '-') if matches.length == 1
        end
        nil
      end
    end # class Record
  end
end
|