stanford-mods 1.3.3 → 1.3.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.rspec +1 -0
- data/.rubocop.yml +4 -0
- data/Gemfile +1 -0
- data/lib/stanford-mods.rb +5 -5
- data/lib/stanford-mods/date_parsing.rb +245 -0
- data/lib/stanford-mods/origin_info.rb +411 -0
- data/lib/stanford-mods/searchworks.rb +23 -474
- data/lib/stanford-mods/searchworks_subjects.rb +208 -0
- data/lib/stanford-mods/version.rb +1 -1
- data/spec/date_parsing_spec.rb +746 -0
- data/spec/fixtures/spotlight_pub_date_data.rb +316 -0
- data/spec/origin_info_spec.rb +449 -0
- data/spec/searchworks_pub_dates_spec.rb +166 -163
- data/spec/spec_helper.rb +16 -5
- data/stanford-mods.gemspec +2 -0
- metadata +25 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8ea1bae95a44c285bf8594fa40a73c2aa0b328d1
|
4
|
+
data.tar.gz: 479f7c52ae3c7c29592a870a819e10e6e1abb692
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b6f535642244577ecfb2f7cc2d4b4291c9a7cb052869543cf989f6cce72cb5b8609e668e4ace238940c2c252b8613e63908b6627aef608b5e716068b52de23f5
|
7
|
+
data.tar.gz: 97db3a6affbc9e74b62432961d712ace6f860cbf77b36137f22287f2e3c4f491d5cbfeeb24a540bd7c48ed21e38e91663d0dd9436e2cc62e7835bc50bd319a11
|
data/.gitignore
CHANGED
data/.rspec
CHANGED
data/.rubocop.yml
CHANGED
data/Gemfile
CHANGED
data/lib/stanford-mods.rb
CHANGED
@@ -1,16 +1,16 @@
|
|
1
|
-
require 'stanford-mods/version'
|
2
1
|
require 'mods'
|
2
|
+
require 'stanford-mods/date_parsing'
|
3
|
+
require 'stanford-mods/geo_spatial'
|
3
4
|
require 'stanford-mods/name'
|
4
|
-
require 'stanford-mods/
|
5
|
+
require 'stanford-mods/origin_info'
|
5
6
|
require 'stanford-mods/physical_location'
|
6
|
-
require 'stanford-mods/
|
7
|
+
require 'stanford-mods/searchworks'
|
8
|
+
require 'stanford-mods/version'
|
7
9
|
|
8
10
|
# Stanford specific wranglings of MODS metadata as an extension of the Mods::Record object
|
9
11
|
module Stanford
|
10
12
|
module Mods
|
11
|
-
|
12
13
|
class Record < ::Mods::Record
|
13
|
-
|
14
14
|
end # Record class
|
15
15
|
end # Mods module
|
16
16
|
end # Stanford module
|
@@ -0,0 +1,245 @@
|
|
1
|
+
require 'date'

module Stanford
  module Mods
    # Parsing date strings
    # TODO:  this should become its own gem and/or become eclipsed by/merged with timetwister gem
    #   When this is "gemified":
    #     - we may want an integer or date sort field as well as lexical
    #     - we could add methods like my_date.bc?
    class DateParsing
      # a square bracket sandwiched between two digits, e.g. 169[5] or [18]91
      # (the constant keeps its historical spelling so existing references still resolve)
      BRACKETS_BETWEEN_DIGITS_REXEXP = Regexp.new('\d[' + Regexp.escape('[]') + ']\d')
      # one or two digits followed, anywhere later on the same line, by the word "century"
      CENTURY_WORD_REGEXP = Regexp.new('(\d{1,2}).*century')
      # one or two digits followed by two placeholder characters: yyuu or yy--
      CENTURY_4CHAR_REGEXP = Regexp.new('(\d{1,2})[u\-]{2}')
      # one to four digits followed, anywhere later on the same line, by "B.C."
      BC_REGEX = Regexp.new('(\d{1,4}).*' + Regexp.escape('B.C.'))
      # the WHOLE string is y, yy, yyy, -y, -yy or -yyy; anchored with \A and \z so that
      # a multi-line value such as "abc\n123" is not mistaken for an early year
      EARLY_NUMERIC = Regexp.new('\A\-?\d{1,3}\z')

      # get single facet value for date, generally an explicit year or "17th century" or "5 B.C."
      # returns '845', not 0845
      # @param [String] date_str String containing a date (we hope)
      # @return [String, nil] String facet value for year if we could parse one, nil otherwise
      def self.facet_string_from_date_str(date_str)
        DateParsing.new(date_str).facet_string_from_date_str
      end

      # get String sortable value year if we can parse date_str to get a year.
      #   SearchWorks currently uses a string field for pub date sorting; thus so does Spotlight.
      #   The values returned must *lexically* sort in chronological order, so the B.C. dates are tricky
      # @param [String] date_str String containing a date (we hope)
      # @return [String, nil] String sortable year if we could parse one, nil otherwise
      #  note that these values must *lexically* sort to create a chronological sort.
      def self.sortable_year_string_from_date_str(date_str)
        DateParsing.new(date_str).sortable_year_string_from_date_str
      end

      # true if the year is between -999 and (current year + 1)
      # @param [String] year_str String containing a date in format: -yyy, -yy, -y, y, yy, yyy, yyyy
      # @return [Boolean] true if the year is between -999 and (current year + 1); false otherwise
      def self.year_str_valid?(year_str)
        # \A and \z anchor the whole string; ^ and $ would accept any single matching line
        return false unless year_str && (year_str.match(/\A\d{1,4}\z/) || year_str.match(/\A-\d{1,3}\z/))
        (-1000 < year_str.to_i) && (year_str.to_i < Date.today.year + 2)
      end

      attr_reader :orig_date_str

      # @param [String, nil] date_str the raw date string to be parsed
      def initialize(date_str)
        # keep a frozen private copy rather than freezing the caller's own object
        @orig_date_str = date_str.is_a?(String) ? date_str.dup.freeze : date_str
      end

      # get single facet value for date, generally an explicit year or "17th century" or "5 B.C."
      # @return [String, nil] String facet value for year if we could parse one, nil otherwise
      def facet_string_from_date_str
        return if orig_date_str.nil?
        return if orig_date_str == '0000-00-00' # shpc collection has these useless dates
        # B.C. first in case there are 4 digits, e.g. 1600 B.C.
        return facet_string_for_bc if orig_date_str.match(BC_REGEX)
        # most date strings have a four digit year
        result = sortable_year_for_yyyy
        # 2 digit year will always be 19xx or 20xx; sortable version will make a good facet string
        result ||= sortable_year_for_yy
        # decades are always 19xx or 20xx; sortable version will make a good facet string
        result ||= sortable_year_for_decade
        # try removing brackets between digits in case we have 169[5] or [18]91
        if result.nil? && orig_date_str.match(BRACKETS_BETWEEN_DIGITS_REXEXP)
          no_brackets = orig_date_str.delete('[]')
          return DateParsing.new(no_brackets).facet_string_from_date_str
        end
        # parsing below this line gives string inapprop for year_str_valid?
        unless self.class.year_str_valid?(result)
          result = facet_string_for_century
          result ||= facet_string_for_early_numeric
        end
        # remove leading 0s from early dates
        result = result.to_i.to_s if result && result.match(/\A\d+\z/)
        result
      end

      # get String sortable value year if we can parse date_str to get a year.
      #   SearchWorks currently uses a string field for pub date sorting; thus so does Spotlight.
      #   The values returned must *lexically* sort in chronological order, so the B.C. dates are tricky
      # @return [String, nil] String sortable year if we could parse one, nil otherwise
      #  note that these values must *lexically* sort to create a chronological sort.
      def sortable_year_string_from_date_str
        return if orig_date_str.nil?
        return if orig_date_str == '0000-00-00' # shpc collection has these useless dates
        # B.C. first in case there are 4 digits, e.g. 1600 B.C.
        return sortable_year_for_bc if orig_date_str.match(BC_REGEX)
        # most date strings have a four digit year
        result = sortable_year_for_yyyy
        result ||= sortable_year_for_yy
        result ||= sortable_year_for_decade
        result ||= sortable_year_for_century
        result ||= sortable_year_for_early_numeric
        # try removing brackets between digits in case we have 169[5] or [18]91
        if result.nil? && orig_date_str.match(BRACKETS_BETWEEN_DIGITS_REXEXP)
          no_brackets = orig_date_str.delete('[]')
          return DateParsing.new(no_brackets).sortable_year_string_from_date_str
        end
        result if self.class.year_str_valid?(result)
      end

      # looks for 4 consecutive digits in orig_date_str and returns first occurrence if found
      # @return [String, nil] 4 digit year (e.g. 1865, 0950) if orig_date_str has yyyy, nil otherwise
      def sortable_year_for_yyyy
        matches = orig_date_str.match(/\d{4}/) if orig_date_str
        matches.to_s if matches
      end

      # returns 4 digit year as String if we have a x/x/yy or x-x-yy pattern
      #   note that these are the only 2 digit year patterns found in our actual date strings in MODS records
      #   we use 20 as century digits unless it is greater than current year:
      #   1/1/15  ->  2015
      #   1/1/25  ->  1925
      # @return [String, nil] 4 digit year (e.g. 1865, 0950) if orig_date_str matches pattern, nil otherwise
      def sortable_year_for_yy
        return unless orig_date_str
        if orig_date_str.match(/\d{1,2}\/\d{1,2}\/\d{2}/)
          date_obj = Date.strptime(orig_date_str, '%m/%d/%y')
        elsif orig_date_str.match(/\d{1,2}-\d{1,2}-\d{2}/)
          date_obj = Date.strptime(orig_date_str, '%m-%d-%y')
        end
        # a parsed date in the future is taken to belong to the previous century
        if date_obj && date_obj > Date.today
          date_obj = Date.new(date_obj.year - 100, date_obj.month, date_obj.mday)
        end
        date_obj.year.to_s if date_obj
      rescue ArgumentError
        nil # explicitly want nil if date won't parse
      end

      # get first year of decade (as String) if we have:  yyyu, yyy-, yyy? or yyyx pattern
      #   note that these are the only decade patterns found in our actual date strings in MODS records
      # @return [String, nil] 4 digit year (e.g. 1860, 1950) if orig_date_str matches pattern, nil otherwise
      def sortable_year_for_decade
        decade_matches = orig_date_str.match(/\d{3}[u\-?x]/) if orig_date_str
        return unless decade_matches
        changed_to_zero = decade_matches.to_s.tr('u\-?x', '0')
        DateParsing.new(changed_to_zero).sortable_year_for_yyyy
      end

      # get first year of century (as String) if we have:  yyuu, yy--, yy--? or xxth century pattern
      #   note that these are the only century patterns found in our actual date strings in MODS records
      # @return [String, nil] yy00 if orig_date_str matches pattern, nil otherwise; also nil if B.C. in pattern
      def sortable_year_for_century
        return unless orig_date_str
        return if orig_date_str.match(/B\.C\./)
        century_matches = orig_date_str.match(CENTURY_4CHAR_REGEXP)
        if century_matches
          digits = century_matches[1]
          return digits + '00' if digits.length == 2
          return '0' + digits + '00' if digits.length == 1
        end
        century_str_matches = orig_date_str.match(CENTURY_WORD_REGEXP)
        if century_str_matches
          # the Nth century starts in year (N - 1)00
          yy = (century_str_matches[1].to_i - 1).to_s
          return yy + '00' if yy.length == 2
          return '0' + yy + '00' if yy.length == 1
        end
        nil
      end

      # get single facet value for century (17th century) if we have:  yyuu, yy--, yy--? or xxth century pattern
      #   note that these are the only century patterns found in our actual date strings in MODS records
      # @return [String, nil] yy(th) Century if orig_date_str matches pattern, nil otherwise; also nil if B.C. in pattern
      def facet_string_for_century
        return unless orig_date_str
        return if orig_date_str.match(/B\.C\./)
        century_str_matches = orig_date_str.match(CENTURY_WORD_REGEXP)
        return century_str_matches.to_s if century_str_matches

        century_matches = orig_date_str.match(CENTURY_4CHAR_REGEXP)
        return unless century_matches
        # loaded lazily: only the yyuu / yy-- path needs Integer#ordinalize
        require 'active_support/core_ext/integer/inflections'
        "#{(century_matches[1].to_i + 1).ordinalize} century"
      end

      # get String sortable value for B.C. if we have B.C. pattern
      #  note that these values must *lexically* sort to create a chronological sort.
      #  We know our data does not contain B.C. dates older than 999, so we can make them
      #  lexically sort by subtracting 1000.  So we get:
      #    -700 for 300 B.C., -750 for 250 B.C., -800 for 200 B.C., -801 for 199 B.C.
      # @return [String, nil] String sortable -ddd if B.C. in pattern; nil otherwise
      def sortable_year_for_bc
        bc_matches = orig_date_str.match(BC_REGEX) if orig_date_str
        (bc_matches[1].to_i - 1000).to_s if bc_matches
      end

      # get single facet value for B.C. if we have B.C. pattern
      # @return [String, nil] ddd B.C. if ddd B.C. in pattern; nil otherwise
      def facet_string_for_bc
        bc_matches = orig_date_str.match(BC_REGEX) if orig_date_str
        bc_matches.to_s if bc_matches
      end

      # get String sortable value from date String containing yyy, yy, y, -y, -yy, -yyy
      #  note that these values must *lexically* sort to create a chronological sort.
      #  We know our data does not contain negative dates older than -999, so we can make them
      #  lexically sort by subtracting 1000.  So we get:
      #    -983 for -17, -999 for -1, 0000 for 0, 0001 for 1, 0017 for 17
      # @return [String, nil] String sortable -ddd if orig_date_str matches pattern; nil otherwise
      def sortable_year_for_early_numeric
        return unless orig_date_str && orig_date_str.match(EARLY_NUMERIC)
        if orig_date_str.start_with?('-')
          # negative number becomes x - 1000 for sorting; -005 for -995
          num = orig_date_str[1..-1].to_i - 1000
          '-' + num.to_s[1..-1].rjust(3, '0')
        else
          orig_date_str.rjust(4, '0')
        end
      end

      # get single facet value for date String containing yyy, yy, y, -y, -yy, -yyy
      #   negative number strings will be changed to B.C. strings
      # @return [String, nil] facet value if orig_date_str matches pattern; nil otherwise
      def facet_string_for_early_numeric
        return unless orig_date_str && orig_date_str.match(EARLY_NUMERIC)
        # negative number becomes B.C.
        return orig_date_str[1..-1] + " B.C." if orig_date_str.start_with?('-')
        # remove leading 0s from early dates
        orig_date_str.to_i.to_s
      end

      # NOTE:  while Date.parse() works for many dates, the *sortable_year_for_yyyy
      #   actually works for nearly all those cases and a lot more besides.  Trial and error
      #   with an extensive set of test data culled from actual date strings in our MODS records
      #   has made this method bogus.
      # @return [String, nil] sortable 4 digit year (e.g. 1865, 0950) if orig_date_str is parseable via ruby Date, nil otherwise
      def year_via_ruby_parsing
        return unless orig_date_str
        return unless orig_date_str.match(/\d\d/) # need at least 2 digits
        # need more in string than only 2 digits
        return if orig_date_str.match(/^\d\d$/) || orig_date_str.match(/^\D*\d\d\D*$/)
        return if orig_date_str.match(/\d\s*B.C./) # skip B.C. dates
        date_obj = Date.parse(orig_date_str)
        date_obj.year.to_s
      rescue ArgumentError
        nil # explicitly want nil if date won't parse
      end
    end
  end
end
|
@@ -0,0 +1,411 @@
|
|
1
|
+
require 'logger'
|
2
|
+
require 'mods'
|
3
|
+
|
4
|
+
# Parsing MODS /originInfo for Publication/Imprint data:
|
5
|
+
# * pub year for date slider facet
|
6
|
+
# * pub year for sorting
|
7
|
+
# * pub year for single facet value
|
8
|
+
# * imprint info for display
|
9
|
+
# *
|
10
|
+
# These methods may be used by searchworks.rb file or by downstream apps
|
11
|
+
module Stanford
  module Mods
    # Publication/imprint date handling for MODS /originInfo, layered onto ::Mods::Record.
    class Record < ::Mods::Record
      # return a single string intended for facet use for pub date
      # prefer dateIssued (any) before dateCreated (any) before dateCaptured (any)
      #  look for a keyDate and use it if there is one;  otherwise pick earliest date
      # @param [Boolean] ignore_approximate true if approximate dates (per qualifier attribute)
      #   should be ignored; false if approximate dates should be included
      # @return [String] single String containing publication year for facet use
      def pub_date_facet_single_value(ignore_approximate = false)
        # prefer dateIssued, then dateCreated
        result = pub_date_best_single_facet_value(date_issued_elements(ignore_approximate))
        result ||= pub_date_best_single_facet_value(date_created_elements(ignore_approximate))
        # dateCaptured for web archive seed records
        result || pub_date_best_single_facet_value(@mods_ng_xml.origin_info.dateCaptured.to_a)
      end

      # return a single string intended for lexical sorting for pub date
      # prefer dateIssued (any) before dateCreated (any) before dateCaptured (any)
      #  look for a keyDate and use it if there is one;  otherwise pick earliest date
      # @param [Boolean] ignore_approximate true if approximate dates (per qualifier attribute)
      #   should be ignored; false if approximate dates should be included
      # @return [String] single String containing publication year for lexical sorting
      #   note that for string sorting  5 B.C. = -5  => -995;  6 B.C. => -994  so 6 B.C. sorts before 5 B.C.
      def pub_date_sortable_string(ignore_approximate = false)
        # prefer dateIssued, then dateCreated
        result = pub_date_best_sort_str_value(date_issued_elements(ignore_approximate))
        result ||= pub_date_best_sort_str_value(date_created_elements(ignore_approximate))
        # dateCaptured for web archive seed records
        result || pub_date_best_sort_str_value(@mods_ng_xml.origin_info.dateCaptured.to_a)
      end

      # given the passed date elements, look for a single keyDate and use it if there is one;
      #    otherwise pick earliest parseable date
      # @param [Array<Nokogiri::XML::Element>] date_el_array the elements from which to select a pub date
      # @return [String] single String containing publication year for facet use
      def pub_date_best_single_facet_value(date_el_array)
        return if date_el_array.empty?
        # prefer keyDate
        key_date_el = self.class.keyDate(date_el_array)
        result = DateParsing.facet_string_from_date_str(key_date_el.content) if key_date_el
        return result if result
        # settle for earliest parseable date
        _ignore, orig_str_to_parse = self.class.earliest_date(date_el_array)
        DateParsing.facet_string_from_date_str(orig_str_to_parse) if orig_str_to_parse
      end

      # given the passed date elements, look for a single keyDate and use it if there is one;
      #    otherwise pick earliest parseable date
      # @param [Array<Nokogiri::XML::Element>] date_el_array the elements from which to select a pub date
      # @return [String] single String containing publication year for lexical sorting
      def pub_date_best_sort_str_value(date_el_array)
        return if date_el_array.empty?
        # prefer keyDate
        key_date_el = self.class.keyDate(date_el_array)
        result = DateParsing.sortable_year_string_from_date_str(key_date_el.content) if key_date_el
        return result if result
        # settle for earliest parseable date
        sortable_str, _ignore = self.class.earliest_date(date_el_array)
        sortable_str
      end

      protected :pub_date_best_single_facet_value, :pub_date_best_sort_str_value

      # return /originInfo/dateCreated elements in MODS records
      # @param [Boolean] ignore_approximate true if approximate dates (per qualifier attribute)
      #   should be excluded; false approximate dates should be included
      # @return [Array<Nokogiri::XML::Element>]
      def date_created_elements(ignore_approximate = false)
        date_created_nodeset = @mods_ng_xml.origin_info.dateCreated
        return self.class.remove_approximate(date_created_nodeset) if ignore_approximate
        date_created_nodeset.to_a
      end

      # return /originInfo/dateIssued elements in MODS records
      # @param [Boolean] ignore_approximate true if approximate dates (per qualifier attribute)
      #   should be excluded; false approximate dates should be included
      # @return [Array<Nokogiri::XML::Element>]
      def date_issued_elements(ignore_approximate = false)
        date_issued_nodeset = @mods_ng_xml.origin_info.dateIssued
        return self.class.remove_approximate(date_issued_nodeset) if ignore_approximate
        date_issued_nodeset.to_a
      end

      # given a set of date elements, return the single element with attribute keyDate="yes"
      #  or return nil if no elements have attribute keyDate="yes", or if multiple elements have keyDate="yes"
      # @param [Array<Nokogiri::XML::Element>] elements Array of date elements
      # @return [Nokogiri::XML::Element, nil] single date element with attribute keyDate="yes", or nil
      def self.keyDate(elements)
        key_dates = elements.select { |node| node["keyDate"] == 'yes' }
        key_dates.first if key_dates.size == 1
      end

      # remove Elements from NodeSet if they have a qualifier attribute of 'approximate' or 'questionable'
      # @param [Nokogiri::XML::NodeSet<Nokogiri::XML::Element>] nodeset set of date elements
      # @return [Array<Nokogiri::XML::Element>] the set of date elements minus any that
      #   had a qualifier attribute of 'approximate' or 'questionable'
      def self.remove_approximate(nodeset)
        nodeset.reject { |node| date_is_approximate?(node) }
      end

      # NOTE:  legal values for MODS date elements with attribute qualifier are
      #   'approximate', 'inferred' or 'questionable'
      # @param [Nokogiri::XML::Element] date_element MODS date element
      # @return [Boolean] true if date_element has a qualifier attribute of "approximate" or "questionable",
      #   false if no qualifier attribute, or if attribute is 'inferred' or some other value
      def self.date_is_approximate?(date_element)
        qualifier = date_element["qualifier"] if date_element.respond_to?('[]')
        qualifier == 'approximate' || qualifier == 'questionable'
      end

      # get earliest parseable date from the passed date elements
      # @param [Array<Nokogiri::XML::Element>] date_el_array the elements from which to select a pub date
      # @return two String values:
      #   the first is the lexically sortable String value of the earliest date;
      #   the second is the original String value of the chosen element
      #   (nil when none of the elements yields a parseable year)
      def self.earliest_date(date_el_array)
        poss_results = {}
        date_el_array.each do |el|
          result = DateParsing.sortable_year_string_from_date_str(el.content)
          poss_results[result] = el.content if result
        end
        # plain Ruby (no ActiveSupport #present? needed): min of an empty key list is nil
        earliest = poss_results.keys.min
        return earliest, poss_results[earliest] if earliest
      end

      # ---- old date parsing methods used downstream of gem;  will be deprecated/replaced with new date parsing methods

      # @return the values of the /originInfo/place/placeTerm elements, as given by term_values
      def place
        term_values([:origin_info, :place, :placeTerm])
      end

      # For the date display only, the first place to look is in the dates without encoding=marc array.
      # If no such dates, select the first date in the dates_marc_encoding array.  Otherwise return nil
      # @return [String] value for the pub_date_display Solr field for this document or nil if none
      def pub_date_display
        return dates_no_marc_encoding.first unless dates_no_marc_encoding.empty?
        return dates_marc_encoding.first unless dates_marc_encoding.empty?
        nil
      end

      # For the date indexing, sorting and faceting, the first place to look is in the dates with encoding=marc array.
      # If that doesn't exist, look in the dates without encoding=marc array.  Otherwise return nil
      # @return [Array<String>] values for the date Solr field for this document or nil if none
      def pub_dates
        return dates_marc_encoding unless dates_marc_encoding.empty?
        return dates_no_marc_encoding unless dates_no_marc_encoding.empty?
        nil
      end

      # Get the publish year from mods
      # @return [String] 4 character year or nil if no valid date was found
      def pub_year
        # use the cached year if there is one; '' is the cached marker for "nothing found"
        if @pub_year
          return nil if @pub_year == ''
          return @pub_year
        end

        dates = pub_dates
        if dates
          # remove ? and []  (a trailing ? on a 4 character value stands for an unknown digit)
          pruned_dates = dates.map do |f_date|
            if f_date.length == 4 && f_date.end_with?('?')
              f_date.tr('?', '0')
            else
              f_date.delete('?[]')
            end
          end
          # try to find a date starting with the most normal date formats and progressing to more wonky ones
          found = get_plain_four_digit_year(pruned_dates) ||
                  get_u_year(pruned_dates) || # years in u notation, e.g., 198u
                  get_double_digit_century(pruned_dates) ||
                  get_bc_year(pruned_dates) ||
                  get_three_digit_year(pruned_dates) ||
                  get_single_digit_century(pruned_dates)
          if found
            @pub_year = found
            return @pub_year
          end
        end
        @pub_year = ''
        nil
      end

      # creates a date suitable for sorting. Guaranteed to be 4 digits or nil
      # @return [String, nil] 4 character sortable value, or nil if there is no pub date
      # @raise [RuntimeError] if the computed value is not 4 characters long
      def pub_date_sort
        pd = pub_date
        if pd
          pd = '0' + pd if pd.length == 3
          pd = pd.gsub('--', '00')
        end
        fail "pub_date_sort was about to return a non 4 digit value #{pd}!" if pd && pd.length != 4
        pd
      end

      # The year the object was published, filtered based on max_pub_date and min_pub_date from the config file
      # @return [String] 4 character year or nil
      def pub_date
        pub_year || nil
      end

      # Values for the pub date facet. This is less strict than the 4 year date requirements for pub_date
      # @return [String, nil] value for the pub date facet, or nil if there is no pub date
      def pub_date_facet
        year = pub_date
        return unless year
        # negative values are B.C. years stored as (year - 1000)
        return (year.to_i + 1000).to_s + ' B.C.' if year.start_with?('-')
        return year unless year.include?('--')
        # yy-- (or y--) denotes a century; 18-- is the 19th century
        century = year[0, 2].to_i + 1
        "#{century}#{ordinal_suffix(century)} century"
      end

      # ----   old date parsing methods will be deprecated/replaced with new date parsing methods

      protected

      # @return [Array<String>] dates from dateIssued and dateCreated tags from origin_info with encoding="marc"
      def dates_marc_encoding
        parse_dates_from_originInfo unless @dates_marc_encoding
        @dates_marc_encoding
      end

      # @return [Array<String>] dates from dateIssued and dateCreated tags from origin_info with encoding not "marc"
      def dates_no_marc_encoding
        parse_dates_from_originInfo unless @dates_no_marc_encoding
        @dates_no_marc_encoding
      end

      # Populate @dates_marc_encoding and @dates_no_marc_encoding from dateIssued and dateCreated tags from origin_info
      # with and without encoding=marc
      def parse_dates_from_originInfo
        @dates_marc_encoding = []
        @dates_no_marc_encoding = []
        collect_dates_by_encoding(origin_info.dateIssued)
        collect_dates_by_encoding(origin_info.dateCreated)
      end

      # append the text of each date node to the marc / non-marc list according to its encoding
      # @param date_nodes the dateIssued or dateCreated nodes to distribute
      def collect_dates_by_encoding(date_nodes)
        date_nodes.each do |date_node|
          target = date_node.encoding == "marc" ? @dates_marc_encoding : @dates_no_marc_encoding
          target << date_node.text
        end
      end

      # @return [Boolean] true if Integer() accepts the object, false if it raises
      def is_number?(object)
        true if Integer(object)
      rescue StandardError
        false
      end

      # @return [Boolean] true if Date.parse accepts the object, false if it raises
      def is_date?(object)
        true if Date.parse(object)
      rescue StandardError
        false
      end

      # English ordinal suffix for a positive integer: 1 -> st, 2 -> nd, 3 -> rd, 11 -> th, 21 -> st
      # @param [Integer] number
      # @return [String] 'st', 'nd', 'rd' or 'th'
      def ordinal_suffix(number)
        return 'th' if (11..13).cover?(number % 100)
        case number % 10
        when 1 then 'st'
        when 2 then 'nd'
        when 3 then 'rd'
        else 'th'
        end
      end

      # true if the given token is followed in date_str by a CE marker:
      #   "<token> CE", "<token>" + any three characters + "CE", or "<token> century CE"
      # A marker at the very start of the string counts as found.
      # @param [String] date_str the date string being examined
      # @param [String] token a digits-only or digits+"th" fragment taken from date_str
      # @return [Boolean]
      def followed_by_ce?(date_str, token)
        return true if date_str.include?(token + ' CE')
        return true if date_str.index(Regexp.new(token + '...CE'))
        !date_str.index(Regexp.new(token + ' century CE')).nil?
      end

      # TODO:  need tests for these methods

      # get a 4 digit year like 1865 from array of dates
      # @param [Array<String>] dates an array of potential year strings
      # @return [String, nil] the 4 digit year, or nil if no date contains one
      def get_plain_four_digit_year(dates)
        dates.each do |f_date|
          matches = f_date.scan(/\d{4}/)
          # keep looking at the remaining dates when this one has no 4 digit run
          next if matches.empty?
          if matches.length == 1
            @pub_year = matches.first
            return matches.first
          end
          # when there are multiple matches, check for ones with CE after them
          matches.each do |match|
            # look for things like '1865-6 CE'
            next unless f_date.include?(match + ' CE') || f_date.index(Regexp.new(match + '...CE'))
            @pub_year = match
            return match
          end
          return matches.first
        end
        nil
      end

      # get a 3 digit year like 965 from the date array
      # @param [Array<String>] dates an array of potential year strings
      # @return [String, nil] the first 3 digit run found, or nil
      def get_three_digit_year(dates)
        dates.each do |f_date|
          matches = f_date.scan(/\d{3}/)
          return matches.first unless matches.empty?
        end
        nil
      end

      # get the 3 digit BC year, return it as a negative, so -700 for 300 BC.
      #  Other methods will translate it to proper display, this is good for sorting.
      # @param [Array<String>] dates an array of potential year strings
      # @return [String, nil] (year - 1000) as a String, or nil
      def get_bc_year(dates)
        dates.each do |f_date|
          matches = f_date.scan(/\d{3} B.C./)
          next if matches.empty?
          bc_year = matches.first[0..2]
          return (bc_year.to_i - 1000).to_s
        end
        nil
      end

      # get a single digit century like '9th century' from the date array
      # @param [Array<String>] dates an array of potential year strings
      # @return [String] y--  if we identify century digit in string
      def get_single_digit_century(dates)
        dates.each do |f_date|
          matches = f_date.scan(/\d{1}th/)
          next if matches.empty?
          if matches.length == 1
            @pub_year = (matches.first[0, 2].to_i - 1).to_s + '--'
            return @pub_year
          end
          # when there are multiple matches, check for ones with CE after them
          matches.each do |match|
            next unless followed_by_ce?(f_date, match)
            @pub_year = (match[0, 1].to_i - 1).to_s + '--'
            return @pub_year
          end
        end
        nil
      end

      # get a double digit century like '12th century' from the date array
      # @param [Array<String>] dates an array of potential year strings
      # @return [String] yy--  if we identify century digits in string
      def get_double_digit_century(dates)
        dates.each do |f_date|
          matches = f_date.scan(/\d{2}th/)
          next if matches.empty?
          if matches.length == 1
            @pub_year = (matches.first[0, 2].to_i - 1).to_s + '--'
            return @pub_year
          end
          # when there are multiple matches, check for ones with CE after them
          matches.each do |match|
            next unless followed_by_ce?(f_date, match)
            @pub_year = (match[0, 2].to_i - 1).to_s + '--'
            return @pub_year
          end
        end
        nil
      end

      # If a year has a "u" in it, replace u with 0 for yyyu (becomes yyy0)
      #   and replace u with '-' for yyuu  (becomes yy--)
      # @param [Array<String>] dates looking for matches on yyyu or yyuu in these strings
      # @return [String, nil] String of format yyy0 or yy--, or nil
      def get_u_year(dates)
        dates.each do |f_date|
          # Single digit u notation
          matches = f_date.scan(/\d{3}u/)
          return matches.first.tr('u', '0') if matches.length == 1
          # Double digit u notation
          matches = f_date.scan(/\d{2}u{2}/)
          return matches.first.tr('u', '-') if matches.length == 1
        end
        nil
      end
    end # class Record
  end
end
|