stanford-mods 2.6.4 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +1 -1
  3. data/lib/stanford-mods/{geo_spatial.rb → concerns/geo_spatial.rb} +3 -5
  4. data/lib/stanford-mods/concerns/name.rb +57 -0
  5. data/lib/stanford-mods/concerns/origin_info.rb +113 -0
  6. data/lib/stanford-mods/{physical_location.rb → concerns/physical_location.rb} +2 -2
  7. data/lib/stanford-mods/concerns/searchworks.rb +125 -0
  8. data/lib/stanford-mods/concerns/searchworks_subjects.rb +126 -0
  9. data/lib/stanford-mods/concerns/title.rb +87 -0
  10. data/lib/stanford-mods/coordinate.rb +24 -3
  11. data/lib/stanford-mods/date_parsing.rb +32 -289
  12. data/lib/stanford-mods/imprint.rb +170 -322
  13. data/lib/stanford-mods/record.rb +20 -0
  14. data/lib/stanford-mods/version.rb +1 -1
  15. data/lib/stanford-mods/{searchworks_languages.rb → vocabularies/searchworks_languages.rb} +0 -0
  16. data/lib/stanford-mods.rb +12 -11
  17. data/spec/fixtures/searchworks_imprint_data.rb +38 -39
  18. data/spec/fixtures/searchworks_pub_date_data.rb +7 -7
  19. data/spec/fixtures/spotlight_pub_date_data.rb +7 -7
  20. data/spec/geo_spatial_spec.rb +1 -6
  21. data/spec/imprint_spec.rb +263 -207
  22. data/spec/lib/stanford-mods/coordinate_spec.rb +3 -5
  23. data/spec/name_spec.rb +26 -230
  24. data/spec/origin_info_spec.rb +34 -300
  25. data/spec/searchworks_basic_spec.rb +1 -3
  26. data/spec/searchworks_pub_dates_spec.rb +0 -215
  27. data/spec/searchworks_spec.rb +0 -21
  28. data/spec/searchworks_subject_raw_spec.rb +106 -105
  29. data/spec/searchworks_subject_spec.rb +19 -55
  30. data/spec/searchworks_title_spec.rb +5 -5
  31. data/stanford-mods.gemspec +1 -1
  32. metadata +19 -15
  33. data/lib/marc_countries.rb +0 -387
  34. data/lib/stanford-mods/geo_utils.rb +0 -28
  35. data/lib/stanford-mods/name.rb +0 -80
  36. data/lib/stanford-mods/origin_info.rb +0 -489
  37. data/lib/stanford-mods/searchworks.rb +0 -333
  38. data/lib/stanford-mods/searchworks_subjects.rb +0 -196
  39. data/spec/date_parsing_spec.rb +0 -905
@@ -1,333 +0,0 @@
1
- # encoding: UTF-8
2
- require 'stanford-mods/searchworks_languages'
3
- require 'stanford-mods/searchworks_subjects'
4
- require 'logger'
5
- require 'mods'
6
-
7
- # SearchWorks specific wranglings of MODS metadata as a mixin to the Stanford::Mods::Record object
8
- module Stanford
9
- module Mods
10
- class Record < ::Mods::Record
11
- attr_writer :druid
12
- attr_writer :logger
13
-
14
- def druid
15
- @druid || 'Unknown item'
16
- end
17
-
18
- def logger
19
- @logger ||= Logger.new(STDOUT)
20
- end
21
- alias sw_logger logger
22
-
23
- # include langagues known to SearchWorks; try to error correct when possible (e.g. when ISO-639 disagrees with MARC standard)
24
- def sw_language_facet
25
- result = []
26
- mods_ng_xml.language.each { |n|
27
- # get languageTerm codes and add their translations to the result
28
- n.code_term.each { |ct|
29
- if ct.authority =~ /^iso639/
30
- vals = ct.text.split(/[,|\ ]/).reject { |x| x.strip.empty? }
31
- vals.each do |v|
32
- if ISO_639.find(v.strip)
33
- iso639_val = ISO_639.find(v.strip).english_name
34
- if SEARCHWORKS_LANGUAGES.has_value?(iso639_val)
35
- result << iso639_val
36
- else
37
- result << SEARCHWORKS_LANGUAGES[v.strip]
38
- end
39
- else
40
- logger.warn "Couldn't find english name for #{ct.text}"
41
- end
42
- end
43
- else
44
- vals = ct.text.split(/[,|\ ]/).reject { |x| x.strip.empty? }
45
- vals.each do |v|
46
- result << SEARCHWORKS_LANGUAGES[v.strip]
47
- end
48
- end
49
- }
50
- # add languageTerm text values
51
- n.text_term.each { |tt|
52
- val = tt.text.strip
53
- result << val if !val.empty? && SEARCHWORKS_LANGUAGES.has_value?(val)
54
- }
55
-
56
- # add language values that aren't in languageTerm subelement
57
- if n.languageTerm.empty?
58
- result << n.text if SEARCHWORKS_LANGUAGES.has_value?(n.text)
59
- end
60
- }
61
- result.uniq
62
- end # language_facet
63
-
64
- # ---- AUTHOR ----
65
-
66
- # @return [String] value for author_1xx_search field
67
- def sw_main_author
68
- main_author_w_date
69
- end
70
-
71
- # @return [Array<String>] values for author_7xx_search field
72
- def sw_addl_authors
73
- additional_authors_w_dates
74
- end
75
-
76
- # @return [Array<String>] values for author_person_facet, author_person_display
77
- def sw_person_authors
78
- personal_names_w_dates
79
- end
80
-
81
- # return the display_value_w_date for all <mods><name> elements that do not have type='personal'
82
- # @return [Array<String>] values for author_other_facet
83
- def sw_impersonal_authors
84
- mods_ng_xml.plain_name.select { |n| n.type_at != 'personal' }.map { |n| n.display_value_w_date }
85
- end
86
-
87
- # @return [Array<String>] values for author_corp_display
88
- def sw_corporate_authors
89
- mods_ng_xml.plain_name.select { |n| n.type_at == 'corporate' }.map { |n| n.display_value_w_date }
90
- end
91
-
92
- # @return [Array<String>] values for author_meeting_display
93
- def sw_meeting_authors
94
- mods_ng_xml.plain_name.select { |n| n.type_at == 'conference' }.map { |n| n.display_value_w_date }
95
- end
96
-
97
- # Returns a sortable version of the main_author:
98
- # main_author + sorting title
99
- # which is the mods approximation of the value created for a marc record
100
- # @return [String] value for author_sort field
101
- def sw_sort_author
102
- # substitute java Character.MAX_CODE_POINT for nil main_author so missing main authors sort last
103
- val = '' + (main_author_w_date ? main_author_w_date : "\u{10FFFF} ") + (sort_title ? sort_title : '')
104
- val.gsub(/[[:punct:]]*/, '').strip
105
- end
106
-
107
- def main_author_w_date_test
108
- result = nil
109
- first_wo_role = nil
110
- plain_name.each { |n|
111
- first_wo_role ||= n if n.role.empty?
112
- n.role.each { |r|
113
- if r.authority.include?('marcrelator') &&
114
- (r.value.include?('Creator') || r.value.include?('Author'))
115
- result ||= n.display_value_w_date
116
- end
117
- }
118
- }
119
- result = first_wo_role.display_value_w_date if !result && first_wo_role
120
- result
121
- end
122
-
123
- # ---- end AUTHOR ----
124
-
125
- # ---- TITLE ----
126
-
127
- # @return [String] value for title_245a_search field
128
- def sw_short_title
129
- short_titles ? short_titles.compact.reject(&:empty?).first : nil
130
- end
131
-
132
- # @return [Nokogiri::XML::NodeSet] title_info nodes, rejecting ones that just have blank text values
133
- def present_title_info_nodes
134
- mods_ng_xml.title_info.reject {|node| node.text.strip.empty?}
135
- end
136
-
137
- # @return [Nokogiri::XML::Node] the first titleInfo node if present, else nil
138
- def first_title_info_node
139
- present_title_info_nodes ? present_title_info_nodes.first : nil
140
- end
141
-
142
- # @return [String] the nonSort text portion of the titleInfo node as a string (if non-empty, else nil)
143
- def nonSort_title
144
- return unless first_title_info_node && first_title_info_node.nonSort
145
-
146
- first_title_info_node.nonSort.text.strip.empty? ? nil : first_title_info_node.nonSort.text.strip
147
- end
148
-
149
- # @return [String] the text of the titleInfo node as a string (if non-empty, else nil)
150
- def title
151
- return unless first_title_info_node && first_title_info_node.title
152
-
153
- first_title_info_node.title.text.strip.empty? ? nil : first_title_info_node.title.text.strip
154
- end
155
-
156
- # Searchworks requires that the MODS has a '//titleInfo/title'
157
- # @return [String] value for title_245_search, title_full_display
158
- def sw_full_title
159
- return nil if !first_title_info_node || !title
160
-
161
- preSubTitle = nonSort_title ? [nonSort_title, title].compact.join(" ") : title
162
- preSubTitle.sub!(/:$/, '')
163
-
164
- subTitle = first_title_info_node.subTitle.text.strip
165
- preParts = subTitle.empty? ? preSubTitle : preSubTitle + " : " + subTitle
166
- preParts.sub!(/\.$/, '') if preParts # remove trailing period
167
-
168
- partName = first_title_info_node.partName.text.strip unless first_title_info_node.partName.text.strip.empty?
169
- partNumber = first_title_info_node.partNumber.text.strip unless first_title_info_node.partNumber.text.strip.empty?
170
- partNumber.sub!(/,$/, '') if partNumber # remove trailing comma
171
- if partNumber && partName
172
- parts = partNumber + ", " + partName
173
- elsif partNumber
174
- parts = partNumber
175
- elsif partName
176
- parts = partName
177
- end
178
- parts.sub!(/\.$/, '') if parts
179
-
180
- result = parts ? preParts + ". " + parts : preParts
181
- return nil unless result
182
-
183
- result += "." unless result =~ /[[:punct:]]$/
184
- result.strip!
185
- result = nil if result.empty?
186
- result
187
- end
188
-
189
- # like sw_full_title without trailing \,/;:.
190
- # spec from solrmarc-sw sw_index.properties
191
- # title_display = custom, removeTrailingPunct(245abdefghijklmnopqrstuvwxyz, [\\\\,/;:], ([A-Za-z]{4}|[0-9]{3}|\\)|\\,))
192
- # @return [String] value for title_display (like title_full_display without trailing punctuation)
193
- def sw_title_display
194
- result = sw_full_title
195
- return nil unless result
196
-
197
- result.sub(/[\.,;:\/\\]+$/, '').strip
198
- end
199
-
200
- # this includes all titles except
201
- # @return [Array<String>] values for title_variant_search
202
- def sw_addl_titles
203
- excluded_title = sw_short_title || sw_title_display
204
- if excluded_title.present?
205
- title_regex = Regexp.new(Regexp.escape(excluded_title))
206
- full_titles.reject { |s| s =~ title_regex }.reject(&:blank?)
207
- else
208
- full_titles.reject(&:blank?)
209
- end
210
- end
211
-
212
- # Returns a sortable version of the main title
213
- # @return [String] value for title_sort field
214
- def sw_sort_title
215
- val = '' + (sw_full_title ? sw_full_title : '')
216
- val.sub!(Regexp.new("^" + Regexp.escape(nonSort_title)), '') if nonSort_title
217
- val.gsub!(/[[:punct:]]*/, '').strip
218
- val.squeeze(" ").strip
219
- end
220
-
221
- # remove trailing commas
222
- # @deprecated in favor of sw_title_display
223
- def sw_full_title_without_commas
224
- result = sw_full_title
225
- result.sub!(/,$/, '') if result
226
- result
227
- end
228
-
229
- # ---- end TITLE ----
230
-
231
- # ---- SUBJECT ----
232
- # see searchworks_subjects.rb
233
- # ---- end SUBJECT ----
234
-
235
- # ---- PUBLICATION (place, year) ----
236
- # see origin_info.rb (as all this information comes from top level originInfo element)
237
- # ---- end PUBLICATION (place, year) ----
238
-
239
- # select one or more format values from the controlled vocabulary per JVine Summer 2014
240
- # http://searchworks-solr-lb.stanford.edu:8983/solr/select?facet.field=format_main_ssim&rows=0&facet.sort=index
241
- # https://github.com/sul-dlss/stanford-mods/issues/66 - For geodata, the
242
- # resource type should be only Map and not include Software, multimedia.
243
- # @return <Array[String]> value in the SearchWorks controlled vocabulary
244
- def format_main
245
- types = typeOfResource
246
- return [] unless types
247
-
248
- article_genres = ['article', 'Article',
249
- 'book chapter', 'Book chapter', 'Book Chapter',
250
- 'issue brief', 'Issue brief', 'Issue Brief',
251
- 'project report', 'Project report', 'Project Report',
252
- 'student project report', 'Student project report', 'Student Project report', 'Student Project Report',
253
- 'technical report', 'Technical report', 'Technical Report',
254
- 'working paper', 'Working paper', 'Working Paper'
255
- ]
256
- book_genres = ['conference publication', 'Conference publication', 'Conference Publication',
257
- 'instruction', 'Instruction',
258
- 'librettos', 'Librettos',
259
- 'thesis', 'Thesis'
260
- ]
261
- val = []
262
- genres = term_values(:genre) || []
263
- issuance = term_values([:origin_info, :issuance]) || []
264
- frequency = term_values([:origin_info, :frequency]) || []
265
-
266
- val << 'Dataset' if genres.include?('dataset') || genres.include?('Dataset')
267
-
268
- types.each do |type|
269
- val << 'Archive/Manuscript' if type.manuscript == 'yes'
270
-
271
- case type.text
272
- when 'cartographic'
273
- val << 'Map'
274
- when 'mixed material'
275
- val << 'Archive/Manuscript'
276
- when 'moving image'
277
- val << 'Video'
278
- when 'notated music'
279
- val << 'Music score'
280
- when 'software, multimedia'
281
- val << 'Software/Multimedia' unless types.map(&:text).include?('cartographic') || (genres.include?('dataset') || genres.include?('Dataset'))
282
- when 'sound recording-musical'
283
- val << 'Music recording'
284
- when 'sound recording-nonmusical', 'sound recording'
285
- val << 'Sound recording'
286
- when 'still image'
287
- val << 'Image'
288
- when 'text'
289
- is_explicitly_a_book = type.manuscript != 'yes' && (issuance.include?('monographic') || !(genres & article_genres).empty? || !(genres & book_genres).empty?)
290
- is_periodical = issuance.include?('continuing') || issuance.include?('serial') || frequency.any? { |x| !x.empty? }
291
- is_archived_website = genres.any? { |x| x.casecmp('archived website') == 0 }
292
-
293
- val << 'Book' if is_explicitly_a_book
294
- val << 'Journal/Periodical' if is_periodical
295
- val << 'Archived website' if is_archived_website
296
- val << 'Book' unless is_explicitly_a_book || is_periodical || is_archived_website
297
- when 'three dimensional object'
298
- val << 'Object'
299
- end
300
- end
301
- val.uniq
302
- end
303
-
304
- # @return <Array[String]> values for the genre facet in SearchWorks
305
- def sw_genre
306
- genres = term_values(:genre)
307
- return [] unless genres
308
-
309
- val = genres.map(&:to_s)
310
- thesis_pub = ['thesis', 'Thesis']
311
- val << 'Thesis/Dissertation' if (genres & thesis_pub).any?
312
-
313
- conf_pub = ['conference publication', 'Conference publication', 'Conference Publication']
314
- gov_pub = ['government publication', 'Government publication', 'Government Publication']
315
- tech_rpt = ['technical report', 'Technical report', 'Technical Report']
316
-
317
- val << 'Conference proceedings' if (genres & conf_pub).any?
318
- val << 'Government document' if (genres & gov_pub).any?
319
- val << 'Technical report' if (genres & tech_rpt).any?
320
-
321
- val.uniq
322
- end
323
-
324
- # @return [String] value with the numeric catkey in it, or nil if none exists
325
- def catkey
326
- catkey = term_values([:record_info, :recordIdentifier])
327
- return nil unless catkey && !catkey.empty?
328
-
329
- catkey.first.tr('a', '') # ensure catkey is numeric only
330
- end
331
- end # class Record
332
- end # Module Mods
333
- end # Module Stanford
@@ -1,196 +0,0 @@
1
- # encoding: UTF-8
2
- require 'logger'
3
- require 'mods'
4
-
5
- # SearchWorks specific wranglings of MODS *subject* metadata as a mixin to the Stanford::Mods::Record object
6
- module Stanford
7
- module Mods
8
- class Record < ::Mods::Record
9
- # Values are the contents of:
10
- # subject/geographic
11
- # subject/hierarchicalGeographic
12
- # subject/geographicCode (only include the translated value if it isn't already present from other mods geo fields)
13
- # @param [String] sep - the separator string for joining hierarchicalGeographic sub elements
14
- # @return [Array<String>] values for geographic_search Solr field for this document or [] if none
15
- def sw_geographic_search(sep = ' ')
16
- result = term_values([:subject, :geographic]) || []
17
-
18
- # hierarchicalGeographic has sub elements
19
- mods_ng_xml.subject.hierarchicalGeographic.each { |hg_node|
20
- hg_vals = hg_node.element_children.map(&:text).reject(&:empty?)
21
- result << hg_vals.join(sep) unless hg_vals.empty?
22
- }
23
-
24
- trans_code_vals = mods_ng_xml.subject.geographicCode.translated_value || []
25
- trans_code_vals.each { |val|
26
- result << val unless result.include?(val)
27
- }
28
- result
29
- end
30
-
31
- # Values are the contents of:
32
- # subject/name/namePart
33
- # "Values from namePart subelements should be concatenated in the order they appear (e.g. "Shakespeare, William, 1564-1616")"
34
- # @param [String] sep - the separator string for joining namePart sub elements
35
- # @return [Array<String>] values for names inside subject elements or [] if none
36
- def sw_subject_names(sep = ', ')
37
- mods_ng_xml.subject.name_el
38
- .select { |n_el| n_el.namePart }
39
- .map { |name_el_w_np| name_el_w_np.namePart.map(&:text).reject(&:empty?) }
40
- .reject(&:empty?)
41
- .map { |parts| parts.join(sep).strip }
42
- end
43
-
44
- # Values are the contents of:
45
- # subject/titleInfo/(subelements)
46
- # @param [String] sep - the separator string for joining titleInfo sub elements
47
- # @return [Array<String>] values for titles inside subject elements or [] if none
48
- def sw_subject_titles(sep = ' ')
49
- result = []
50
- mods_ng_xml.subject.titleInfo.each { |ti_el|
51
- parts = ti_el.element_children.map(&:text).reject(&:empty?)
52
- result << parts.join(sep).strip unless parts.empty?
53
- }
54
- result
55
- end
56
-
57
- # Values are the contents of:
58
- # mods/subject/topic
59
- # @return [Array<String>] values for the topic_search Solr field for this document or nil if none
60
- def topic_search
61
- @topic_search ||= begin
62
- vals = []
63
- vals.concat(subject_topics) if subject_topics
64
- vals.empty? ? nil : vals
65
- end
66
- end
67
-
68
- # Values are the contents of:
69
- # subject/topic
70
- # subject/name
71
- # subject/title
72
- # subject/occupation
73
- # with trailing comma, semicolon, and backslash (and any preceding spaces) removed
74
- # @return [Array<String>] values for the topic_facet Solr field for this document or nil if none
75
- def topic_facet
76
- vals = subject_topics ? Array.new(subject_topics) : []
77
- vals.concat(subject_names) if subject_names
78
- vals.concat(subject_titles) if subject_titles
79
- vals.concat(subject_occupations) if subject_occupations
80
- vals.map! { |val| val.sub(/[\\,;]$/, '').strip }
81
- vals.empty? ? nil : vals
82
- end
83
-
84
- # geographic_search values with trailing comma, semicolon, and backslash (and any preceding spaces) removed
85
- # @return [Array<String>] values for the geographic_facet Solr field for this document or nil if none
86
- def geographic_facet
87
- geographic_search.map { |val| val.sub(/[\\,;]$/, '').strip } if geographic_search
88
- end
89
-
90
- # subject/temporal values with trailing comma, semicolon, and backslash (and any preceding spaces) removed
91
- # @return [Array<String>] values for the era_facet Solr field for this document or nil if none
92
- def era_facet
93
- subject_temporal.map { |val| val.sub(/[\\,;]$/, '').strip } if subject_temporal
94
- end
95
-
96
- # Values are the contents of:
97
- # subject/geographic
98
- # subject/hierarchicalGeographic
99
- # subject/geographicCode (only include the translated value if it isn't already present from other mods geo fields)
100
- # @return [Array<String>] values for the geographic_search Solr field for this document or nil if none
101
- def geographic_search
102
- @geographic_search ||= begin
103
- result = sw_geographic_search
104
-
105
- # TODO: this should go into stanford-mods ... but then we have to set that gem up with a Logger
106
- # print a message for any unrecognized encodings
107
- xvals = subject.geographicCode.translated_value
108
- codes = term_values([:subject, :geographicCode])
109
- if codes && codes.size > xvals.size
110
- subject.geographicCode.each { |n|
111
- next unless n.authority != 'marcgac' && n.authority != 'marccountry'
112
-
113
- sw_logger.info("#{druid} has subject geographicCode element with untranslated encoding (#{n.authority}): #{n.to_xml}")
114
- }
115
- end
116
-
117
- # FIXME: stanford-mods should be returning [], not nil ...
118
- return nil if !result || result.empty?
119
-
120
- result
121
- end
122
- end
123
-
124
- # Values are the contents of:
125
- # subject/name
126
- # subject/occupation - no subelements
127
- # subject/titleInfo
128
- # @return [Array<String>] values for the subject_other_search Solr field for this document or nil if none
129
- def subject_other_search
130
- @subject_other_search ||= begin
131
- vals = subject_occupations ? Array.new(subject_occupations) : []
132
- vals.concat(subject_names) if subject_names
133
- vals.concat(subject_titles) if subject_titles
134
- vals.empty? ? nil : vals
135
- end
136
- end
137
-
138
- # Values are the contents of:
139
- # subject/temporal
140
- # subject/genre
141
- # @return [Array<String>] values for the subject_other_subvy_search Solr field for this document or nil if none
142
- def subject_other_subvy_search
143
- @subject_other_subvy_search ||= begin
144
- vals = subject_temporal ? Array.new(subject_temporal) : []
145
- gvals = term_values([:subject, :genre])
146
- vals.concat(gvals) if gvals
147
-
148
- # print a message for any temporal encodings
149
- subject.temporal.each { |n|
150
- sw_logger.info("#{druid} has subject temporal element with untranslated encoding: #{n.to_xml}") unless n.encoding.empty?
151
- }
152
-
153
- vals.empty? ? nil : vals
154
- end
155
- end
156
-
157
- # Values are the contents of:
158
- # all subject subelements except subject/cartographic plus genre top level element
159
- # @return [Array<String>] values for the subject_all_search Solr field for this document or nil if none
160
- def subject_all_search
161
- vals = topic_search ? Array.new(topic_search) : []
162
- vals.concat(geographic_search) if geographic_search
163
- vals.concat(subject_other_search) if subject_other_search
164
- vals.concat(subject_other_subvy_search) if subject_other_subvy_search
165
- vals.empty? ? nil : vals
166
- end
167
-
168
- protected #----------------------------------------------------------
169
-
170
- # convenience method for subject/name/namePart values (to avoid parsing the mods for the same thing multiple times)
171
- def subject_names
172
- @subject_names ||= sw_subject_names
173
- end
174
-
175
- # convenience method for subject/occupation values (to avoid parsing the mods for the same thing multiple times)
176
- def subject_occupations
177
- @subject_occupations ||= term_values([:subject, :occupation])
178
- end
179
-
180
- # convenience method for subject/temporal values (to avoid parsing the mods for the same thing multiple times)
181
- def subject_temporal
182
- @subject_temporal ||= term_values([:subject, :temporal])
183
- end
184
-
185
- # convenience method for subject/titleInfo values (to avoid parsing the mods for the same thing multiple times)
186
- def subject_titles
187
- @subject_titles ||= sw_subject_titles
188
- end
189
-
190
- # convenience method for subject/topic values (to avoid parsing the mods for the same thing multiple times)
191
- def subject_topics
192
- @subject_topics ||= term_values([:subject, :topic])
193
- end
194
- end
195
- end
196
- end