stanford-mods 2.6.2 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +24 -0
  3. data/lib/stanford-mods/{geo_spatial.rb → concerns/geo_spatial.rb} +3 -5
  4. data/lib/stanford-mods/concerns/name.rb +57 -0
  5. data/lib/stanford-mods/concerns/origin_info.rb +113 -0
  6. data/lib/stanford-mods/{physical_location.rb → concerns/physical_location.rb} +2 -2
  7. data/lib/stanford-mods/concerns/searchworks.rb +125 -0
  8. data/lib/stanford-mods/concerns/searchworks_subjects.rb +126 -0
  9. data/lib/stanford-mods/concerns/title.rb +87 -0
  10. data/lib/stanford-mods/coordinate.rb +21 -3
  11. data/lib/stanford-mods/date_parsing.rb +32 -288
  12. data/lib/stanford-mods/imprint.rb +149 -325
  13. data/lib/stanford-mods/record.rb +20 -0
  14. data/lib/stanford-mods/version.rb +1 -1
  15. data/lib/stanford-mods/{searchworks_languages.rb → vocabularies/searchworks_languages.rb} +2 -0
  16. data/lib/stanford-mods.rb +13 -11
  17. data/spec/fixtures/searchworks_imprint_data.rb +38 -39
  18. data/spec/fixtures/searchworks_pub_date_data.rb +7 -7
  19. data/spec/fixtures/spotlight_pub_date_data.rb +7 -7
  20. data/spec/geo_spatial_spec.rb +1 -6
  21. data/spec/imprint_spec.rb +238 -207
  22. data/spec/name_spec.rb +28 -232
  23. data/spec/origin_info_spec.rb +34 -300
  24. data/spec/searchworks_basic_spec.rb +1 -3
  25. data/spec/searchworks_pub_dates_spec.rb +0 -215
  26. data/spec/searchworks_spec.rb +0 -21
  27. data/spec/searchworks_subject_raw_spec.rb +106 -105
  28. data/spec/searchworks_subject_spec.rb +19 -55
  29. data/spec/searchworks_title_spec.rb +5 -5
  30. data/stanford-mods.gemspec +1 -1
  31. metadata +24 -20
  32. data/.travis.yml +0 -17
  33. data/lib/marc_countries.rb +0 -387
  34. data/lib/stanford-mods/geo_utils.rb +0 -28
  35. data/lib/stanford-mods/name.rb +0 -80
  36. data/lib/stanford-mods/origin_info.rb +0 -489
  37. data/lib/stanford-mods/searchworks.rb +0 -333
  38. data/lib/stanford-mods/searchworks_subjects.rb +0 -196
  39. data/spec/date_parsing_spec.rb +0 -905
@@ -1,333 +0,0 @@
1
- # encoding: UTF-8
2
- require 'stanford-mods/searchworks_languages'
3
- require 'stanford-mods/searchworks_subjects'
4
- require 'logger'
5
- require 'mods'
6
-
7
- # SearchWorks specific wranglings of MODS metadata as a mixin to the Stanford::Mods::Record object
8
- module Stanford
9
- module Mods
10
- class Record < ::Mods::Record
11
- attr_writer :druid
12
- attr_writer :logger
13
-
14
- def druid
15
- @druid || 'Unknown item'
16
- end
17
-
18
- def logger
19
- @logger ||= Logger.new(STDOUT)
20
- end
21
- alias sw_logger logger
22
-
23
- # include langagues known to SearchWorks; try to error correct when possible (e.g. when ISO-639 disagrees with MARC standard)
24
- def sw_language_facet
25
- result = []
26
- mods_ng_xml.language.each { |n|
27
- # get languageTerm codes and add their translations to the result
28
- n.code_term.each { |ct|
29
- if ct.authority =~ /^iso639/
30
- vals = ct.text.split(/[,|\ ]/).reject { |x| x.strip.empty? }
31
- vals.each do |v|
32
- if ISO_639.find(v.strip)
33
- iso639_val = ISO_639.find(v.strip).english_name
34
- if SEARCHWORKS_LANGUAGES.has_value?(iso639_val)
35
- result << iso639_val
36
- else
37
- result << SEARCHWORKS_LANGUAGES[v.strip]
38
- end
39
- else
40
- logger.warn "Couldn't find english name for #{ct.text}"
41
- end
42
- end
43
- else
44
- vals = ct.text.split(/[,|\ ]/).reject { |x| x.strip.empty? }
45
- vals.each do |v|
46
- result << SEARCHWORKS_LANGUAGES[v.strip]
47
- end
48
- end
49
- }
50
- # add languageTerm text values
51
- n.text_term.each { |tt|
52
- val = tt.text.strip
53
- result << val if !val.empty? && SEARCHWORKS_LANGUAGES.has_value?(val)
54
- }
55
-
56
- # add language values that aren't in languageTerm subelement
57
- if n.languageTerm.empty?
58
- result << n.text if SEARCHWORKS_LANGUAGES.has_value?(n.text)
59
- end
60
- }
61
- result.uniq
62
- end # language_facet
63
-
64
- # ---- AUTHOR ----
65
-
66
- # @return [String] value for author_1xx_search field
67
- def sw_main_author
68
- main_author_w_date
69
- end
70
-
71
- # @return [Array<String>] values for author_7xx_search field
72
- def sw_addl_authors
73
- additional_authors_w_dates
74
- end
75
-
76
- # @return [Array<String>] values for author_person_facet, author_person_display
77
- def sw_person_authors
78
- personal_names_w_dates
79
- end
80
-
81
- # return the display_value_w_date for all <mods><name> elements that do not have type='personal'
82
- # @return [Array<String>] values for author_other_facet
83
- def sw_impersonal_authors
84
- mods_ng_xml.plain_name.select { |n| n.type_at != 'personal' }.map { |n| n.display_value_w_date }
85
- end
86
-
87
- # @return [Array<String>] values for author_corp_display
88
- def sw_corporate_authors
89
- mods_ng_xml.plain_name.select { |n| n.type_at == 'corporate' }.map { |n| n.display_value_w_date }
90
- end
91
-
92
- # @return [Array<String>] values for author_meeting_display
93
- def sw_meeting_authors
94
- mods_ng_xml.plain_name.select { |n| n.type_at == 'conference' }.map { |n| n.display_value_w_date }
95
- end
96
-
97
- # Returns a sortable version of the main_author:
98
- # main_author + sorting title
99
- # which is the mods approximation of the value created for a marc record
100
- # @return [String] value for author_sort field
101
- def sw_sort_author
102
- # substitute java Character.MAX_CODE_POINT for nil main_author so missing main authors sort last
103
- val = '' + (main_author_w_date ? main_author_w_date : "\u{10FFFF} ") + (sort_title ? sort_title : '')
104
- val.gsub(/[[:punct:]]*/, '').strip
105
- end
106
-
107
- def main_author_w_date_test
108
- result = nil
109
- first_wo_role = nil
110
- plain_name.each { |n|
111
- first_wo_role ||= n if n.role.empty?
112
- n.role.each { |r|
113
- if r.authority.include?('marcrelator') &&
114
- (r.value.include?('Creator') || r.value.include?('Author'))
115
- result ||= n.display_value_w_date
116
- end
117
- }
118
- }
119
- result = first_wo_role.display_value_w_date if !result && first_wo_role
120
- result
121
- end
122
-
123
- # ---- end AUTHOR ----
124
-
125
- # ---- TITLE ----
126
-
127
- # @return [String] value for title_245a_search field
128
- def sw_short_title
129
- short_titles ? short_titles.compact.reject(&:empty?).first : nil
130
- end
131
-
132
- # @return [Nokogiri::XML::NodeSet] title_info nodes, rejecting ones that just have blank text values
133
- def present_title_info_nodes
134
- mods_ng_xml.title_info.reject {|node| node.text.strip.empty?}
135
- end
136
-
137
- # @return [Nokogiri::XML::Node] the first titleInfo node if present, else nil
138
- def first_title_info_node
139
- present_title_info_nodes ? present_title_info_nodes.first : nil
140
- end
141
-
142
- # @return [String] the nonSort text portion of the titleInfo node as a string (if non-empty, else nil)
143
- def nonSort_title
144
- return unless first_title_info_node && first_title_info_node.nonSort
145
-
146
- first_title_info_node.nonSort.text.strip.empty? ? nil : first_title_info_node.nonSort.text.strip
147
- end
148
-
149
- # @return [String] the text of the titleInfo node as a string (if non-empty, else nil)
150
- def title
151
- return unless first_title_info_node && first_title_info_node.title
152
-
153
- first_title_info_node.title.text.strip.empty? ? nil : first_title_info_node.title.text.strip
154
- end
155
-
156
- # Searchworks requires that the MODS has a '//titleInfo/title'
157
- # @return [String] value for title_245_search, title_full_display
158
- def sw_full_title
159
- return nil if !first_title_info_node || !title
160
-
161
- preSubTitle = nonSort_title ? [nonSort_title, title].compact.join(" ") : title
162
- preSubTitle.sub!(/:$/, '')
163
-
164
- subTitle = first_title_info_node.subTitle.text.strip
165
- preParts = subTitle.empty? ? preSubTitle : preSubTitle + " : " + subTitle
166
- preParts.sub!(/\.$/, '') if preParts # remove trailing period
167
-
168
- partName = first_title_info_node.partName.text.strip unless first_title_info_node.partName.text.strip.empty?
169
- partNumber = first_title_info_node.partNumber.text.strip unless first_title_info_node.partNumber.text.strip.empty?
170
- partNumber.sub!(/,$/, '') if partNumber # remove trailing comma
171
- if partNumber && partName
172
- parts = partNumber + ", " + partName
173
- elsif partNumber
174
- parts = partNumber
175
- elsif partName
176
- parts = partName
177
- end
178
- parts.sub!(/\.$/, '') if parts
179
-
180
- result = parts ? preParts + ". " + parts : preParts
181
- return nil unless result
182
-
183
- result += "." unless result =~ /[[:punct:]]$/
184
- result.strip!
185
- result = nil if result.empty?
186
- result
187
- end
188
-
189
- # like sw_full_title without trailing \,/;:.
190
- # spec from solrmarc-sw sw_index.properties
191
- # title_display = custom, removeTrailingPunct(245abdefghijklmnopqrstuvwxyz, [\\\\,/;:], ([A-Za-z]{4}|[0-9]{3}|\\)|\\,))
192
- # @return [String] value for title_display (like title_full_display without trailing punctuation)
193
- def sw_title_display
194
- result = sw_full_title
195
- return nil unless result
196
-
197
- result.sub(/[\.,;:\/\\]+$/, '').strip
198
- end
199
-
200
- # this includes all titles except
201
- # @return [Array<String>] values for title_variant_search
202
- def sw_addl_titles
203
- excluded_title = sw_short_title || sw_title_display
204
- if excluded_title.present?
205
- title_regex = Regexp.new(Regexp.escape(excluded_title))
206
- full_titles.reject { |s| s =~ title_regex }.reject(&:blank?)
207
- else
208
- full_titles.reject(&:blank?)
209
- end
210
- end
211
-
212
- # Returns a sortable version of the main title
213
- # @return [String] value for title_sort field
214
- def sw_sort_title
215
- val = '' + (sw_full_title ? sw_full_title : '')
216
- val.sub!(Regexp.new("^" + Regexp.escape(nonSort_title)), '') if nonSort_title
217
- val.gsub!(/[[:punct:]]*/, '').strip
218
- val.squeeze(" ").strip
219
- end
220
-
221
- # remove trailing commas
222
- # @deprecated in favor of sw_title_display
223
- def sw_full_title_without_commas
224
- result = sw_full_title
225
- result.sub!(/,$/, '') if result
226
- result
227
- end
228
-
229
- # ---- end TITLE ----
230
-
231
- # ---- SUBJECT ----
232
- # see searchworks_subjects.rb
233
- # ---- end SUBJECT ----
234
-
235
- # ---- PUBLICATION (place, year) ----
236
- # see origin_info.rb (as all this information comes from top level originInfo element)
237
- # ---- end PUBLICATION (place, year) ----
238
-
239
- # select one or more format values from the controlled vocabulary per JVine Summer 2014
240
- # http://searchworks-solr-lb.stanford.edu:8983/solr/select?facet.field=format_main_ssim&rows=0&facet.sort=index
241
- # https://github.com/sul-dlss/stanford-mods/issues/66 - For geodata, the
242
- # resource type should be only Map and not include Software, multimedia.
243
- # @return <Array[String]> value in the SearchWorks controlled vocabulary
244
- def format_main
245
- types = typeOfResource
246
- return [] unless types
247
-
248
- article_genres = ['article', 'Article',
249
- 'book chapter', 'Book chapter', 'Book Chapter',
250
- 'issue brief', 'Issue brief', 'Issue Brief',
251
- 'project report', 'Project report', 'Project Report',
252
- 'student project report', 'Student project report', 'Student Project report', 'Student Project Report',
253
- 'technical report', 'Technical report', 'Technical Report',
254
- 'working paper', 'Working paper', 'Working Paper'
255
- ]
256
- book_genres = ['conference publication', 'Conference publication', 'Conference Publication',
257
- 'instruction', 'Instruction',
258
- 'librettos', 'Librettos',
259
- 'thesis', 'Thesis'
260
- ]
261
- val = []
262
- genres = term_values(:genre) || []
263
- issuance = term_values([:origin_info, :issuance]) || []
264
- frequency = term_values([:origin_info, :frequency]) || []
265
-
266
- val << 'Dataset' if genres.include?('dataset') || genres.include?('Dataset')
267
-
268
- types.each do |type|
269
- val << 'Archive/Manuscript' if type.manuscript == 'yes'
270
-
271
- case type.text
272
- when 'cartographic'
273
- val << 'Map'
274
- when 'mixed material'
275
- val << 'Archive/Manuscript'
276
- when 'moving image'
277
- val << 'Video'
278
- when 'notated music'
279
- val << 'Music score'
280
- when 'software, multimedia'
281
- val << 'Software/Multimedia' unless types.map(&:text).include?('cartographic') || (genres.include?('dataset') || genres.include?('Dataset'))
282
- when 'sound recording-musical'
283
- val << 'Music recording'
284
- when 'sound recording-nonmusical', 'sound recording'
285
- val << 'Sound recording'
286
- when 'still image'
287
- val << 'Image'
288
- when 'text'
289
- is_explicitly_a_book = type.manuscript != 'yes' && (issuance.include?('monographic') || !(genres & article_genres).empty? || !(genres & book_genres).empty?)
290
- is_periodical = issuance.include?('continuing') || issuance.include?('serial') || frequency.any? { |x| !x.empty? }
291
- is_archived_website = genres.any? { |x| x.casecmp('archived website') == 0 }
292
-
293
- val << 'Book' if is_explicitly_a_book
294
- val << 'Journal/Periodical' if is_periodical
295
- val << 'Archived website' if is_archived_website
296
- val << 'Book' unless is_explicitly_a_book || is_periodical || is_archived_website
297
- when 'three dimensional object'
298
- val << 'Object'
299
- end
300
- end
301
- val.uniq
302
- end
303
-
304
- # @return <Array[String]> values for the genre facet in SearchWorks
305
- def sw_genre
306
- genres = term_values(:genre)
307
- return [] unless genres
308
-
309
- val = genres.map(&:to_s)
310
- thesis_pub = ['thesis', 'Thesis']
311
- val << 'Thesis/Dissertation' if (genres & thesis_pub).any?
312
-
313
- conf_pub = ['conference publication', 'Conference publication', 'Conference Publication']
314
- gov_pub = ['government publication', 'Government publication', 'Government Publication']
315
- tech_rpt = ['technical report', 'Technical report', 'Technical Report']
316
-
317
- val << 'Conference proceedings' if (genres & conf_pub).any?
318
- val << 'Government document' if (genres & gov_pub).any?
319
- val << 'Technical report' if (genres & tech_rpt).any?
320
-
321
- val.uniq
322
- end
323
-
324
- # @return [String] value with the numeric catkey in it, or nil if none exists
325
- def catkey
326
- catkey = term_values([:record_info, :recordIdentifier])
327
- return nil unless catkey && !catkey.empty?
328
-
329
- catkey.first.tr('a', '') # ensure catkey is numeric only
330
- end
331
- end # class Record
332
- end # Module Mods
333
- end # Module Stanford
@@ -1,196 +0,0 @@
1
- # encoding: UTF-8
2
- require 'logger'
3
- require 'mods'
4
-
5
- # SearchWorks specific wranglings of MODS *subject* metadata as a mixin to the Stanford::Mods::Record object
6
- module Stanford
7
- module Mods
8
- class Record < ::Mods::Record
9
- # Values are the contents of:
10
- # subject/geographic
11
- # subject/hierarchicalGeographic
12
- # subject/geographicCode (only include the translated value if it isn't already present from other mods geo fields)
13
- # @param [String] sep - the separator string for joining hierarchicalGeographic sub elements
14
- # @return [Array<String>] values for geographic_search Solr field for this document or [] if none
15
- def sw_geographic_search(sep = ' ')
16
- result = term_values([:subject, :geographic]) || []
17
-
18
- # hierarchicalGeographic has sub elements
19
- mods_ng_xml.subject.hierarchicalGeographic.each { |hg_node|
20
- hg_vals = hg_node.element_children.map(&:text).reject(&:empty?)
21
- result << hg_vals.join(sep) unless hg_vals.empty?
22
- }
23
-
24
- trans_code_vals = mods_ng_xml.subject.geographicCode.translated_value || []
25
- trans_code_vals.each { |val|
26
- result << val unless result.include?(val)
27
- }
28
- result
29
- end
30
-
31
- # Values are the contents of:
32
- # subject/name/namePart
33
- # "Values from namePart subelements should be concatenated in the order they appear (e.g. "Shakespeare, William, 1564-1616")"
34
- # @param [String] sep - the separator string for joining namePart sub elements
35
- # @return [Array<String>] values for names inside subject elements or [] if none
36
- def sw_subject_names(sep = ', ')
37
- mods_ng_xml.subject.name_el
38
- .select { |n_el| n_el.namePart }
39
- .map { |name_el_w_np| name_el_w_np.namePart.map(&:text).reject(&:empty?) }
40
- .reject(&:empty?)
41
- .map { |parts| parts.join(sep).strip }
42
- end
43
-
44
- # Values are the contents of:
45
- # subject/titleInfo/(subelements)
46
- # @param [String] sep - the separator string for joining titleInfo sub elements
47
- # @return [Array<String>] values for titles inside subject elements or [] if none
48
- def sw_subject_titles(sep = ' ')
49
- result = []
50
- mods_ng_xml.subject.titleInfo.each { |ti_el|
51
- parts = ti_el.element_children.map(&:text).reject(&:empty?)
52
- result << parts.join(sep).strip unless parts.empty?
53
- }
54
- result
55
- end
56
-
57
- # Values are the contents of:
58
- # mods/subject/topic
59
- # @return [Array<String>] values for the topic_search Solr field for this document or nil if none
60
- def topic_search
61
- @topic_search ||= begin
62
- vals = []
63
- vals.concat(subject_topics) if subject_topics
64
- vals.empty? ? nil : vals
65
- end
66
- end
67
-
68
- # Values are the contents of:
69
- # subject/topic
70
- # subject/name
71
- # subject/title
72
- # subject/occupation
73
- # with trailing comma, semicolon, and backslash (and any preceding spaces) removed
74
- # @return [Array<String>] values for the topic_facet Solr field for this document or nil if none
75
- def topic_facet
76
- vals = subject_topics ? Array.new(subject_topics) : []
77
- vals.concat(subject_names) if subject_names
78
- vals.concat(subject_titles) if subject_titles
79
- vals.concat(subject_occupations) if subject_occupations
80
- vals.map! { |val| val.sub(/[\\,;]$/, '').strip }
81
- vals.empty? ? nil : vals
82
- end
83
-
84
- # geographic_search values with trailing comma, semicolon, and backslash (and any preceding spaces) removed
85
- # @return [Array<String>] values for the geographic_facet Solr field for this document or nil if none
86
- def geographic_facet
87
- geographic_search.map { |val| val.sub(/[\\,;]$/, '').strip } if geographic_search
88
- end
89
-
90
- # subject/temporal values with trailing comma, semicolon, and backslash (and any preceding spaces) removed
91
- # @return [Array<String>] values for the era_facet Solr field for this document or nil if none
92
- def era_facet
93
- subject_temporal.map { |val| val.sub(/[\\,;]$/, '').strip } if subject_temporal
94
- end
95
-
96
- # Values are the contents of:
97
- # subject/geographic
98
- # subject/hierarchicalGeographic
99
- # subject/geographicCode (only include the translated value if it isn't already present from other mods geo fields)
100
- # @return [Array<String>] values for the geographic_search Solr field for this document or nil if none
101
- def geographic_search
102
- @geographic_search ||= begin
103
- result = sw_geographic_search
104
-
105
- # TODO: this should go into stanford-mods ... but then we have to set that gem up with a Logger
106
- # print a message for any unrecognized encodings
107
- xvals = subject.geographicCode.translated_value
108
- codes = term_values([:subject, :geographicCode])
109
- if codes && codes.size > xvals.size
110
- subject.geographicCode.each { |n|
111
- next unless n.authority != 'marcgac' && n.authority != 'marccountry'
112
-
113
- sw_logger.info("#{druid} has subject geographicCode element with untranslated encoding (#{n.authority}): #{n.to_xml}")
114
- }
115
- end
116
-
117
- # FIXME: stanford-mods should be returning [], not nil ...
118
- return nil if !result || result.empty?
119
-
120
- result
121
- end
122
- end
123
-
124
- # Values are the contents of:
125
- # subject/name
126
- # subject/occupation - no subelements
127
- # subject/titleInfo
128
- # @return [Array<String>] values for the subject_other_search Solr field for this document or nil if none
129
- def subject_other_search
130
- @subject_other_search ||= begin
131
- vals = subject_occupations ? Array.new(subject_occupations) : []
132
- vals.concat(subject_names) if subject_names
133
- vals.concat(subject_titles) if subject_titles
134
- vals.empty? ? nil : vals
135
- end
136
- end
137
-
138
- # Values are the contents of:
139
- # subject/temporal
140
- # subject/genre
141
- # @return [Array<String>] values for the subject_other_subvy_search Solr field for this document or nil if none
142
- def subject_other_subvy_search
143
- @subject_other_subvy_search ||= begin
144
- vals = subject_temporal ? Array.new(subject_temporal) : []
145
- gvals = term_values([:subject, :genre])
146
- vals.concat(gvals) if gvals
147
-
148
- # print a message for any temporal encodings
149
- subject.temporal.each { |n|
150
- sw_logger.info("#{druid} has subject temporal element with untranslated encoding: #{n.to_xml}") unless n.encoding.empty?
151
- }
152
-
153
- vals.empty? ? nil : vals
154
- end
155
- end
156
-
157
- # Values are the contents of:
158
- # all subject subelements except subject/cartographic plus genre top level element
159
- # @return [Array<String>] values for the subject_all_search Solr field for this document or nil if none
160
- def subject_all_search
161
- vals = topic_search ? Array.new(topic_search) : []
162
- vals.concat(geographic_search) if geographic_search
163
- vals.concat(subject_other_search) if subject_other_search
164
- vals.concat(subject_other_subvy_search) if subject_other_subvy_search
165
- vals.empty? ? nil : vals
166
- end
167
-
168
- protected #----------------------------------------------------------
169
-
170
- # convenience method for subject/name/namePart values (to avoid parsing the mods for the same thing multiple times)
171
- def subject_names
172
- @subject_names ||= sw_subject_names
173
- end
174
-
175
- # convenience method for subject/occupation values (to avoid parsing the mods for the same thing multiple times)
176
- def subject_occupations
177
- @subject_occupations ||= term_values([:subject, :occupation])
178
- end
179
-
180
- # convenience method for subject/temporal values (to avoid parsing the mods for the same thing multiple times)
181
- def subject_temporal
182
- @subject_temporal ||= term_values([:subject, :temporal])
183
- end
184
-
185
- # convenience method for subject/titleInfo values (to avoid parsing the mods for the same thing multiple times)
186
- def subject_titles
187
- @subject_titles ||= sw_subject_titles
188
- end
189
-
190
- # convenience method for subject/topic values (to avoid parsing the mods for the same thing multiple times)
191
- def subject_topics
192
- @subject_topics ||= term_values([:subject, :topic])
193
- end
194
- end
195
- end
196
- end