ds-convert 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +294 -0
  3. data/Rakefile +12 -0
  4. data/config/settings.yml +150 -0
  5. data/exe/ds-convert +149 -0
  6. data/exe/ds-recon +275 -0
  7. data/exe/ds-validate-csv +40 -0
  8. data/exe/marc-mrc-to-xml.rb +80 -0
  9. data/lib/ds/cli.rb +102 -0
  10. data/lib/ds/constants.rb +166 -0
  11. data/lib/ds/converter/converter.rb +124 -0
  12. data/lib/ds/converter/writer.rb +50 -0
  13. data/lib/ds/converter.rb +7 -0
  14. data/lib/ds/csv_util.rb +43 -0
  15. data/lib/ds/data/berkeley-arks.txt +4000 -0
  16. data/lib/ds/data/getty-aat-centuries.csv +71 -0
  17. data/lib/ds/data/iiif_manifests.csv +122 -0
  18. data/lib/ds/data/legacy-iiif-manifests.csv +77 -0
  19. data/lib/ds/ds_error.rb +1 -0
  20. data/lib/ds/extractor/base_record_locator.rb +24 -0
  21. data/lib/ds/extractor/base_term.rb +79 -0
  22. data/lib/ds/extractor/csv_record_locator.rb +13 -0
  23. data/lib/ds/extractor/ds_csv_extractor.rb +695 -0
  24. data/lib/ds/extractor/ds_mets_xml_extractor.rb +1114 -0
  25. data/lib/ds/extractor/genre.rb +45 -0
  26. data/lib/ds/extractor/language.rb +31 -0
  27. data/lib/ds/extractor/marc_xml_extractor.rb +1172 -0
  28. data/lib/ds/extractor/material.rb +12 -0
  29. data/lib/ds/extractor/name.rb +50 -0
  30. data/lib/ds/extractor/place.rb +11 -0
  31. data/lib/ds/extractor/subject.rb +58 -0
  32. data/lib/ds/extractor/tei_xml_extractor.rb +687 -0
  33. data/lib/ds/extractor/title.rb +52 -0
  34. data/lib/ds/extractor/xml_record_locator.rb +38 -0
  35. data/lib/ds/extractor.rb +24 -0
  36. data/lib/ds/institutions.rb +55 -0
  37. data/lib/ds/manifest/base_id_validator.rb +76 -0
  38. data/lib/ds/manifest/constants.rb +67 -0
  39. data/lib/ds/manifest/ds_csv_id_validator.rb +15 -0
  40. data/lib/ds/manifest/entry.rb +133 -0
  41. data/lib/ds/manifest/manifest.rb +74 -0
  42. data/lib/ds/manifest/manifest_validator.rb +256 -0
  43. data/lib/ds/manifest/simple_xml_id_validator.rb +42 -0
  44. data/lib/ds/manifest.rb +30 -0
  45. data/lib/ds/mapper/base_mapper.rb +221 -0
  46. data/lib/ds/mapper/ds_csv_mapper.rb +77 -0
  47. data/lib/ds/mapper/ds_mets_mapper.rb +85 -0
  48. data/lib/ds/mapper/marc_mapper.rb +87 -0
  49. data/lib/ds/mapper/tei_xml_mapper.rb +79 -0
  50. data/lib/ds/mapper.rb +13 -0
  51. data/lib/ds/recon/constants.rb +56 -0
  52. data/lib/ds/recon/ds_csv_enumerator.rb +16 -0
  53. data/lib/ds/recon/ds_mets_xml_enumerator.rb +14 -0
  54. data/lib/ds/recon/marc_xml_enumerator.rb +15 -0
  55. data/lib/ds/recon/recon_builder.rb +183 -0
  56. data/lib/ds/recon/recon_data.rb +37 -0
  57. data/lib/ds/recon/recon_manager.rb +92 -0
  58. data/lib/ds/recon/source_enumerator.rb +21 -0
  59. data/lib/ds/recon/tei_xml_enumerator.rb +14 -0
  60. data/lib/ds/recon/type/all_subjects.rb +18 -0
  61. data/lib/ds/recon/type/genres.rb +50 -0
  62. data/lib/ds/recon/type/languages.rb +38 -0
  63. data/lib/ds/recon/type/materials.rb +40 -0
  64. data/lib/ds/recon/type/named_subjects.rb +20 -0
  65. data/lib/ds/recon/type/names.rb +65 -0
  66. data/lib/ds/recon/type/places.rb +40 -0
  67. data/lib/ds/recon/type/recon_type.rb +136 -0
  68. data/lib/ds/recon/type/splits.rb +34 -0
  69. data/lib/ds/recon/type/subjects.rb +65 -0
  70. data/lib/ds/recon/type/titles.rb +38 -0
  71. data/lib/ds/recon/url_lookup.rb +52 -0
  72. data/lib/ds/recon.rb +292 -0
  73. data/lib/ds/source/base_source.rb +32 -0
  74. data/lib/ds/source/ds_csv.rb +18 -0
  75. data/lib/ds/source/ds_mets_xml.rb +20 -0
  76. data/lib/ds/source/marc_xml.rb +22 -0
  77. data/lib/ds/source/source_cache.rb +69 -0
  78. data/lib/ds/source/tei_xml.rb +22 -0
  79. data/lib/ds/source.rb +20 -0
  80. data/lib/ds/util/cache.rb +111 -0
  81. data/lib/ds/util/csv_validator.rb +209 -0
  82. data/lib/ds/util/csv_writer.rb +42 -0
  83. data/lib/ds/util/strings.rb +194 -0
  84. data/lib/ds/util.rb +37 -0
  85. data/lib/ds/version.rb +5 -0
  86. data/lib/ds.rb +237 -0
  87. metadata +246 -0
@@ -0,0 +1,1172 @@
1
+ require 'csv'
2
+
3
+ module DS
4
+ module Extractor
5
+ module MarcXmlExtractor
6
+ module ClassMethods
7
+
8
+ ############################################################
9
+ # NAMES
10
+ ############################################################
11
+
12
+ ##
13
+ # Extract names from record using tags and relators. Tags understood are +100+,
14
+ # +700+, and +710+. The +relators+ are used to require datafields based on the
15
+ # contents of a subfield code +e+ containing the specified value, like 'scribe':
16
+ #
17
+ # contains(./subfield[@code ='e'], 'scribe')
18
+ #
19
+ # @see #build_name_query for details on query construction
20
+ #
21
+ # @param [Nokogiri::XML:Node] record a +<marc:record>+ node
22
+ # @param [Array<String>] tags the MARC field tag[s]
23
+ # @param [Array<String>] relators for +700$e+, +710$e+, a value[s] like 'former owner'
24
+ # @return [Array<String>] list of names
25
# Extract the names, as recorded, from the datafields selected by +tags+
# and +relators+.
#
# @param record [Nokogiri::XML::Node] a +<marc:record>+ node
# @param tags [Array<String, Integer>] the MARC field tag[s]
# @param relators [Array<String>] required +$e+ relator value[s]
# @return [Array<String>] the names; +[]+ when no query can be built
def extract_names_as_recorded(record, tags: [], relators: [])
  xpath = build_name_query(tags: tags, relators: relators)
  # Return an Array (not '') so callers can always concatenate the result
  # with +/+=, as #extract_authors_as_recorded does.
  return [] if xpath.empty?

  record.xpath(xpath).map do |datafield|
    DS::Util.clean_string(extract_name_portion(datafield))
  end
end
30
+
31
+ ##
32
+ # Extract names from record using tags and relators. Authors are extracted
33
+ # from datafields 100, 110, 111, 700, 710, and 711.
34
+ #
35
+ # All 1xx are extracted, no relator is assumed and all 1xx are assumed to
36
+ # be authors.
37
+ #
38
+ # 700, 710, and 711 are extracted when subfield 7xx$e contains 'author'.
39
+ #
40
+ # @see #build_name_query for details on query construction
41
+ #
42
+ # @param [Nokogiri::XML:Node] record a +<marc:record>+ node
43
+ # @return [Array<String>] list of names
44
# Collect author names as recorded: every 1xx name, followed by the 7xx
# names whose +$e+ contains 'author'.
def extract_authors_as_recorded(record)
  main_entries  = extract_names_as_recorded(record, tags: [100, 110, 111])
  added_entries = extract_names_as_recorded(record, tags: [700, 710, 711], relators: %w[author])
  main_entries + added_entries
end
50
+
51
+ # Extract the alternate graphical representation of the name or return +[]+.
52
+ #
53
+ # See MARC specification for 880 fields:
54
+ #
55
+ # * https://www.loc.gov/marc/bibliographic/bd880.html
56
+ #
57
+ # @param [Nokogiri::XML:Node] record a +<marc:record>+ node
58
+ # @return [Array<String>] list of names or +[]+
59
# Collect the alternate graphical representations of the author names:
# every 1xx name, followed by the 7xx names whose +$e+ contains 'author'.
def extract_authors_as_recorded_agr(record)
  main_entries  = extract_names_as_recorded_agr(record, tags: [100, 110, 111])
  added_entries = extract_names_as_recorded_agr(record, tags: [700, 710, 711], relators: %w[author])
  main_entries + added_entries
end
65
+
66
+ # Extract scribes from the given record.
67
+ #
68
+ # @param record [Nokogiri::XML:Node] the record to extract scribes from
69
+ # @return [Array<DS::Extractor::Name>] the extracted scribes
70
# Scribes are the 700/710/711 names whose +$e+ contains 'scribe'.
def extract_scribes(record)
  extract_names(record, tags: [700, 710, 711], relators: ['scribe'])
end
75
+
76
+ # Extract scribes as recorded from the given record.
77
+ #
78
+ # @param [Nokogiri::XML:Node] record the record to extract scribes from
79
+ # @return [Array<String>] the extracted scribes as recorded
80
# Return the +as_recorded+ value of each extracted scribe.
def extract_scribes_as_recorded(record)
  extract_scribes(record).map { |scribe| scribe.as_recorded }
end
83
+
84
+ # Extract scribes as recorded with vernacular form from the given record.
85
+ #
86
+ # @param [Nokogiri::XML:Node] record the record to extract scribes from
87
+ # @return [Array<String>] the extracted scribes as recorded
88
# Return the +vernacular+ value of each extracted scribe.
def extract_scribes_as_recorded_agr(record)
  extract_scribes(record).map { |scribe| scribe.vernacular }
end
91
+
92
+ # Extracts artists from the given record using the specified type and role.
93
+ #
94
+ # @param [Nokogiri::XML:Node] record the record to extract artists from
95
+ # @return [Array<DS::Extractor::Name>] an array of extracted artists
96
# Artists are the 700/710/711 names whose +$e+ contains 'artist' or
# 'illuminator'.
def extract_artists(record)
  extract_names(record, tags: [700, 710, 711], relators: %w[artist illuminator])
end
102
+
103
+ # Extracts artists as recorded from the given record.
104
+ #
105
+ # @param [Nokogiri::XML:Node] record the record to extract artists from
106
+ # @return [Array<String>] the extracted artists as recorded
107
# Return the +as_recorded+ value of each extracted artist.
def extract_artists_as_recorded(record)
  extract_artists(record).map { |artist| artist.as_recorded }
end
110
+
111
+ # Extracts artists as recorded with vernacular form from the given record.
112
+ #
113
+ # @param [Nokogiri::XML:Node] record the record to extract artists from
114
+ # @return [Array<String>] the extracted artists as recorded with vernacular form
115
# Return the +vernacular+ value of each extracted artist.
def extract_artists_as_recorded_agr(record)
  extract_artists(record).map { |artist| artist.vernacular }
end
118
+
119
+ # Extract former owners from the given record.
120
+ #
121
+ # @param [Nokogiri::XML:Node] record the record to extract former owners from
122
+ # @return [Array<DS::Extractor::Name>] the extracted former owners
123
# Former owners are the 700/710/711 names whose +$e+ contains
# 'former owner'.
def extract_former_owners(record)
  extract_names(record, tags: [700, 710, 711], relators: ['former owner'])
end
128
+
129
+ # Extracts former owners as recorded from the given record.
130
+ #
131
+ # @param [Nokogiri::XML:Node] record the record to extract former owners from
132
+ # @return [Array<String>] the extracted former owners as recorded
133
# Return the +as_recorded+ value of each extracted former owner.
def extract_former_owners_as_recorded(record)
  extract_former_owners(record).map { |owner| owner.as_recorded }
end
136
+
137
+ # Extracts former owners as recorded with vernacular form from the given record.
138
+ #
139
+ # @param [Nokogiri::XML:Node] record the record to extract former owners from
140
+ # @return [Array<String>] the extracted former owners as recorded with vernacular form
141
# Return the +vernacular+ value of each extracted former owner.
def extract_former_owners_as_recorded_agr(record)
  extract_former_owners(record).map { |owner| owner.vernacular }
end
144
+
145
+ # Extracts scribes as recorded with vernacular form from the given record.
146
+ #
147
+ # @param [Nokogiri::XML:Node] record the record to extract scribes from
148
+ # @return [Array<String>] the extracted scribes as recorded with vernacular form
149
# Return the +vernacular+ value of each extracted scribe.
#
# NOTE(review): a method with this name and an identical body is already
# defined earlier in this module; Ruby keeps the later definition. One of
# the two definitions can be removed.
def extract_scribes_as_recorded_agr(record)
  extract_scribes(record).map { |scribe| scribe.vernacular }
end
152
+
153
+ # Extracts artists from the given record using the specified type and role.
154
+ #
155
+ # @param [Nokogiri::XML:Node] record the record to extract artists from
156
+ # @return [Array<DS::Extractor::Name>] an array of extracted artists
157
# Artists are the 700/710/711 names whose +$e+ contains 'artist' or
# 'illuminator'.
#
# NOTE(review): a method with this name and an identical body is already
# defined earlier in this module; Ruby keeps the later definition. One of
# the two definitions can be removed.
def extract_artists(record)
  extract_names(record, tags: [700, 710, 711], relators: %w[artist illuminator])
end
163
+
164
+ # Extracts authors from the given record.
165
+ #
166
+ # @param [Nokogiri::XML:Node] record the record to extract authors from
167
+ # @return [Array<DS::Extractor::Name>] an array of extracted authors
168
# Authors are every 1xx name plus the 7xx names whose +$e+ contains
# 'author'; returns whatever #extract_names returns for both queries,
# concatenated in that order.
def extract_authors(record)
  main_entries  = extract_names(record, tags: [100, 110, 111])
  added_entries = extract_names(record, tags: [700, 710, 711], relators: %w[author])
  main_entries + added_entries
end
172
+
173
# Always returns an empty list; the record is not inspected.
def extract_associated_agents(record)
  Array.new(0)
end
176
+
177
+ ##
178
+ # For the given record, extract the names as an array of arrays, including
179
+ # the concatenated name string (subfields, a, b, c, d) and, if present,
180
+ # the alternate graphical representation (AGR) and authority number (or
181
+ # URI).
182
+ #
183
+ # Each returned sub array will have three values: name, name AGR, URI.
184
+ #
185
+ # @param [Nokogiri::XML:Node] record a +<marc:record>+ node
186
+ # @param [Array<String>] tags the MARC field tag[s]
187
+ # @param [Array<String>] relators for +700$e+, +710$e+, a value[s] like 'former owner'
188
+ # @return [Array<Array<String>>]
189
# Convert each extracted name to its array form (+#to_a+).
def extract_recon_names(record, tags: [], relators: [])
  names = extract_names(record, tags: tags, relators: relators)
  names.map { |name| name.to_a }
end
192
+
193
+ # Extract names from the MARC XML record based on specified tags and relators.
194
+ #
195
+ # @param [Nokogiri::XML:Node] record the record to extract names from
196
+ # @param [Array<String>] tags the MARC field tag[s]
197
+ # @param [Array<String>] relators for +700$e+, +710$e+, values like 'former owner'
198
+ # @return [Array<DS::Extractor::Name>] an array of extracted names
199
# Build a DS::Extractor::Name for every datafield matched by the query
# built from +tags+ and +relators+. A blank role falls back to 'author'.
def extract_names(record, tags: [], relators: [])
  query = build_name_query(tags: tags, relators: relators)
  return [] if query.empty? # don't process nonsensical requests

  record.xpath(query).map do |datafield|
    as_recorded = extract_name_portion(datafield)
    found_role  = extract_role(datafield, relators: relators)

    DS::Extractor::Name.new(
      as_recorded: as_recorded,
      role:        found_role.blank? ? 'author' : found_role,
      vernacular:  extract_pn_agr(datafield),
      ref:         extract_authority_number(datafield)
    )
  end
end
217
+
218
+ ##
219
+ # Extract the alternate graphical representation of the name or return +''+.
220
+ #
221
+ # See MARC specification for 880 fields:
222
+ #
223
+ # * https://www.loc.gov/marc/bibliographic/bd880.html
224
+ #
225
+ # @see #build_name_query for details on query construction
226
+ #
227
+ # @param [Nokogiri::XML:Node] record a +<marc:record>+ node
228
+ # @param [Array<String>] tags the MARC field code[s]
229
+ # @param [Array<String>] relators for +700$e+, +710$e+, a value[s] like 'former owner'
230
# Extract the alternate graphical representation (AGR) of each name in the
# datafields selected by +tags+ and +relators+.
#
# @param record [Nokogiri::XML::Node] a +<marc:record>+ node
# @param tags [Array<String, Integer>] the MARC field tag[s]
# @param relators [Array<String>] required +$e+ relator value[s]
# @return [Array] one #extract_pn_agr result per matching datafield; +[]+
#   when no query can be built
def extract_names_as_recorded_agr(record, tags: [], relators: [])
  xpath = build_name_query(tags: tags, relators: relators)
  # Return an Array (not '') so callers can always concatenate the result
  # with +/+=, as #extract_authors_as_recorded_agr does.
  return [] if xpath.empty?

  record.xpath(xpath).map { |datafield| extract_pn_agr(datafield) }
end
238
+
239
+ ##
240
+ # Build names query tags and relators. Tags understood are +100+, +700+,
241
+ # and +710+. The +relators+ are used to require datafields based on the contents
242
+ # of a subfield code +e+ containing the specified value, like 'scribe':
243
+ #
244
+ # contains(./subfield[@code ='e'], 'scribe')
245
+ #
246
+ # For relators see section <strong>$e - Relator term</strong>, here:
247
+ #
248
+ # https://www.loc.gov/marc/bibliographic/bdx00.html
249
+ #
250
+ # To require the subfield not have a relator, pass +:none+ as the relator value.
251
+ #
252
+ # build_name_query tags: ['100'], relators: :none
253
+ #
254
+ # This will add the following to the query.
255
+ #
256
+ # not(./subfield[@code = 'e'])
257
+ #
258
+ # Note: In U. Penn manuscript catalog records, 700 and 710 fields that *do*
259
+ # *not* have a subfield code +e+ are associated authors.
260
+ #
261
+ # @param [Array<String>] tags the MARC field code[s]
262
+ # @param [Array<String>] relators for +700$e+, +710$e+, a value[s] like 'former owner'
263
+ # @return [String] the data field query string
264
# Build the XPath that selects name datafields by tag and, optionally, by
# the content of subfield +$e+.
#
# Both arguments may be a single value or an array; +nil+ entries are
# ignored. A relator of +:none+ (or the string 'none', any case) requires
# that the datafield have no subfield +$e+.
#
# NOTE(review): relator values are interpolated into single-quoted XPath
# string literals; a relator containing an apostrophe yields an invalid
# query.
#
# @param tags [Array<String, Integer>, String, Integer] the MARC field tag[s]
# @param relators [Array<String, Symbol>, String, Symbol] relator value[s]
# @return [String] the datafield query; +''+ when no tags are given
def build_name_query(tags: [], relators: [])
  # Normalize before testing for emptiness so a scalar tag is accepted.
  tag_list = [tags].flatten.compact.map(&:to_s)
  return '' if tag_list.empty? # don't process nonsensical requests

  tag_query  = tag_list.map { |t| "@tag = #{t}" }.join(' or ')
  conditions = "(#{tag_query})"

  relator_list = [relators].flatten.compact.map { |r| r.to_s.strip.downcase == 'none' ? :none : r }
  return "datafield[#{conditions}]" if relator_list.empty?
  return "datafield[#{conditions} and not(./subfield[@code = 'e'])]" if relator_list.include?(:none)

  # Use the normalized list here, too, so a scalar relator works.
  relator_query = relator_list.map { |r| "contains(./subfield[@code ='e'], '#{r}')" }.join(' or ')
  "datafield[#{conditions} and (#{relator_query})]"
end
283
+
284
+ ###
285
+ # Extract the PN from datafield, pulling subfields $a, $b, $c, $d.
286
+ #
287
+ # @param [Nokogiri::XML::Node] datafield the +marc:datafield+ node with the name
288
+ # @return [String]
289
# Collect subfields a, b, c and d of the datafield and clean the result
# with an empty terminator.
def extract_name_portion(datafield)
  raw_name = collect_subfields(datafield, codes: %w[a b c d])
  DS::Util.clean_string(raw_name, terminator: '')
end
294
+
295
+ ###
296
+ # Extract the role value, subfield +$e+, from the given datafield.
297
+ #
298
+ # @param [Nokogiri::XML::Node] datafield the +marc:datafield+ node with the name
299
+ # @return [String]
300
# Extract the role value, subfield +$e+, from the given datafield: the
# first +$e+ text that contains one of the +relators+.
#
# @param datafield [Nokogiri::XML::Node] the +marc:datafield+ node with the name
# @param relators [Array<String, Symbol>, String, Symbol] accepted relator
#   value[s]; an empty list or +:none+ means no role is extracted
# @return [String] the cleaned role, or +''+ when there is none
def extract_role(datafield, relators:)
  relators_list = [*relators]
  return '' if relators_list.empty? || relators_list.include?(:none)

  # Query $e once and reuse the result.
  df_roles = datafield.xpath('subfield[@code = "e"]/text()').map(&:text)
  # if there's no $e, stop processing
  return '' if df_roles.join.empty?

  # Regexp.union escapes each relator, so it is matched literally, as the
  # XPath contains() in #build_name_query does.
  rel_pattern = Regexp.union(relators_list.map(&:to_s))
  role = df_roles.find { |candidate| candidate.match?(rel_pattern) }
  return '' if role.nil? # no $e matches any relator

  DS::Util.clean_string(role, terminator: '')
end
311
+
312
+ #########################################################################
313
+ # Miscellaneous authority values
314
+ #########################################################################
315
+
316
+ ###
317
+ # Extract the language codes from controlfield 008 and datafield 041$a.
318
+ #
319
+ # @param [Nokogiri::XML::Node] record the marc:record node
320
+ # @return [Array<String>]
321
# Collect the language codes from controlfield 008 and every 041$a,
# dropping blank values and duplicates.
def extract_langs(record)
  # XPath substring() is 1-based: start 36, length 3 covers the 0-based
  # character positions 35-37 of the 008 field.
  code_from_008  = record.xpath("substring(controlfield[@tag='008']/text(), 36, 3)")
  codes_from_041 = record.xpath("datafield[@tag=041]/subfield[@code='a']").map { |subfield| subfield.text }

  # the 008 language is usually repeated in 041; uniq removes the duplicate
  ([code_from_008] + codes_from_041).select { |code| code.present? }.uniq
end
329
+
330
+ ##
331
+ # Extract the languages as recorded; default to the 546$a field; otherwise
332
+ # return the code values from controlfield 008 and 041$a.
333
+ #
334
+ # @param [Nokogiri::XML::Node] record the marc:record node
335
+ # @return [Array<String>]
336
# Return the +as_recorded+ value of each extracted language.
def extract_languages_as_recorded(record)
  extract_languages(record).map { |language| language.as_recorded }
end
339
+
340
# Build DS::Extractor::Language objects from the non-blank 546$a values;
# when there are none, fall back to the codes from #extract_langs.
def extract_languages(record)
  recorded = record.xpath("datafield[@tag=546]/subfield[@code='a']").filter_map do |node|
    text = DS::Util.clean_string(node.text, terminator: '')
    DS::Extractor::Language.new(as_recorded: text) if text.present?
  end
  return recorded if recorded.present?

  extract_langs(record).map { |code| DS::Extractor::Language.new(as_recorded: code) }
end
353
+
354
+ #########################################################################
355
+ # Genres and subjects
356
+ #########################################################################
357
+ ##
358
+ # Extract genre and form terms from MARC datafield 655 values, where the
359
+ # 655$2 value can be specified; e.g., +rbprov+, +aat+, +lcgft+.
360
+ #
361
+ # Set +sub2+ to +:all+ to extract all 655 terms
362
+ #
363
+ # @param [Nokogiri::XML::Node] record the MARC record
364
+ # @param [Boolean] uniq whether to return only unique terms; default: +true+
365
+ # @return [Array<String>] array of genre terms
366
# Return the +as_recorded+ value of every genre (all vocabularies,
# subfields joined with '--'); duplicates are removed unless +uniq+ is
# false.
def extract_genres_as_recorded(record, uniq: true)
  terms = extract_genres(record, sub_sep: '--', vocab: :all).map { |genre| genre.as_recorded }
  return terms unless uniq

  terms.uniq
end
371
+
372
+ ##
373
+ # Return an array of strings of formatted subjects (600, 610, 611, 630,
374
+ # 647, 648, 650, and 651). Subjects values are separated by '--':
375
+ #
376
+ # <datafield ind1="1" ind2="0" tag="600">
377
+ # <subfield code="a">Cicero, Marcus Tullius</subfield>
378
+ # <subfield code="x">Spurious and doubtful works.</subfield>
379
+ # </datafield>
380
+ #
381
+ # # => "Cicero, Marcus Tullius--Spurious and doubtful works"
382
+ #
383
+ # Subfields with codes 'b', 'c', 'd', 'p', 'q', and 't' are appended to
384
+ # the preceding subfield:
385
+ #
386
+ # <datafield ind1=" " ind2="7" tag="647">
387
+ # <subfield code="a">Conspiracy of Catiline</subfield>
388
+ # <subfield code="c">(Rome :</subfield>
389
+ # <subfield code="d">65-62 B.C.)</subfield>
390
+ # <subfield code="2">fast</subfield>
391
+ # <subfield code="0">(OCoLC)fst01352536</subfield>
392
+ # </datafield>
393
+ #
394
+ # # => "Conspiracy of Catiline (Rome : 65-62 B.C.)"
395
+ #
396
+ # @param [Nokogiri::XML::Node] record the MARC record
397
+ # @return [Array<DS::Extractor::Subject>] an array of formatted subjects strings
398
# Build one DS::Extractor::Subject per datafield with one of the given
# tags. Terms are joined with '--'; subfields b, c, d, p, q and t are
# appended to the preceding term; +$e+ and +$w+ are skipped; +$2+
# overrides the vocabulary taken from indicator 2; +$0+ values are
# collected as authority URIs.
#
# @param record [Nokogiri::XML::Node] the MARC record
# @param tags [Array<String, Integer>] the MARC field tag[s]
# @return [Array<DS::Extractor::Subject>] the extracted subjects
# @raise [RuntimeError] if no tags are given
def extract_subject_by_tags(record, tags: [])
  tag_list = [*tags]
  raise "No tags given for subject extraction: #{tags.inspect}" if tag_list.empty?

  sep = '--'
  tag_query = tag_list.map { |tag| "@tag=#{tag}" }.join(' or ')
  record.xpath("datafield[#{tag_query}]").map do |datafield|
    values = Hash.new { |hash, key| hash[key] = [] }
    vocab = datafield.xpath('./@ind2').text
    datafield.xpath('subfield').each do |subfield|
      subfield_text = DS::Util.clean_string(subfield.text)
      subfield_code = subfield.xpath('./@code').text
      case subfield_code
      when 'e', 'w'
        # don't include these in the formatted subject
      when 'b', 'c', 'd', 'p', 'q', 't'
        if values[:terms].empty?
          # no preceding value to append to: start a new term rather
          # than fail on nil
          values[:terms] << subfield_text
          values[:codes] << subfield_code
        else
          # append these to the preceding value
          values[:terms][-1] += " #{subfield_text}"
          values[:codes][-1] += ";#{subfield_code}"
        end
      when %r{\A[[:alpha:]]\z}
        # any other codes: a, g, v, x, y, z
        values[:terms] << subfield_text
        values[:codes] << subfield_code
      when '2'
        vocab = subfield.text
      when '0'
        values[:urls] << subfield_text
      end
    end

    DS::Extractor::Subject.new(
      as_recorded:          DS::Util.clean_string(values[:terms].join(sep), terminator: ''),
      subfield_codes:       DS::Util.clean_string(values[:codes].join(sep), terminator: ''),
      source_authority_uri: DS::Util.clean_string(values[:urls].join(sep), terminator: ''),
      vocab:                vocab
    )
  end
end
439
+
440
+ # Extracts named subjects as recorded from the given record.
441
+ #
442
+ # @param [Nokogiri::XML:Node] record the record to extract named subjects from
443
+ # @return [Array<String>] the extracted named subjects as recorded
444
# Return the +as_recorded+ value of each named subject.
def extract_named_subjects_as_recorded(record)
  extract_named_subjects(record).map { |subject| subject.as_recorded }
end
447
+
448
+
449
+ # Extract named subjects from the MARC XML record based on specified tags.
450
+ #
451
+ # @param [Nokogiri::XML:Node] record the record to extract named subjects from
452
+ # @return [Array<DS::Extractor::Subject>] an array of extracted named subjects
453
# Named subjects come from datafields 600, 610, 611, 630 and 647.
def extract_named_subjects(record)
  named_subject_tags = [600, 610, 611, 630, 647]
  extract_subject_by_tags(record, tags: named_subject_tags)
end
456
+
457
+ # Extracts subjects as recorded from the given record.
458
+ #
459
+ # @param [Nokogiri::XML:Node] record the record to extract subjects from
460
+ # @return [Array<String>] the extracted subjects as recorded
461
# Return the +as_recorded+ value of each subject.
def extract_subjects_as_recorded(record)
  extract_subjects(record).map { |subject| subject.as_recorded }
end
464
+
465
+ # Extracts subjects from the given record based on specified tags.
466
+ #
467
+ # @param [Nokogiri::XML:Node] record the record to extract subjects from
468
+ # @return [Array<DS::Extractor::Subject>] an array of extracted subjects
469
# Subjects come from datafields 648, 650 and 651.
def extract_subjects(record)
  subject_tags = [648, 650, 651]
  extract_subject_by_tags(record, tags: subject_tags)
end
472
+
473
+ # Extracts all subjects from the given record, including named subjects and subjects.
474
+ #
475
+ # @param [Nokogiri::XML:Node] record the record to extract all subjects from
476
+ # @return [Array<DS::Extractor::Subject>] the extracted all subjects
477
# Named subjects first, then the remaining subjects.
def extract_all_subjects(record)
  named = extract_named_subjects(record)
  other = extract_subjects(record)
  named + other
end
480
+
481
+ # Extracts all subjects as recorded from the given record.
482
+ #
483
+ # @param [Nokogiri::XML:Node] record the record to extract all subjects from
484
+ # @return [Array<String>] the extracted all subjects as recorded
485
# Return the +as_recorded+ value of every subject, named or not.
def extract_all_subjects_as_recorded(record)
  extract_all_subjects(record).map { |subject| subject.as_recorded }
end
488
+
489
+
490
+ ##
491
+ # Extract genre terms for reconciliation CSV output.
492
+ #
493
+ # Returns a two-dimensional array, each row is a genre term; and each row has
494
+ # three columns: term, vocab, and authority number.
495
+ #
496
+ # @param [Nokogiri::XML:Node] record a +<MARC_RECORD>+ node
497
+ # @return [Array<Array>] an array of arrays of values
498
# Convert each extracted genre to its array form (+#to_a+).
def extract_recon_genres(record, sub_sep: '--')
  genres = extract_genres(record, sub_sep: sub_sep)
  genres.map { |genre| genre.to_a }
end
501
+
502
+ # Extracts genres from the given MARC XML record.
503
+ #
504
+ # @param [Nokogiri::XML:Node] record the MARC XML record to extract genres from
505
+ # @param [String] sub_sep (default: '--') the separator for joining subfields
506
+ # @param [Symbol, String] vocab (default: :all) the vocab type to extract
507
+ # @return [Array<DS::Extractor::Genre>] an array of extracted genres
508
# Build a DS::Extractor::Genre for each 655 datafield that has a
# non-blank term and, unless +vocab+ is +:all+, whose vocabulary equals
# +vocab+. Subfields a, b, c, v, z, y and x are joined with +sub_sep+.
def extract_genres(record, sub_sep: '--', vocab: :all)
  record.xpath('datafield[@tag = 655]').each_with_object([]) do |datafield, genres|
    raw_term    = collect_subfields(datafield, codes: %w[a b c v z y x], sub_sep: sub_sep)
    as_recorded = DS::Util.clean_string(raw_term, terminator: '')
    term_vocab  = extract_vocabulary(datafield)
    source_authority_uri = extract_authority_number(datafield)

    next if !as_recorded.present?
    next if vocab != :all && vocab != term_vocab

    genres << DS::Extractor::Genre.new(
      as_recorded:          as_recorded,
      vocab:                term_vocab,
      source_authority_uri: source_authority_uri
    )
  end
end
525
+
526
+ # Extracts the genre vocabulary from the given MARC XML record.
527
+ #
528
+ # @param [Nokogiri::XML::Node] record the MARC XML record to extract genre vocabulary from
529
+ # @return [Array<Symbol>] an array of extracted genre vocabularies
530
# Return the +vocab+ value of each extracted genre.
def extract_genre_vocabulary(record)
  extract_genres(record).map { |genre| genre.vocab }
end
533
+
534
+ # Extracts reconstructed subjects from the given record.
535
+ #
536
+ # @param [Nokogiri::XML::Node] record the record to extract reconstructed subjects from
537
+ # @return [Array] the extracted reconstructed subjects
538
# Convert every subject (named or not) to its array form (+#to_a+).
def extract_recon_subjects(record)
  extract_all_subjects(record).map { |subject| subject.to_a }
end
541
+
542
+ #########################################################################
543
+ # Place of production
544
+ #########################################################################
545
+
546
+ ##
547
+ # Look for a place as recorded. Look first at 264$a, then 260$a; return ''
548
+ # when no value is found
549
+ # @param [Nokogiri::XML::Node] record the MARC record
550
+ # @return [Array<String>] the place name or []
551
# Extract the places of production as recorded from every 260$a and
# 264$a text node.
#
# @param record [Nokogiri::XML::Node] the MARC record
# @return [Array<String>] the cleaned place names; +[]+ when none is found
def extract_production_places_as_recorded(record)
  xpath = "datafield[@tag=260 or @tag=264]/subfield[@code='a']/text()"
  # filter_map (instead of map) keeps nil out of the result when a text
  # node is blank
  record.xpath(xpath).filter_map do |pn|
    next if pn.to_s.strip.empty?

    DS::Util.clean_string(pn.text, terminator: '')
  end
end
557
+
558
+ ##
559
+ # Extract the places of production MARC +260$a+ for reconciliation CSV
560
+ # output.
561
+ #
562
+ # Returns a two-dimensional array, each row is a place; and each row has
563
+ # one column: place name; for example:
564
+ #
565
+ # [["Austria"],
566
+ # ["Germany"],
567
+ # ["France (?)"]]
568
+ #
569
+ # @param [Nokogiri::XML:Node] record a +<marc:record>+ node
570
+ # @return [Array<Array>] an array of arrays of values
571
# Convert each extracted place to its array form (+#to_a+).
def extract_recon_places(record)
  extract_places(record).map { |place| place.to_a }
end
574
+
575
# Build a DS::Extractor::Place for every non-blank 260$a and 264$a text
# node.
#
# @param record [Nokogiri::XML::Node] the MARC record
# @return [Array<DS::Extractor::Place>] the extracted places
def extract_places(record)
  xpath = "datafield[@tag=260 or @tag=264]/subfield[@code='a']/text()"
  # filter_map (instead of map) keeps nil out of the result when a text
  # node is blank; #extract_recon_places calls #to_a on every element
  record.xpath(xpath).filter_map do |pn|
    next if pn.to_s.blank?

    as_recorded = DS::Util.clean_string(pn.text, terminator: '')
    DS::Extractor::Place.new(as_recorded: as_recorded)
  end
end
583
+
584
+ #########################################################################
585
+ # Date of production
586
+ #########################################################################
587
+
588
+ ###
589
+ # Extract the encoded date from controlfield 008.
590
+ #
591
+ # Follows
592
+ #
593
+ # - https://www.loc.gov/marc/bibliographic/bd008a.html
594
+ # - https://www.loc.gov/marc/bibliographic/bd046.html
595
+ #
596
+ # Returns an array containing a pair of dates or a single date,
597
+ # or an empty array.
598
+ #
599
+ # The following date types have appeared in MARC records
600
+ # contributed to DS as of 2024-02-27 and are handled here:
601
+ #
602
+ # b - No dates given; B.C. date involved
603
+ # - 'b '
604
+ # - date is taken from 046$b, and if present $d or $e
605
+ # - See: https://www.loc.gov/marc/bibliographic/bd046.html
606
+ #
607
+ #
608
+ # e - Detailed date
609
+ # - 'e11200520', 'e139403 x', 'e164509 t', 'e167707 y',
610
+ # 'e187505 s'
611
+ # - the first date part is returned a single year
612
+ #
613
+ # i - Inclusive dates of collection
614
+ # - 'i07500800', 'i08000830', 'i1000 '
615
+ # - the first and -- if present -- second date part are
616
+ # returned as two years
617
+ #
618
+ # k - Range of years of bulk of collection
619
+ # - 'k15121716'
620
+ # - the first and second date parts are returned as two years
621
+ #
622
+ # m - Multiple dates
623
+ # - 'm0618193u', 'm07390741', 'm10751200', 'm16uu1637',
624
+ # 'm17uu1900'
625
+ # - the first and second date parts are returned as two years
626
+ # - see note below on replacement of u's
627
+ #
628
+ # n - Dates unknown
629
+ # - 'nuuuuuuuu'
630
+ # - no date returned
631
+ #
632
+ # p - Date of distribution/release/issue and
633
+ # production/recording session when different
634
+ # - 'p1400 '
635
+ # - the first and -- if present -- second date part are
636
+ # returned as two years
637
+ #
638
+ # q - Questionable date
639
+ # q - 'q01000299', 'q0979 ', 'q09910992', 'q10001099',
640
+ # 'q1300 ', 'q13uu14uu', 'q13uu1693', 'q14011425',
641
+ # 'q1425uuuu', 'q1450 ', 'q1460 ', 'q14uu14uu',
642
+ # 'quuuu1597'
643
+ # - the first and -- if present -- second date part are
644
+ # returned as two years
645
+ # - if the second date part is 'uuuu', the first date part is
646
+ # returned as year; ‘q1425uuuu’ => 1425
647
+ # - if the first date part is 'uuuu', the second date part is
648
+ # returned as year; ‘quuuu1597’ => 1597
649
+ # - for partial date parts with u's, see the note below
650
+ #
651
+ # r - Reprint/reissue date and original date
652
+ # - 'r11751199'
653
+ # - the first date part is returned a single year
654
+ #
655
+ # s - Single known date/probable date
656
+ # s - 's1171 ', 's1171 xx ', 's1192 ua ', 's1250||||',
657
+ # 's1286 iq ', 's1315 sy ', 's1366 is ', 's1436 gw ',
658
+ # 's1450 it ', 's1470 ly ', 's1470 tu ', 's1470 uuu',
659
+ # 's1497 enk', 's1595 sp ', 's19uu '
660
+ # - the first date part is returned a single year
661
+ # - see note below on replacement of u's
662
+ #
663
+ # | - No attempt to code
664
+ # - '|12501300'
665
+ # - this appears to be miscoding
666
+ # - nevertheless, '|' coded records will follow the default
667
+ # rule: date part one is returned a single year
668
+ #
669
+ # The following cases, so far unrepresented in contributor data,
670
+ # will follow the default rule: date part one will be returned
671
+ # as a single year.
672
+ #
673
+ # c - Continuing resource currently published
674
+ # d - Continuing resource ceased publication
675
+ # t - Publication date and copyright date
676
+ # u - Continuing resource status unknown
677
+ #
678
+ # Note on the replacement of u's in partial year dates
679
+ #
680
+ # - Where u's appear in the first date they are replace by 0;
681
+ # thus, 'q13uu1693' => '1300, 1693'
682
+ # - Where u's appear in the second date they are replace by 9;
683
+ # thus, 'q14uu14uu' => '1400, 1499'
684
+ #
685
+ # For MARC partial dates see Date 1 and Date 2 documentation
686
+ # here
687
+ #
688
+ # https://www.loc.gov/marc/bibliographic/bd008a.html
689
+ #
690
+ # @param [Nokogiri::XML::Node] record the +marc:record+ node
691
+ # @return [Array]
692
def extract_date_range record, range_sep:
  # The coded date lives in the 008 controlfield; e.g.,
  #
  #   "220518q14001500xx 000 0 d"
  ctrl_008 = record.at_xpath("controlfield[@tag='008']")
  return [] unless ctrl_008

  # Nine characters from index 6: the date type code followed by the
  # two four-character date parts; e.g., 'q14001500'
  date_str = ctrl_008.text[6, 9]
  # A truncated 008 yields nil or '' here, so there is nothing to extract
  return [] if date_str.blank?

  code = date_str[0]
  # u's in the first date part become 0; u's in the second become 9
  part1 = extract_date_part(date_str, 1, 4)&.tr('u', '0')
  part2 = extract_date_part(date_str, 5, 8)&.tr('u', '9')

  # keep only non-blank dates other than '9999'
  years = compile_dates(record, code, part1, part2).select { |year|
    year.present? && year != '9999'
  }
  return [] if years.empty?

  # the range is returned as a single joined string in an array
  [years.join(range_sep)]
end
714
+
715
# Select the date values to report for a MARC 008 date type +code+.
#
# - 'i', 'k', 'm', 'p', 'q' and '|' yield both date parts
# - 'n' yields no dates
# - 'b' defers to +handle_bce_date+, which reads its dates from +record+
# - every other code yields the first date part only
#
# See https://www.loc.gov/marc/bibliographic/bd008.html
#
# @param [Nokogiri::XML::Node] record the +marc:record+ node
# @param [String] code the MARC 008 date type code
# @param [String, nil] part1 the first date part
# @param [String, nil] part2 the second date part
# @return [Array] the selected date values
def compile_dates record, code, part1, part2
  return [] if code == 'n'
  return handle_bce_date(record) if code == 'b'
  return [part1, part2] if %w[i k m p q |].include?(code)

  [part1]
end
735
+
736
# Compile BCE dates from the 046 field of +record+.
#
# Returns an empty array +[]+ if the record lacks a 046$b (BCE date 1).
# Otherwise date 1 is returned with a leading '-', followed by a second
# date when one is present: 046$d (BCE date 2) with a leading '-' or,
# failing that, 046$e (CE date 2) unchanged.
#
# See: https://www.loc.gov/marc/bibliographic/bd046.html
#
# @param [Nokogiri::XML::Node] record the MARC XML record
# @return [Array<String>] zero, one or two date strings
def handle_bce_date record
  bce_date1 = record.at_xpath('datafield[@tag=046]/subfield[@code="b"]/text()').to_s
  # stop if there's no BCE date 1
  return [] if bce_date1.blank?

  bce_date2 = record.at_xpath('datafield[@tag=046]/subfield[@code="d"]/text()').to_s
  return ["-#{bce_date1}", "-#{bce_date2}"] if bce_date2.present?

  ce_date2 = record.at_xpath('datafield[@tag=046]/subfield[@code="e"]/text()').to_s
  return ["-#{bce_date1}", ce_date2] if ce_date2.present?

  ["-#{bce_date1}"]
end
766
+
767
# Slice one date part out of a MARC 008 date string.
#
# The slice is accepted only if it begins with a digit followed by at
# least one digit or 'u'; otherwise +nil+ is returned. Leading zeros are
# removed when they are followed by a non-zero digit ('0979' => '979').
#
# @param [String] datestring the input datestring
# @param [Integer] ndx1 the starting index for extraction
# @param [Integer] ndx2 the length of the substring to extract
# @return [String, nil] the extracted part, or +nil+ if it is not a date
def extract_date_part datestring, ndx1, ndx2
  segment = datestring[ndx1, ndx2]
  return nil unless /^\d[\du]+/.match?(segment)

  segment.sub!(/^0+/, '') if /^0+[1-9]/.match?(segment)
  segment
end
786
+
787
##
# Look for a date as recorded. Sources are tried in order and the first
# non-empty value is returned: 260$c/$d, then 264$c, then 245$f and,
# finally, the encoded date from the 008 field.
#
# @param [Nokogiri::XML::Node] record the +marc:record+ node
# @return [Array<String>] a one-element array; the element is an empty
#   string when no source yields a date
def extract_production_date_as_recorded record
  # Note that MARC does not specify a subfield '260$d':
  #
  #     https://www.loc.gov/marc/bibliographic/bd260.html
  #
  # However Cornell use $d to continue 260$c
  imprint_xpaths = [
    "datafield[@tag=260]/subfield[@code='c' or @code='d']/text()",
    "datafield[@tag=264]/subfield[@code='c']/text()"
  ]
  imprint_xpaths.each do |xpath|
    recorded = record.xpath(xpath).map { |node|
      DS::Util.clean_string node.text.strip
    }.join(' ').strip
    return [recorded] unless recorded.empty?
  end

  # 245 is the title field but can have a date in $f
  #
  # see: https://www.loc.gov/marc/bibliographic/bd245.html
  #
  # Cornell uses 245$f in records that also lack 260 or 264; see
  # '4600 Bd. Ms. 176':
  #
  #   https://catalog.library.cornell.edu/catalog/6382455/librarian_view
  #
  #     <datafield ind1="0" ind2="0" tag="245">
  #       <subfield code="a">Shah-nameh,</subfield>
  #       <subfield code="f">1600s.</subfield>
  #     </datafield>
  title_date = record.xpath("datafield[@tag=245]/subfield[@code='f']").text
  return [DS::Util.clean_string(title_date)] unless title_date.strip.empty?

  # last resort: the coded date range from the 008 field
  [extract_date_range(record, range_sep: '-').join('_').strip]
end
826
+
827
+ #########################################################################
828
+ # Titles
829
+ #########################################################################
830
+
831
# Titles for reconciliation output.
#
# Delegates to +extract_titles+ and converts the result with +to_a+.
#
# @param [Nokogiri::XML::Node] record the record to extract titles from
# @return [Array] the result of +extract_titles+ as an array
def extract_recon_titles record
  titles = extract_titles(record)
  titles.to_a
end
838
+
839
# Build the title entry for +record+.
#
# Combines the title as recorded (245$a/$b), its vernacular form, the
# uniform titles (240$a, 130$a) and their vernacular forms into a single
# DS::Extractor::Title.
#
# @param [Nokogiri::XML::Node] record the record to extract titles from
# @return [Array<DS::Extractor::Title>] a one-element array holding the title
def extract_titles record
  extractor = DS::Extractor::MarcXmlExtractor

  recorded           = title_as_recorded(record)
  vernacular         = DS::Util.clean_string(extractor.title_as_recorded_agr(record, 245), terminator: '')
  uniform            = DS::Util.clean_string(extractor.uniform_titles_as_recorded(record), terminator: '')
  uniform_vernacular = DS::Util.clean_string(extractor.uniform_title_as_recorded_agr(record), terminator: '')

  title = DS::Extractor::Title.new(
    as_recorded:              recorded,
    vernacular:               vernacular,
    uniform_title:            uniform,
    uniform_title_vernacular: uniform_vernacular
  )
  [title]
end
856
+
857
# Vernacular forms of the titles as recorded.
#
# @param [Nokogiri::XML::Node] record the record to extract titles from
# @return [Array] the +vernacular+ value of each extracted title
def extract_titles_as_recorded_agr record
  extract_titles(record).map { |title| title.vernacular }
end
864
+
865
# The title as recorded: every 245$a and 245$b value, cleaned with
# DS::Util.clean_string and joined with '; '.
#
# @param [Nokogiri::XML::Node] record the record to extract the title from
# @return [String] the joined title
def title_as_recorded record
  subfields = record.xpath("datafield[@tag=245]/subfield[@code='a' or @code='b']")
  cleaned = subfields.map do |subfield|
    DS::Util.clean_string(subfield.text, terminator: '')
  end
  cleaned.join('; ')
end
875
+
876
# The vernacular (880) form of the title held in datafield +tag+.
#
# The $6 linkage of the +tag+ field supplies the part after the last '-'
# (e.g. '01' from '880-01'); the 880 field whose $6 contains
# '<tag>-<that part>' supplies the title from its $a. Square brackets are
# removed from the value.
#
# @param [Nokogiri::XML::Node] record the record to extract the title from
# @param [Integer] tag the tag to use for extraction
# @return [String] the vernacular title, or +''+ if the field has no $6
def title_as_recorded_agr record, tag
  linkage = record.xpath("datafield[@tag=#{tag}]/subfield[@code='6']").text
  if linkage.empty?
    ''
  else
    occurrence = linkage.split('-').last
    xpath = "datafield[@tag='880' and contains(./subfield[@code='6'], '#{tag}-#{occurrence}')]/subfield[@code='a']"
    vernacular = record.xpath(xpath).text.delete('[]')
    DS::Util.clean_string(vernacular)
  end
end
888
+
889
# Titles as recorded.
#
# @param [Nokogiri::XML::Node] record the record to extract titles from
# @return [Array] the +as_recorded+ value of each extracted title
def extract_titles_as_recorded record
  extract_titles(record).map { |title| title.as_recorded }
end
896
+
897
# Uniform titles as recorded, taken from 240$a and then 130$a.
#
# Empty values are dropped; the rest are cleaned with
# DS::Util.clean_string and joined with '|'.
#
# @param [Nokogiri::XML::Node] record the record to extract uniform titles from
# @return [String] the uniform titles joined by '|'
def uniform_titles_as_recorded record
  found = [240, 130].map { |tag|
    record.xpath("datafield[@tag=#{tag}]/subfield[@code='a']").text
  }
  present = found.reject { |title| title.empty? }
  present.map { |title| DS::Util.clean_string(title, terminator: '') }.join('|')
end
908
+
909
# Uniform titles as recorded.
#
# @param [Nokogiri::XML::Node] record the record to extract uniform titles from
# @return [Array] the +uniform_title+ value of each extracted title
def extract_uniform_titles_as_recorded record
  extract_titles(record).map { |title| title.uniform_title }
end
916
+
917
+
918
# Vernacular forms of the uniform titles as recorded.
#
# @param [Nokogiri::XML::Node] record the record to extract uniform titles from
# @return [Array] the +uniform_title_vernacular+ value of each extracted title
def extract_uniform_titles_as_recorded_agr record
  extract_titles(record).map { |title| title.uniform_title_vernacular }
end
925
+
926
# Vernacular (880) forms of the uniform titles in fields 240 and 130.
#
# Each form is looked up with +title_as_recorded_agr+; empty results are
# dropped and the rest are cleaned and joined with '|'.
#
# @param [Nokogiri::XML::Node] record the MARC XML record to extract uniform titles from
# @return [String] the vernacular uniform titles joined by '|'
def uniform_title_as_recorded_agr record
  vernaculars = [240, 130].map { |tag| title_as_recorded_agr(record, tag) }
  present = vernaculars.reject { |title| title.empty? }
  present.map { |title| DS::Util.clean_string(title) }.join('|')
end
937
+
938
+ #########################################################################
939
+ # Physical description
940
+ #########################################################################
941
+
942
# The physical description of +record+; currently the same value that
# +extract_extent+ returns.
#
# @param [Nokogiri::XML::Node] record the record to extract the physical description from
# @return [Array<String>] the result of +extract_extent+
def extract_physical_description record
  description = extract_extent(record)
  description
end
949
+
950
# The first material as recorded for +record+, as a string.
#
# @param [Nokogiri::XML::Node] record the MARC XML record to extract material from
# @return [String] the +as_recorded+ value of the first extracted
#   material; +''+ when there is none
def extract_material_as_recorded record
  first_material = extract_materials(record).first
  first_material&.as_recorded.to_s
end
957
+
958
# Materials for +record+, one per non-blank 300$b value.
#
# @param [Nokogiri::XML::Node] record the MARC XML record to extract materials from
# @return [Array<DS::Extractor::Material>] an array of extracted materials
def extract_materials record
  values = DS::Extractor::MarcXmlExtractor.collect_datafields(record, tags: 300, codes: 'b')
  values.select { |value| value.present? }.map do |value|
    DS::Extractor::Material.new(as_recorded: value)
  end
end
970
+
971
# Extent statements for +record+, one per 300 field.
#
# The non-empty $a, $b and $c values of each 300 field are joined with a
# space; fields that yield only whitespace are skipped. Each remaining
# value is cleaned and prefixed with 'Extent: '.
#
# @param [Nokogiri::XML::Node] record the MARC XML record to extract extent from
# @return [Array<String>] an array of extracted extents
def extract_extent record
  record.xpath("datafield[@tag=300]").filter_map do |datafield|
    subfields = datafield.xpath("subfield[@code = 'a' or @code = 'b' or @code = 'c']")
    extent = subfields.map(&:text).reject(&:empty?).join(' ')
    next if extent.strip.empty?

    "Extent: #{DS::Util.clean_string extent}"
  end
end
985
+
986
+ #########################################################################
987
+ # Notes
988
+ #########################################################################
989
##
# Extract notes from +record+.
#
# Returns the text of every +500$a+ and +561$a+ subfield, with runs of
# whitespace collapsed to a single space and the result cleaned with
# DS::Util.clean_string.
#
# NOTE(review): no filtering by note prefix ('Binding:', 'Layout:', etc.)
# happens here -- confirm that callers do not expect such notes to be
# excluded.
#
# @param [Nokogiri::XML::Node] record a +<MARC_RECORD>+ node
# @return [Array<String>] an array of note strings
def extract_notes record
  note_nodes = record.xpath("datafield[@tag=500 or @tag=561]/subfield[@code='a']/text()")
  note_nodes.map do |node|
    collapsed = node.text.strip.gsub(%r{\s+}, ' ')
    DS::Util.clean_string(collapsed)
  end
end
1007
+
1008
###
# The authority number of +datafield+: the text of its +$0+ subfield(s).
#
# @param [Nokogiri::XML::Node] datafield the +marc:datafield+ node with the name
# @return [String] the +$0+ text; +''+ when the subfield is absent
def extract_authority_number datafield
  authority_nodes = datafield.xpath("./subfield[@code='0']")
  authority_nodes.text
end
1017
+
1018
##
# Extract the alternate graphical representation of the name or return +''+.
#
# See MARC specification for 880 fields:
#
# * https://www.loc.gov/marc/bibliographic/bd880.html
#
# Input will look like this:
#
#     <marc:datafield ind1="1" ind2=" " tag="100">
#       <marc:subfield code="6">880-01</marc:subfield>
#       <marc:subfield code="a">Urmawī, ʻAbd al-Muʼmin ibn Yūsuf,</marc:subfield>
#       <marc:subfield code="d">approximately 1216-1294.</marc:subfield>
#     </marc:datafield>
#     <!-- ... -->
#     <marc:datafield ind1="1" ind2=" " tag="880">
#       <marc:subfield code="6">100-01//r</marc:subfield>
#       <marc:subfield code="a">ارموي، عبد المؤمن بن يوسف،</marc:subfield>
#     </marc:datafield>
#
# The part of the +$6+ linkage after the last '-' ('01' above) and the
# field's own tag identify the sibling 880 field, which is handed to
# +extract_name_portion+.
#
# @param [Nokogiri::XML::Node] datafield the main data field @tag = '100', '700', etc.
# @return [String] +''+ when the field has no +$6+; otherwise the result
#   of +extract_name_portion+ for the linked 880 field
def extract_pn_agr datafield
  linkage = datafield.xpath("subfield[@code='6']").text
  if linkage.empty?
    ''
  else
    tag = datafield.xpath('./@tag').text
    occurrence = linkage.split('-').last
    xpath = "./parent::record/datafield[@tag='880' and contains(./subfield[@code='6'], '#{tag}-#{occurrence}')]"
    extract_name_portion(datafield.xpath(xpath))
  end
end
1049
+
1050
# The cataloging convention of +record+: the text of its 040$e subfield(s).
#
# @param [Nokogiri::XML::Node] record the +marc:record+ node
# @return [String] the 040$e text; +''+ when absent
def extract_cataloging_convention record
  convention_nodes = record.xpath('datafield[@tag=040]/subfield[@code="e"]/text()')
  convention_nodes.text
end
1053
+
1054
##
# Extract datafield values with authority numbers (URL) when present
# for reconciliation CSV output.
#
# Each matching datafield yields a two-element array: the cleaned, joined
# subfield values and the text of the field's +$0+ subfield (+''+ when the
# field has none).
#
# @param [Nokogiri::XML::Node] record a +<marc:record>+ node
# @param [Array<String>] tags the MARC datafield tag(s)
# @param [Array<String>] codes the MARC subfield code(s)
# @param [String] sub_sep separator for joining subfield values
# @return [Array<Array<String>>] an array of +[value, number]+ pairs;
#   empty when no tags are given
def collect_recon_datafields record, tags: [], codes: [], sub_sep: ' '
  _tags = [tags].flatten.map(&:to_s)
  # without tags the predicate would be empty, which is not valid XPath
  return [] if _tags.empty?

  tag_query = _tags.map { |t| "@tag = #{t}" }.join ' or '
  record.xpath("datafield[#{tag_query}]").map { |datafield|
    value = collect_subfields datafield, codes: codes, sub_sep: sub_sep
    value = DS::Util.clean_string value, terminator: ''
    # subfields are identified by their @code attribute, not @tag
    number = datafield.xpath("subfield[@code='0']").text
    [value, number]
  }
end
1073
+
1074
##
# Extract subfield values from the datafields named by +tags+.
#
# Each matching datafield contributes one string: its +codes+ subfields
# joined with +sub_sep+ and cleaned with DS::Util.clean_string.
#
# @param [Nokogiri::XML::Node] record a +<marc:record>+ node
# @param [Array<String>] tags the MARC datafield tag(s)
# @param [Array<String>] codes the MARC subfield code(s)
# @param [String] field_sep accepted but not used by this method
# @param [String] sub_sep separator for joining subfield values
# @return [Array<String>] one value per matching datafield
def collect_datafields record, tags: [], codes: [], field_sep: '|', sub_sep: ' '
  predicate = [tags].flatten.map { |tag| "@tag = #{tag}" }.join(' or ')
  record.xpath("datafield[#{predicate}]").map do |datafield|
    joined = collect_subfields(datafield, codes: codes, sub_sep: sub_sep)
    DS::Util.clean_string(joined, terminator: '')
  end
end
1091
+
1092
##
# The vocabulary of a term datafield.
#
# A second indicator of '0' means 'lcsh'. Otherwise the +$2+ subfield is
# used, with one trailing '.' removed.
#
# @param [Nokogiri::XML::Node] datafield the term datafield
# @return [String, nil] the vocabulary; +nil+ when +$2+ is blank
def extract_vocabulary datafield
  if datafield['ind2'] == '0'
    'lcsh'
  else
    source = datafield.xpath("subfield[@code=2]").text
    source.present? ? source.chomp('.') : nil
  end
end
1101
+
1102
# Join the values of the +codes+ subfields of +datafield+.
#
# Empty subfield values are skipped; the joined string is cleaned with
# DS::Util.clean_string.
#
# @param [Nokogiri::XML::Node] datafield the datafield to collect subfields from
# @param [Array<String>] codes the MARC subfield code(s) to collect
# @param [String] sub_sep the separator for joining subfield values
# @return [String] the concatenated subfield values
def collect_subfields datafield, codes: [], sub_sep: ' '
  # +codes+ may be a single value or an array; e.g.,
  # ['a', 'b'] => "@code = 'a' or @code = 'b'"
  predicate = [codes].flatten.map { |code| "@code = '#{code}'" }.join(' or ')
  values = datafield.xpath("subfield[#{predicate}]").map { |subfield| subfield.text }
  joined = values.reject { |value| value.empty? }.join(sub_sep)
  DS::Util.clean_string(joined)
end
1116
+
1117
# The 001 control number of +record+.
#
# The value comes from +extract_mmsid+; non-empty values are joined with '|'.
#
# @param [Nokogiri::XML::Node] record the MARC XML record to extract the control number from
# @param [Object] holdings_file (optional) accepted but not used by this method
# @return [String] the control number; +''+ when the record has none
def extract_001_control_number record, holdings_file = nil
  # the MMS ID is the only identifier collected
  identifiers = [extract_mmsid(record)]
  identifiers.reject { |id| id.empty? }.join('|')
end
1129
+
1130
# The MMS ID of +record+: the text of its 001 controlfield.
#
# @param record [Nokogiri::XML::Node] the MARC XML record to extract the MMS ID from
# @return [String] the 001 text; +''+ when absent
def extract_mmsid record
  control_nodes = record.xpath("controlfield[@tag=001]")
  control_nodes.text
end
1137
+
1138
# Acknowledgments for +record+. This method always returns an empty
# array; +record+ is not consulted.
#
# @param [Nokogiri::XML::Node] record the record to extract acknowledgments from
# @return [Array] always empty
def extract_acknowledgments record
  Array.new
end
1145
+
1146
##
# Return an array of 500$a values that begin with +name:+ (+name+
# followed by a colon +:+). The match is case-insensitive. The name
# prefix is removed if +strip_name+ is +true+; it's +false+ by default.
#
# @param [Nokogiri::XML::Node] record the MARC XML record
# @param [String, Symbol] name the named prefix, like 'Binding'; a
#   trailing colon is tolerated
# @param [Boolean] strip_name whether to remove the name prefix from
#   returned comments; default is +false+
# @return [Array<String>] the matching notes; empty when +name+ is blank
def extract_named_500 record, name:, strip_name: false
  # coerce once so the guard and the prefix agree for non-String names
  label = name.to_s.strip
  return [] if label.empty?

  # format the prefix; make sure there's not an extra ':'
  prefix = "#{label.chomp ':'}:"
  xpath = %Q{datafield[@tag=500]/subfield[@code='a' and starts-with(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), '#{prefix.downcase}')]/text()}
  # escape the prefix so names with regex metacharacters match literally
  prefix_pattern = /\A#{Regexp.escape prefix}\s*/i
  record.xpath(xpath).map { |node|
    note = node.text.strip
    strip_name ? note.sub(prefix_pattern, '') : note
  }
end
1167
+ end
1168
+
1169
+ self.extend ClassMethods
1170
+ end
1171
+ end
1172
+ end