ds-convert 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +294 -0
  3. data/Rakefile +12 -0
  4. data/config/settings.yml +150 -0
  5. data/exe/ds-convert +149 -0
  6. data/exe/ds-recon +275 -0
  7. data/exe/ds-validate-csv +40 -0
  8. data/exe/marc-mrc-to-xml.rb +80 -0
  9. data/lib/ds/cli.rb +102 -0
  10. data/lib/ds/constants.rb +166 -0
  11. data/lib/ds/converter/converter.rb +124 -0
  12. data/lib/ds/converter/writer.rb +50 -0
  13. data/lib/ds/converter.rb +7 -0
  14. data/lib/ds/csv_util.rb +43 -0
  15. data/lib/ds/data/berkeley-arks.txt +4000 -0
  16. data/lib/ds/data/getty-aat-centuries.csv +71 -0
  17. data/lib/ds/data/iiif_manifests.csv +122 -0
  18. data/lib/ds/data/legacy-iiif-manifests.csv +77 -0
  19. data/lib/ds/ds_error.rb +1 -0
  20. data/lib/ds/extractor/base_record_locator.rb +24 -0
  21. data/lib/ds/extractor/base_term.rb +79 -0
  22. data/lib/ds/extractor/csv_record_locator.rb +13 -0
  23. data/lib/ds/extractor/ds_csv_extractor.rb +695 -0
  24. data/lib/ds/extractor/ds_mets_xml_extractor.rb +1114 -0
  25. data/lib/ds/extractor/genre.rb +45 -0
  26. data/lib/ds/extractor/language.rb +31 -0
  27. data/lib/ds/extractor/marc_xml_extractor.rb +1172 -0
  28. data/lib/ds/extractor/material.rb +12 -0
  29. data/lib/ds/extractor/name.rb +50 -0
  30. data/lib/ds/extractor/place.rb +11 -0
  31. data/lib/ds/extractor/subject.rb +58 -0
  32. data/lib/ds/extractor/tei_xml_extractor.rb +687 -0
  33. data/lib/ds/extractor/title.rb +52 -0
  34. data/lib/ds/extractor/xml_record_locator.rb +38 -0
  35. data/lib/ds/extractor.rb +24 -0
  36. data/lib/ds/institutions.rb +55 -0
  37. data/lib/ds/manifest/base_id_validator.rb +76 -0
  38. data/lib/ds/manifest/constants.rb +67 -0
  39. data/lib/ds/manifest/ds_csv_id_validator.rb +15 -0
  40. data/lib/ds/manifest/entry.rb +133 -0
  41. data/lib/ds/manifest/manifest.rb +74 -0
  42. data/lib/ds/manifest/manifest_validator.rb +256 -0
  43. data/lib/ds/manifest/simple_xml_id_validator.rb +42 -0
  44. data/lib/ds/manifest.rb +30 -0
  45. data/lib/ds/mapper/base_mapper.rb +221 -0
  46. data/lib/ds/mapper/ds_csv_mapper.rb +77 -0
  47. data/lib/ds/mapper/ds_mets_mapper.rb +85 -0
  48. data/lib/ds/mapper/marc_mapper.rb +87 -0
  49. data/lib/ds/mapper/tei_xml_mapper.rb +79 -0
  50. data/lib/ds/mapper.rb +13 -0
  51. data/lib/ds/recon/constants.rb +56 -0
  52. data/lib/ds/recon/ds_csv_enumerator.rb +16 -0
  53. data/lib/ds/recon/ds_mets_xml_enumerator.rb +14 -0
  54. data/lib/ds/recon/marc_xml_enumerator.rb +15 -0
  55. data/lib/ds/recon/recon_builder.rb +183 -0
  56. data/lib/ds/recon/recon_data.rb +37 -0
  57. data/lib/ds/recon/recon_manager.rb +92 -0
  58. data/lib/ds/recon/source_enumerator.rb +21 -0
  59. data/lib/ds/recon/tei_xml_enumerator.rb +14 -0
  60. data/lib/ds/recon/type/all_subjects.rb +18 -0
  61. data/lib/ds/recon/type/genres.rb +50 -0
  62. data/lib/ds/recon/type/languages.rb +38 -0
  63. data/lib/ds/recon/type/materials.rb +40 -0
  64. data/lib/ds/recon/type/named_subjects.rb +20 -0
  65. data/lib/ds/recon/type/names.rb +65 -0
  66. data/lib/ds/recon/type/places.rb +40 -0
  67. data/lib/ds/recon/type/recon_type.rb +136 -0
  68. data/lib/ds/recon/type/splits.rb +34 -0
  69. data/lib/ds/recon/type/subjects.rb +65 -0
  70. data/lib/ds/recon/type/titles.rb +38 -0
  71. data/lib/ds/recon/url_lookup.rb +52 -0
  72. data/lib/ds/recon.rb +292 -0
  73. data/lib/ds/source/base_source.rb +32 -0
  74. data/lib/ds/source/ds_csv.rb +18 -0
  75. data/lib/ds/source/ds_mets_xml.rb +20 -0
  76. data/lib/ds/source/marc_xml.rb +22 -0
  77. data/lib/ds/source/source_cache.rb +69 -0
  78. data/lib/ds/source/tei_xml.rb +22 -0
  79. data/lib/ds/source.rb +20 -0
  80. data/lib/ds/util/cache.rb +111 -0
  81. data/lib/ds/util/csv_validator.rb +209 -0
  82. data/lib/ds/util/csv_writer.rb +42 -0
  83. data/lib/ds/util/strings.rb +194 -0
  84. data/lib/ds/util.rb +37 -0
  85. data/lib/ds/version.rb +5 -0
  86. data/lib/ds.rb +237 -0
  87. metadata +246 -0
@@ -0,0 +1,1114 @@
1
+ require 'net/http'
2
+ require 'nokogiri'
3
+ require 'csv'
4
+
5
+ ##
6
+ # Module with class methods for working with DS10 METS XML.
7
+ module DS
8
+ module Extractor
9
+ module DsMetsXmlExtractor
10
+ module ClassMethods
11
+
12
# XML namespace prefixes used by the XPath queries in this module.
# Frozen so the shared constant cannot be modified by accident.
NS = {
  mods: 'http://www.loc.gov/mods/v3',
  mets: 'http://www.loc.gov/METS/'
}.freeze
16
+
17
# Report the cataloging convention for DS METS sources.
#
# @param [Nokogiri::XML::Node] xml the METS document; not inspected
# @return [String] always +'ds-mets'+
def extract_cataloging_convention(xml)
  'ds-mets'
end
20
+
21
# Return the institution name, i.e. the first value returned by
# #extract_mets_creator.
#
# @param [Nokogiri::XML::Node] xml the METS XML document
# @return [String, nil] the first creator value; +nil+ when there is none
def extract_institution_name(xml)
  creators = extract_mets_creator xml
  creators.first
end
28
+
29
# Read the organization-level creator agent name from the METS header
# and break it into its ';;'-separated values.
#
# @param [Nokogiri::XML::Node] xml the XML document containing METS data
# @return [Array<String>] the creator values; empty when no name text is found
def extract_mets_creator(xml)
  name_xpath = '/mets:mets/mets:metsHdr/mets:agent[@ROLE="CREATOR" and @TYPE="ORGANIZATION"]/mets:name'
  xml.xpath(name_xpath, NS).text.split(%r{;;})
end
37
+
38
+ ##
39
+ # Extract and format all the physical description values for the
40
+ # manuscript and each part.
41
+ #
42
+ # # MS Note Phys desc
43
+ #
44
+ # - presentation -> Binding
45
+ #
46
+ # # MS Part phys description
47
+ #
48
+ # - support -- accounted for as support
49
+ #
50
+ # - marks - 'Watermarks'
51
+ # - medium -> 'Music'
52
+ # - physical description -> 'Other decoration'
53
+ # - physical details -> 'Figurative details'
54
+ # - script -> 'Script'
55
+ # - technique -> 'Layout'
56
+ #
57
+ # @param [Nokogiri::XML::Node] xml the document's xml
58
+ # @return [Array] the physical description values
59
def extract_physical_description(xml)
  # Manuscript-level notes come first, followed by the part-level notes;
  # the combined list is flattened before being handed to clean_notes.
  combined = [extract_ms_phys_desc(xml), extract_part_phys_desc(xml)].flatten
  clean_notes combined
end
67
+
68
# Collect the text of the +mods:physicalDescription/mods:note+ elements of
# +node+ that have the requested note type.
#
# @param [Nokogiri::XML::Node] node the XML node to search
# @param [String, Symbol] note_type the +@type+ value to match; pass
#   +:none+ to select notes that have no +@type+ attribute
# @param [String] tag optional label; when given, each value is returned
#   as "<tag>: <note text>"
# @return [Array<String>] the note values, in document order
def physdesc_note(node, note_type, tag: nil)
  predicate = note_type == :none ? 'not(@type)' : "@type = '#{note_type}'"
  xpath = "mods:mods/mods:physicalDescription/mods:note[#{predicate}]"

  node.xpath(xpath).map do |note|
    if tag.nil?
      note.text
    else
      "#{tag}: #{note.text}"
    end
  end
end
85
+
86
+
87
+
88
# Collect the manuscript-level 'presentation' physical description notes,
# each labelled 'Binding'.
#
# @param [Nokogiri::XML::Node] xml the METS XML document
# @return [Array<String>] the notes, each formatted "Binding: <text>"
def extract_ms_phys_desc(xml)
  physdesc_note find_ms(xml), 'presentation', tag: 'Binding'
end
92
+
93
# Build the notes derived from a part's 'physical description' note text.
#
# A text containing ';;' is split into two values: the first becomes an
# 'Other decoration' note and the second a 'Number of scribes' note; blank
# values are skipped. A text without ';;' becomes a single
# 'Other decoration' note unless it is empty. Every note is labelled with
# the part's extent.
#
# @param [Nokogiri::XML::Node] part the XML node representing the part
# @return [Array<String>] the formatted notes
def extract_pd_note(part)
  extent = extract_extent part
  xpath = %q{mods:mods/mods:physicalDescription/mods:note[@type = 'physical description']/text()}

  part.xpath(xpath).flat_map do |node|
    text = node.text
    unless text =~ %r{;;}
      next text.empty? ? [] : ["Other decoration, #{extent}: #{text}"]
    end

    other_deco, num_scribes = text.split %r{;;+}
    [
      ("Other decoration, #{extent}: #{other_deco}" unless other_deco.blank?),
      ("Number of scribes, #{extent}: #{num_scribes}" unless num_scribes.blank?)
    ].compact
  end
end
114
+
115
# Gather the physical description notes of every part, each labelled with
# its category and the part's extent. Per part the order is: figurative
# details, the 'physical description' notes (see #extract_pd_note), script,
# music, layout, watermarks.
#
# @param [Nokogiri::XML::Node] xml the XML node to extract parts from
# @return [Array<String>] the formatted notes for all parts
def extract_part_phys_desc(xml)
  # [note @type, label] pairs emitted after the 'physical description' notes
  labelled_types = [
    ['script', 'Script'],
    ['medium', 'Music'],
    ['technique', 'Layout'],
    ['marks', 'Watermarks']
  ]

  find_parts(xml).flat_map do |part|
    extent = extract_extent part
    figurative = physdesc_note part, 'physical details', tag: "Figurative details, #{extent}"
    described = extract_pd_note part
    labelled = labelled_types.flat_map do |note_type, label|
      physdesc_note part, note_type, tag: "#{label}, #{extent}"
    end
    figurative + described + labelled
  end
end
139
+
140
+ ##
141
+ # DS 1.0 METS note types:
142
+ #
143
+ # # MS Note types:
144
+ #
145
+ # Accounted for
146
+ # - ownership -- accounted for, former owner
147
+ # - action -- skip; administrative note: "Inputter ...."
148
+ # - admin -- acknowledgments
149
+ # - untyped -- 'Manuscript Note'
150
+ # - bibliography -- 'Bibliography'
151
+ # - source note -- skip; not present on DS legacy pages
152
+ #
153
+ #
154
+ # # MS Note Phys desc
155
+ #
156
+ # - presentation -> Binding
157
+ #
158
+ # # Part note types:
159
+ #
160
+ # - date - already accounted for
161
+ # - content - skip
162
+ # - admin - Acknowledgments
163
+ #
164
+ # - untyped
165
+ #
166
+ # # MS Part phys description
167
+ #
168
+ # - support -- accounted for as support
169
+ #
170
+ # - marks - 'Watermarks'
171
+ # - medium -> 'Music'
172
+ # - physical description -> 'Other decoration'
173
+ # - physical details -> 'Figurative details'
174
+ # - script -> 'Script'
175
+ # - technique -> 'Layout'
176
+ #
177
+ # # Text note types
178
+ #
179
+ # Accounted for
180
+ # - admin - acknowledgments
181
+ #
182
+ # - condition -> 'Status of text'
183
+ # - content -> handled as Text Incipit
184
+ # - untyped -> 'Text note'
185
+ #
186
+ # # Page note types
187
+ #
188
+ # Accounted for
189
+ # None
190
+ #
191
+ # - content -> Folio Incipit
192
+ # - date -- skip
193
+ # - untyped -> 'Folio note'
194
+ #
195
# Collect the text nodes of the +mods:mods/mods:note+ elements of +node+
# that have the requested note type; pass +:none+ as +note_type+ to select
# notes without a +@type+ attribute. When +tag+ is given each value is
# returned as "<tag>: <text>".
def note_by_type(node, note_type, tag: nil)
  predicate = note_type == :none ? 'not(@type)' : "@type = '#{note_type}'"
  xpath = "mods:mods/mods:note[#{predicate}]/text()"

  node.xpath(xpath).map do |text_node|
    if tag.nil?
      text_node.text
    else
      "#{tag}: #{text_node.text}"
    end
  end
end
206
+
207
# Build a display string from the +mods:extent+ values of +node+. Only the
# portion of each extent before the first ';;' is used; the values are
# joined with ', '.
#
# @param [Nokogiri::XML::Node] node the XML node to extract extent from
# @return [String] the joined extent values; empty when there are none
def extract_extent(node)
  extents = node.xpath('mods:mods/mods:physicalDescription/mods:extent')
  leading_values = extents.map { |extent| extent.text.split(%r{;;}).first }
  leading_values.join(', ')
end
217
+
218
# Return the as-recorded material values of the record as one
# pipe-separated string.
#
# @param [Nokogiri::XML::Node] record the record to extract material from
# @return [String] the as-recorded values joined with '|'
def extract_material_as_recorded(record)
  recorded = extract_materials(record).map { |material| material.as_recorded }
  recorded.join '|'
end
225
+
226
# Build Material terms from the 'support' physical description notes of
# every part. Each note is downcased, has one trailing '.' removed, and
# is then stripped of surrounding whitespace; duplicates are dropped.
#
# @param [Nokogiri::XML::Node] record the record to extract materials from
# @return [Array<DS::Extractor::Material>] one Material per distinct value
def extract_materials(record)
  supports = find_parts(record).flat_map { |part| physdesc_note part, 'support' }
  distinct = supports.map { |support| support.downcase.chomp('.').strip }.uniq
  distinct.map { |as_recorded| DS::Extractor::Material.new as_recorded: as_recorded }
end
239
+
240
# Return the as-recorded values of the record's former owners.
#
# @param [Nokogiri::XML::Node] xml the parsed XML to extract former owners from
# @param [Boolean] lookup_split accepted for interface compatibility; this
#   method does not read it
# @return [Array<String>] the former owners as recorded
def extract_former_owners_as_recorded(xml, lookup_split: true)
  extract_former_owners(xml).map { |owner| owner.as_recorded }
end
248
+
249
# Build former-owner Name terms from the 'ownership' notes found anywhere
# below +record+.
#
# Each cleaned note is looked up with Recon::Type::Splits._lookup_single
# (from column 'authorized_label'); when that lookup yields a value, its
# '|'-separated parts are used in place of the note. Every resulting value
# is passed through DS.mark_long and given the role 'former owner'.
#
# @param [Nokogiri::XML::Node] record the XML node representing the record
# @return [Array<DS::Extractor::Name>] the former owners
def extract_former_owners(record)
  ownership_xpath = "./descendant::mods:note[@type='ownership']/text()"
  ownership_notes = clean_notes(record.xpath(ownership_xpath).flat_map(&:text))

  owners = ownership_notes.flat_map do |note|
    splits = Recon::Type::Splits._lookup_single(note, from_column: 'authorized_label')
    splits.present? ? splits.split('|') : note
  end

  owners.map do |owner|
    DS::Extractor::Name.new as_recorded: DS.mark_long(owner), role: 'former owner'
  end
end
264
+
265
# Find the names whose role is 'author' or '[author]'.
#
# @param [Nokogiri::XML::Node] record the record to extract authors from
# @return [Array<DS::Extractor::Name>] the authors
def extract_authors(record)
  author_roles = ['author', '[author]']
  DS::Extractor::DsMetsXmlExtractor.extract_name(record, *author_roles)
end
272
+
273
# Return the as-recorded values of the record's authors.
#
# @param [Nokogiri::XML::Node] record the record to extract authors from
# @return [Array<String>] the authors as recorded
def extract_authors_as_recorded(record)
  extract_authors(record).map { |author| author.as_recorded }
end
280
+
281
# Return the as-recorded values of the record's artists.
#
# @param [Nokogiri::XML::Node] record the record to extract artists from
# @return [Array<String>] the artists as recorded
def extract_artists_as_recorded(record)
  extract_artists(record).map { |artist| artist.as_recorded }
end
287
+
288
# Find the names whose role is 'artist', '[artist]' or 'illuminator'.
#
# @param [Nokogiri::XML::Node] record the record to extract artists from
# @return [Array<DS::Extractor::Name>] the artists
def extract_artists(record)
  artist_roles = ['artist', '[artist]', 'illuminator']
  DS::Extractor::DsMetsXmlExtractor.extract_name(record, *artist_roles)
end
295
+
296
# Return the as-recorded values of the record's scribes.
#
# @param [Nokogiri::XML::Node] record the record to extract scribes from
# @return [Array<String>] the scribes as recorded
def extract_scribes_as_recorded(record)
  extract_scribes(record).map { |scribe| scribe.as_recorded }
end
303
+
304
# Find the names whose role is 'scribe' or '[scribe]'.
#
# @param record [Nokogiri::XML::Node] the record to extract scribes from
# @return [Array<DS::Extractor::Name>] the scribes
def extract_scribes(record)
  scribe_roles = ['scribe', '[scribe]']
  DS::Extractor::DsMetsXmlExtractor.extract_name(record, *scribe_roles)
end
311
+
312
# Return the as-recorded values of the record's associated agents (names
# with the role 'other'; see #extract_associated_agents).
#
# @param record [Nokogiri::XML::Node] the record to extract other names from
# @return [Array<String>] the other names as recorded
def extract_other_names_as_recorded(record)
  extract_associated_agents(record).map { |agent| agent.as_recorded }
end
319
+
320
# Find the names whose role is 'other'.
#
# @param record [Nokogiri::XML::Node] the record to extract other names from
# @return [Array<DS::Extractor::Name>] the associated agents
def extract_associated_agents(record)
  DS::Extractor::DsMetsXmlExtractor.extract_name(record, 'other')
end
327
+
328
##
# Return the as-recorded values of the languages found by
# #extract_languages: the unique values of the text-level <mods:note>s
# that start with "lang:" (case-insensitive); so, "Latin", rather than
# "Latin", "Latin", "Latin", etc.
#
# @param record [Nokogiri::XML::Node] the record to extract languages from
# @return [Array<String>] the languages as recorded
def extract_languages_as_recorded(record)
  extract_languages(record).map { |language| language.as_recorded }
end
337
+
338
# Build Language terms from the notes of each text that begin with
# "lang:" in any letter case. The "lang:" prefix and the whitespace after
# it are removed; duplicate values are dropped.
#
# @param record [Nokogiri::XML::Node] the record to extract languages from
# @return [Array<DS::Extractor::Language>] the extracted languages
def extract_languages(record)
  # The prefix can be Lang:, lang:, etc.; XPath 1.0 has no lower-case
  # function, so translate() is used to down-case the note text.
  lang_note_xpath = './descendant::mods:note[starts-with(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "lang:")]'

  recorded = find_texts(record).flat_map do |text|
    text.xpath(lang_note_xpath).map { |note| note.text.sub(%r{^lang:\s*}i, '') }
  end

  recorded.uniq.map { |as_recorded| DS::Extractor::Language.new as_recorded: as_recorded }
end
352
+
353
# Build Name terms from the +mods:name+ elements below +node+ whose role
# term, down-cased, equals one of +roles+. Each name's +mods:namePart+
# text is split on ';' (with surrounding whitespace); duplicate values are
# dropped. Every Name is given the first of +roles+ as its role.
#
# @param node [Nokogiri::XML::Node] the node to extract names from
# @param roles [Array<String>] the lower-case role terms to match
# @return [Array<DS::Extractor::Name>] the extracted names
def extract_name(node, *roles)
  # Role terms vary in case (Author, author, ...); XPath 1.0 has no
  # lower-case function, so translate() is used instead.
  lowercase_role = "translate(./mods:role/mods:roleTerm/text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')"
  role_tests = roles.map { |role| "#{lowercase_role} = '#{role}'" }
  name_xpath = "./descendant::mods:name[#{role_tests.join(' or ')}]"

  name_parts = node.xpath(name_xpath).flat_map do |name|
    name.xpath('mods:namePart').text.split(%r{\s*;\s*})
  end

  name_parts.uniq.map do |as_recorded|
    DS::Extractor::Name.new as_recorded: as_recorded, role: roles.first
  end
end
370
+
371
# Return the as-recorded values of the record's titles.
#
# @param record [Nokogiri::XML::Node] the record to extract titles from
# @return [Array<String>] the titles as recorded
def extract_titles_as_recorded(record)
  extract_titles(record).map { |title| title.as_recorded }
end
378
+
379
# Build Title terms from the +mods:title+ values of each text, skipping
# the placeholder value '[Title not supplied]'.
#
# @param record [Nokogiri::XML::Node] the record to extract titles from
# @return [Array<DS::Extractor::Title>] the extracted titles
def extract_titles(record)
  title_xpath = 'mods:mods/mods:titleInfo/mods:title'
  recorded = find_texts(record).flat_map { |text| text.xpath(title_xpath).map(&:text) }

  recorded.each_with_object([]) do |as_recorded, titles|
    next if as_recorded == '[Title not supplied]'

    titles << DS::Extractor::Title.new(as_recorded: as_recorded)
  end
end
393
+
394
# Return the as-recorded values of the places returned by extract_places.
#
# @param xml [Nokogiri::XML::Node] the XML to extract production places from
# @return [Array<String>] the production places as recorded
def extract_production_places_as_recorded(xml)
  extract_places(xml).map { |place| place.as_recorded }
end
401
+
402
+ ##
403
+ # Extract the places of production for reconciliation CSV output.
404
+ #
405
+ # Returns a two-dimensional array, each row is a place; and each row has
406
+ # one column: place name; for example:
407
+ #
408
+ # [["Austria"],
409
+ # ["Germany"],
410
+ # ["France (?)"]]
411
+ #
412
+ # @param [Nokogiri::XML:Node] xml a +<METS_XML>+ node
413
+ # @return [Array<Array>] an array of arrays of values
414
def extract_recon_places(xml)
  # one row (the place's #to_a) per extracted place
  extract_places(xml).map { |place| place.to_a }
end
417
+
418
# Return the titles of the record for reconciliation output.
#
# NOTE(review): unlike the other extract_recon_* methods, which map each
# term with #to_a, this returns the result of #extract_titles converted
# with Array#to_a, i.e. the Title terms themselves. Confirm with the
# callers whether rows (arrays) are expected here.
#
# @param xml [Nokogiri::XML::Node] a +<METS_XML>+ node
# @return [Array<DS::Extractor::Title>] the extracted titles
def extract_recon_titles(xml)
  titles = extract_titles xml
  titles.to_a
end
425
+
426
# Collect one reconciliation row (the term's #to_a) for every name in the
# record, in this order: authors, artists, scribes, former owners,
# associated agents.
#
# @param xml [Nokogiri::XML::Node] a +<METS_XML>+ node
# @return [Array<Array>] an array of arrays of names for reconciliation
def extract_recon_names(xml)
  name_groups = [
    extract_authors(xml),
    extract_artists(xml),
    extract_scribes(xml),
    extract_former_owners(xml),
    extract_associated_agents(xml)
  ]
  name_groups.flat_map { |names| names.map { |name| name.to_a } }
end
438
+
439
##
# Return the as-recorded former owner values that are 400 or more
# characters long, each stripped of surrounding whitespace and wrapped in
# a one-element array so it can be treated as a row by Recon::Type::Splits.
#
# @param [Nokogiri::XML::Node] xml a +<METS_XML>+ node
# @return [Array<Array<String>>] one single-column row per long value
def extract_recon_splits(xml)
  owners = DS::Extractor::DsMetsXmlExtractor.extract_former_owners_as_recorded xml, lookup_split: false
  owners.flatten.each_with_object([]) do |owner, rows|
    rows << [owner.strip] if owner.to_s.size >= 400
  end
end
449
+
450
##
# For the legacy DS METS, the value of
# +mods:identifier[@type="local"]+ is the shelf mark. If there are other
# ID types, we can't distinguish them from shelfmarks.
#
# @param [Nokogiri::XML:Node] xml a +<METS_XML>+ node
# @return [String] the shelfmark; empty when no local identifier is found
def extract_shelfmark(xml)
  local_id_xpath = 'mods:mods/mods:identifier[@type="local"]/text()'
  find_ms(xml).xpath(local_id_xpath).text
end
461
+
462
# Return the genres for the record; this extractor produces none.
#
# @param [Nokogiri::XML::Node] xml the METS document; not inspected
# @return [Array] always an empty array
def extract_genres(xml)
  []
end
465
+
466
##
# See the note for [Recon::Type::Subjects]: Each source subject extraction
# method should return a two-dimensional array:
#
#     [["Islamic law--Early works to 1800", ""],
#     ["Malikites--Early works to 1800", ""],
#     ["Islamic law", ""],
#     ["Malikites", ""],
#     ["Arabic language--Grammar--Early works to 1800", ""],
#     ["Arabic language--Grammar", ""],
#     ...
#     ]
#
# The second value is for those cases where the source provides an
# authority URI. The METS records don't give a URI so this method always
# returns the empty string for the second value.
#
# Each row is the #to_a of a term returned by extract_subjects.
#
# @param [Nokogiri::XML:Node] xml a +<METS_XML>+ node
# @return [Array<Array>] a two-dimensional array of subject and URI
def extract_recon_subjects(xml)
  extract_subjects(xml).map { |subject| subject.to_a }
end
488
+
489
##
# Return the as-recorded values of the subjects returned by
# extract_subjects. Per the existing notes for this method, the subjects
# are the `mods:originInfo/mods:edition` values for each text. For example,
#
#     <mods:originInfo>
#       <mods:edition>Alexander, de Villa Dei.</mods:edition>
#       <mods:edition>Latin language--Grammar.</mods:edition>
#       <mods:edition>Latin poetry, Medieval and modern.</mods:edition>
#       <mods:edition>Manuscripts, Medieval--Connecticut--New Haven.</mods:edition>
#     </mods:originInfo>
#
# @param [Nokogiri::XML:Node] xml a +<METS_XML>+ node
# @return [Array<String>] an array of subjects
def extract_subjects_as_recorded(xml)
  extract_subjects(xml).map { |subject| subject.as_recorded }
end
505
+
506
# Return all subjects as recorded; for this source that is exactly the
# result of #extract_subjects_as_recorded.
#
# @param xml [Nokogiri::XML::Node] the XML to extract subjects from
# @return [Array<String>] the extracted subjects as recorded
def extract_all_subjects_as_recorded(xml)
  extract_subjects_as_recorded(xml)
end
513
+
514
# Return the manuscript-level related-item URLs
# (+mods:relatedItem/mods:location/mods:url+) as one pipe-separated string.
#
# @param xml [Nokogiri::XML::Node] the XML to extract the link from
# @return [String] the URL values joined with '|'; empty when there are none
def extract_link_to_inst_record(xml)
  url_xpath = "mods:mods/mods:relatedItem/mods:location/mods:url"
  urls = find_ms(xml).xpath(url_xpath).map { |url| url.text }
  urls.join '|'
end
524
+
525
# Report whether any part is flagged as dated: true when, for at least
# one part, the combined text of its +mods:note[@type="date"]+ elements
# equals 'Y' ignoring letter case.
#
# @param [Nokogiri::XML:Node] xml the XML document to check
# @return [Boolean] true if a part's date note is 'Y', false otherwise
def dated_by_scribe?(xml)
  date_note_xpath = 'mods:mods/mods:note[@type="date"]'
  find_parts(xml).any? do |part|
    flag = part.xpath(date_note_xpath).text
    flag.upcase == 'Y'
  end
end
537
+
538
##
# Return, for each part, a single string with all the date values for
# that part: the concatenation of the values returned by
# #extract_date_created, #extract_assigned_date and
# #extract_date_range_for_part (start and end joined with '-').
# Duplicate and empty values are dropped and the rest are joined with
# '; '. Parts that yield no date are omitted.
#
# @param [Nokogiri::XML:Node] xml the parsed METS xml document
# @return [Array<String>] the concatenated date values, one per part
def extract_production_date_as_recorded(xml)
  part_dates = find_parts(xml).map do |part|
    values = [
      extract_date_created(part),
      extract_assigned_date(part),
      extract_date_range_for_part(part).join('-')
    ]
    values.uniq.reject(&:empty?).join('; ')
  end

  part_dates.reject { |date| date.to_s.strip.empty? }
end
553
+
554
##
# Extract, for each part, the range from the `mods:dateCreated` elements
# where a @point is defined, thus:
#
#     <mods:dateCreated point="start" encoding="iso8601">1300</mods:dateCreated>
#     <mods:dateCreated point="end" encoding="iso8601">1399</mods:dateCreated>
#
# The start and end values of a part are joined with +range_sep+.
#
# @param [Nokogiri::XML:Node] xml the parsed METS xml document
# @param [String] range_sep the separator placed between start and end
# @return [Array<String>] one joined range string per part
def extract_date_range(xml, range_sep:)
  find_parts(xml).map do |part|
    bounds = extract_date_range_for_part part
    bounds.join range_sep
  end
end
568
+
569
# XPaths, relative to a part node, for the start and end of a
# +mods:dateCreated+ range. Frozen so the shared strings cannot be
# modified by accident.
DATE_START_XPATH = 'mods:mods/mods:originInfo/mods:dateCreated[@point="start"]'.freeze
DATE_END_XPATH = 'mods:mods/mods:originInfo/mods:dateCreated[@point="end"]'.freeze
571
+
572
##
# Extract the range from the `mods:dateCreated` elements of +part+ whose
# @point is 'start' and 'end'. A bound whose text is empty is left out;
# the remaining values are converted with String#to_i.
#
# @param [Nokogiri::XML:Node] part a part-level node
# @return [Array<Integer>] the start and end dates that are present
def extract_date_range_for_part(part)
  [DATE_START_XPATH, DATE_END_XPATH].each_with_object([]) do |xpath, bounds|
    value = part.xpath(xpath).text
    bounds << value.to_i unless value.empty?
  end
end
583
+
584
##
# Return any date not found in the `otherDate` or in a dateCreated date
# range (see #extract_date_range); thus:
#
#     <mods:dateCreated>1537</mods:dateCreated>
#     <mods:dateCreated>1531</mods:dateCreated>
#     <mods:dateCreated>14??, October 21</mods:dateCreated>
#     <mods:dateCreated>1462, July 23</mods:dateCreated>
#     <mods:dateCreated>1549, November</mods:dateCreated>
#
# These values commonly give the date for "dated" manuscripts
#
# @param [Nokogiri::XML:Node] part a part-level node
# @return [String] the content of every dateCreated without '@point'
#     defined, joined with ', '
def extract_date_created(part)
  unpointed_xpath = 'mods:mods/mods:originInfo/mods:dateCreated[not(@point)]'
  dates = part.xpath(unpointed_xpath).map { |date| date.text }
  dates.join ', '
end
603
+
604
+ ##
605
+ # Return dates found in the `otherDate` element, reformatting them as
606
+ # needed. These examples are taken from several METS files.
607
+ #
608
+ # <mods:dateOther>[ca. 1410]</mods:dateOther>
609
+ # <mods:dateOther>[between 1100 and 1200]</mods:dateOther>
610
+ # <mods:dateOther>[between 1450 and 1460]</mods:dateOther>
611
+ # <mods:dateOther>[between 1450 and 1500]</mods:dateOther>
612
+ # <mods:dateOther>s. XV#^3/4#</mods:dateOther>
613
+ # <mods:dateOther>s. XV</mods:dateOther>
614
+ # <mods:dateOther>s. XVI#^4/4#</mods:dateOther>
615
+ # <mods:dateOther>s. XVIII#^2/4#</mods:dateOther>
616
+ # <mods:dateOther>s. XV#^in#</mods:dateOther>
617
+ #
618
+ # Most dateOther values have the format:
619
+ #
620
+ # s. XVII#^2#
621
+ #
622
+ # The notation #^<VAL># encodes a portion of the string that was presented
623
+ # as superscript on the Berkeley DS site. DS 2.0 doesn't use the
624
+ # superscripts; thus, when it occurs, this portion of the string is
625
+ # reformatted `(<VAL>)`:
626
+ #
627
+ # s. XVII#^2# => s. XVII(2)
628
+ # s. XV#^ex# => s. XV(ex)
629
+ # s. XVI#^in# => s. XVI(in)
630
+ # s. X#^med# => s. X(med)
631
+ # s. XII#^med# => s. XII(med)
632
+ #
633
+ # @param [Nokogiri::XML:Node] part a part-level node
634
+ # @return [String] the date string reformatted as described above
635
def extract_assigned_date(part)
  # '#^VAL#' (superscript notation) is rewritten as '(VAL)'
  superscript = %r{#\^?([\w/]+)(\^|#)}
  other_dates = part.xpath('mods:mods/mods:originInfo/mods:dateOther').text
  other_dates.gsub(superscript, '(\1)')
end
639
+
640
+
641
# Collect the 'admin' notes of the manuscript, its parts, texts and pages
# (in that order). Manuscript-level notes are untagged; the notes of
# parts, texts and pages are tagged with the extent of the node they come
# from. The combined list is passed through clean_notes.
#
# @param [Nokogiri::XML::Node] xml the XML document to extract acknowledgments from
# @return [Array<String>] the extracted acknowledgments
def extract_acknowledgments(xml)
  # admin notes of each node, labelled with that node's extent
  tagged_admin_notes = lambda do |nodes|
    nodes.flat_map { |node| note_by_type node, 'admin', tag: extract_extent(node) }
  end

  notes = find_ms(xml).flat_map { |ms| note_by_type ms, 'admin' }
  notes += tagged_admin_notes.call(find_parts(xml))
  notes += tagged_admin_notes.call(find_texts(xml))
  notes += tagged_admin_notes.call(find_pages(xml))

  clean_notes notes
end
666
+
667
+ ##
668
+ # Extract the filename for page. This will be either:
669
+ #
670
+ # * the values for +mods:identifier+ with +@type='filename'+; or
671
+ #
672
+ # * the filenames pointed to by the linked +mets:fptr+ in the
673
+ # +mets:fileGrp+ with +@USE='image/master'+
674
+ #
675
+ # * an array containing +['NO_FILE']+, if no files are associated with
676
+ # the page
677
+ #
678
+ # There will almost always be one file, but at least one manuscript has
679
+ # page with two associated images. Thus, we return an array.
680
+ #
681
+ # @param [Nokogiri::XML::Node] page the +mets:dmdSec+ node for the page
682
+ # @return [Array<String>] array of all the filenames for +page+
683
def extract_filenames(page)
  filename_xpath = 'mods:mods/mods:identifier[@type="filename"]'
  filenames = page.xpath(filename_xpath).map { |identifier| identifier.text }

  # without a filename identifier, fall back to the files linked to the page
  filenames.empty? ? extract_master_mets_file(page) : filenames
end
692
+
693
# Return the page's +mods:extent+ values as one pipe-separated string.
#
# @param [Nokogiri::XML::Node] page the XML node representing the page
# @return [String] the extent values joined with '|'
def extract_folio_num(page)
  extent_xpath = 'mods:mods/mods:physicalDescription/mods:extent'
  extents = page.xpath(extent_xpath).map { |extent| extent.text }
  extents.join '|'
end
702
+
703
##
# Resolve the master image filename(s) for a page.
#
# In some METS files each page div lists several mets:fptr elements and
# nothing says which one points at the master image, so every @FILEID for
# the page is collected:
#
#     <mets:structMap>
#       <mets:div TYPE="text" LABEL="[No Title for Display]" ADMID="RMD1" DMDID="DM1">
#         <mets:div TYPE="item" LABEL="[No Title for Display]" DMDID="DM2">
#           <mets:div TYPE="item" LABEL="[No Title for Display]" DMDID="DM3">
#             <mets:div TYPE="item" LABEL="Music extending into right margin, upper right column." DMDID="DM4">
#               <mets:fptr FILEID="FID1"/>
#               <mets:fptr FILEID="FID3"/>
#               <mets:fptr FILEID="FID5"/>
#               <mets:fptr FILEID="FID7"/>
#               <mets:fptr FILEID="FID9"/>
#             </mets:div>
#             <!-- snip -->
#           </mets:div>
#         </mets:div>
#       </mets:div>
#     </mets:structMap>
#
# Those FILEIDs are then matched against the mets:file elements of the
# mets:fileGrp with @USE='image/master':
#
#     <mets:fileGrp USE="image/master">
#       <mets:file ID="FID1" MIMETYPE="image/tiff" SEQ="1" CREATED="2010-11-08T10:26:20.3" ADMID="ADM1 ADM4" GROUPID="GID1">
#         <mets:FLocat xlink:href="http://nma.berkeley.edu/ark:/28722/bk0008v1k7q" LOCTYPE="URL"/>
#       </mets:file>
#       <mets:file ID="FID2" MIMETYPE="image/tiff" SEQ="2" CREATED="2010-11-08T10:26:20.393" ADMID="ADM1 ADM5" GROUPID="GID2">
#         <mets:FLocat xlink:href="http://nma.berkeley.edu/ark:/28722/bk0008v1k88" LOCTYPE="URL"/>
#       </mets:file>
#     </mets:fileGrp>
#
# Each matching +xlink:href+ (a Berkeley ARK address) is handed to
# {#locate_filename} to obtain the TIFF filename.
#
# @param [Nokogiri::XML::Node] page the page node; its +ID+ attribute is
#   matched against the structMap +@DMDID+ values (described upstream as
#   the page's +mets:dmdSec+ node)
# @return [Array<String>] the filenames for +page+ (an entry is +nil+ when
#   {#locate_filename} cannot resolve an address), or +['NO_FILE']+ when
#   the page has no mets:fptr or no master mets:file
def extract_master_mets_file(page)
  # every mets:fptr @FILEID listed for this page in the structMap
  fptr_xpath = "//mets:structMap/descendant::mets:div[@DMDID='#{page['ID']}']/mets:fptr/@FILEID"
  file_ids = page.xpath(fptr_xpath).map { |attr| attr.text }

  # OR the IDs together because we can't tell which one is the master:
  #   "@ID='FID1' or @ID='FID3' or @ID='FID5' ..."
  id_predicate = file_ids.map { |file_id| "@ID='#{file_id}'" }.join(' or ')
  return ['NO_FILE'] if id_predicate.strip.empty? # no mets:fptr for this page

  # the @xlink:href is the ARK address; e.g., http://nma.berkeley.edu/ark:/28722/bk0008v1k88
  href_xpath = "//mets:fileGrp[@USE='image/master']/mets:file[#{id_predicate}]/mets:FLocat/@xlink:href"
  arks = page.xpath(href_xpath).map { |attr| attr.text }
  return ['NO_FILE'] if arks.empty? # defensive; may never happen

  arks.map { |ark| locate_filename(ark) }
end
761
+
762
# Collects the manuscript-level notes.
#
# Looks up the manuscript node with {#find_ms} and gathers, in order, its
# untyped notes (tagged 'Manuscript note') and its 'bibliography' notes
# (tagged 'Bibliography') via +note_by_type+.
#
# @param [Nokogiri::XML::Node] xml the XML node to extract manuscript note from
# @return [Array<String>] an array of manuscript notes
def extract_ms_note(xml)
  manuscript = find_ms(xml)
  general = note_by_type(manuscript, :none, tag: 'Manuscript note')
  bibliography = note_by_type(manuscript, 'bibliography', tag: 'Bibliography')
  [] + general + bibliography
end
773
+
774
# Collects the untyped notes of every manuscript part, tagging each note
# with the extent of the part it belongs to.
#
# @param [Nokogiri::XML::Node] xml the XML node to extract notes from
# @return [Array<String>] an array of extracted notes
def extract_part_note(xml)
  find_parts(xml).flat_map do |part|
    note_by_type(part, :none, tag: extract_extent(part))
  end
end
784
+
785
# Formats the text of each +mods:mods/mods:abstract+ under +node+ as
# "<tag>: <text>".
#
# @param [Nokogiri::XML::Node] node the XML node to extract information from
# @param [String] tag the tag to prepend to each extracted value
# @return [Array<String>] one formatted string per abstract text node
def extract_explicit(node, tag:)
  abstracts = node.xpath('mods:mods/mods:abstract/text()')
  abstracts.map { |abstract| "#{tag}: #{abstract.text}" }
end
795
+
796
# Collects the notes attached to every text in the document.
#
# For each text the result holds, in order: its untyped notes (tagged
# with the extent), its 'condition' notes ("Status of text, <extent>"),
# its 'content' notes ("Incipit, <extent>") and the values returned by
# {#extract_explicit} ("Explicit, <extent>").
#
# @param [Nokogiri::XML::Node] xml the XML document to extract text notes from
# @return [Array<String>] the extracted text notes
def extract_text_note(xml)
  find_texts(xml).flat_map do |text|
    extent = extract_extent(text)
    [
      note_by_type(text, :none, tag: extent),
      note_by_type(text, 'condition', tag: "Status of text, #{extent}"),
      note_by_type(text, 'content', tag: "Incipit, #{extent}"),
      extract_explicit(text, tag: "Explicit, #{extent}")
    ].reduce([]) { |notes, found| notes + found }
  end
end
811
+
812
# Collects the notes attached to every page in the document.
#
# For each page the result holds, in order: its untyped notes (tagged
# with the extent), its 'content' notes ("Incipit, <extent>") and the
# values returned by {#extract_explicit} ("Explicit, <extent>").
#
# @param [Nokogiri::XML::Node] xml the XML node to extract notes from
# @return [Array<String>] an array of extracted notes
def extract_page_note(xml)
  find_pages(xml).flat_map do |page|
    extent = extract_extent(page)
    [
      note_by_type(page, :none, tag: extent),
      note_by_type(page, 'content', tag: "Incipit, #{extent}"),
      extract_explicit(page, tag: "Explicit, #{extent}")
    ].reduce([]) { |notes, found| notes + found }
  end
end
826
+
827
##
# Extract the notes at all levels of the +xml+ and return them as an
# array of strings.
#
# Notes are gathered in a fixed order -- manuscript, parts, texts,
# dockets, pages -- and the combined list is passed through
# {#clean_notes} before being returned.
#
# @param [Nokogiri::XML::Node] xml the document's xml
# @return [Array<String>] the note values
def extract_notes(xml)
  notes = []
  notes += extract_ms_note(xml)
  notes += extract_part_note(xml)
  notes += extract_text_note(xml)
  notes += extract_docket(xml)
  notes += extract_page_note(xml)

  clean_notes(notes)
end
845
+
846
##
# Return formatted incipits and explicits for every text in the document.
#
# For each node returned by {#find_texts}:
#
# * the text of each descendant <tt>mods:abstract[not(@displayLabel)]</tt>
#   is reported as <tt>"Incipit, <extent>: <value>"</tt>;
# * the text of each descendant <tt>mods:note[@type="content"]</tt> is
#   reported as <tt>"Explicit, <extent>: <value>"</tt>;
#
# where +extent+ is the text of the node's descendant
# +mods:physicalDescription/mods:extent+.
#
# Earlier documentation restricted this to +mods:mods+ elements having a
# <tt>mods:titleInfo[@type="alternative"]</tt>; the implementation does
# not apply that filter and processes every text.
#
# NOTE(review): {#extract_text_note} labels +mods:note[@type="content"]+
# as the Incipit and +mods:abstract+ as the Explicit, the reverse of the
# mapping used here -- confirm which one is intended.
#
# @param [Nokogiri::XML::Node] xml the document's xml
# @return [Array<String>] incipits followed by explicits, text by text
def extract_incipit_explicit(xml)
  find_texts(xml).flat_map { |node|
    extent = node.xpath('./descendant::mods:physicalDescription/mods:extent/text()', NS).text
    incipits = node.xpath('./descendant::mods:abstract[not(@displayLabel)]/text()').map { |inc|
      "Incipit, #{extent}: #{inc}"
    }
    explicits = node.xpath('./descendant::mods:note[@type="content"]/text()').map { |exp|
      "Explicit, #{extent}: #{exp}"
    }
    incipits + explicits
  }
end
882
+
883
##
# DS METS can have +mods:abstract+ elements with +@displayLabel="docket"+.
# Return the text of each one, prefixed with "Docket: ".
#
# @param [Nokogiri::XML::Node] xml the document xml
# @return [Array<String>] the note values
def extract_docket(xml)
  dockets = xml.xpath("//mods:abstract[@displayLabel = 'docket']/text()", NS)
  dockets.map { |docket| "Docket: #{docket.text}" }
end
895
+
896
+ ###
897
+ # Recon extractor
898
+ ###
899
+
900
# Builds a place for every +mods:placeTerm+ of every manuscript part.
#
# A ';;' separator inside a place term is rewritten as ', ' in the
# recorded value.
#
# @param [Object] record the record to extract places from
# @return [Array<DS::Extractor::Place>] the extracted places
def extract_places(record)
  find_parts(record).flat_map do |part|
    part.xpath('mods:mods/mods:originInfo/mods:place/mods:placeTerm').map do |place_term|
      DS::Extractor::Place.new(as_recorded: place_term.text.split(%r{;;}).join(', '))
    end
  end
end
913
+
914
# Returns every subject for the given record.
#
# @note simply delegates to {#extract_subjects}; the method exists to
#   fulfill the DS::Extractor contract
#
# @param [Object] record the record to extract subjects from
# @return [Array<DS::Extractor::Subject>] the extracted subjects
def extract_all_subjects(record)
  extract_subjects(record)
end
924
+
925
# Builds a 'ds-subject' subject from each +mods:originInfo/mods:edition+
# value, with surrounding whitespace stripped and inner whitespace runs
# collapsed to a single space.
#
# NOTE(review): the XPath starts with '//', which in XPath selects from
# the document root rather than relative to each text; presumably every
# text therefore yields all editions in the document -- confirm whether
# a relative path was intended.
#
# @param [Object] record the record to extract subjects from
# @return [Array<DS::Extractor::Subject>] the extracted subjects
def extract_subjects(record)
  find_texts(record).flat_map do |text|
    text.xpath('//mods:originInfo/mods:edition').map do |edition|
      recorded = edition.text.strip.gsub(/\s+/, ' ')
      DS::Extractor::Subject.new(as_recorded: recorded, vocab: 'ds-subject')
    end
  end
end
938
+
939
+ ###
940
+ # METS structMap extraction
941
+ #
942
+ # Extract mods:mods elements by catalog description level:
943
+ # manuscript, manuscript part, text, page, image
944
+ ###
945
+
946
# Find the manuscript-level description in the XML document.
#
# The manuscript is one div deep in the structMap; the @DMDID of the
# first such div identifies the +mets:dmdSec+ whose +mets:xmlData+ is
# returned.
#
# @param [Nokogiri::XML::Node] xml the parsed XML document
# @return [Nokogiri::XML::NodeSet] the manuscript's +mets:xmlData+
def find_ms(xml)
  dmdid = xml.xpath('/mets:mets/mets:structMap/mets:div/@DMDID').first.text
  xml.xpath("/mets:mets/mets:dmdSec[@ID='#{dmdid}']/mets:mdWrap/mets:xmlData")
end
953
+
954
# Find the manuscript parts in the XML document.
#
# Manuscript parts are two divs deep in the structMap. The order and
# numbering of the +mets:dmdSec+ elements outside the structMap can't be
# relied on, so the @DMDIDs are read from the structMap in document order
# and each one is resolved to its +mets:xmlData+ in turn.
#
# @param [Nokogiri::XML::Node] xml the parsed XML document
# @return [Array<Nokogiri::XML::Node>] an array of manuscript parts in the correct order
def find_parts(xml)
  dmdids = xml.xpath('/mets:mets/mets:structMap/mets:div/mets:div/@DMDID').map { |attr| attr.text }
  dmdids.map do |dmdid|
    xml.xpath("/mets:mets/mets:dmdSec[@ID='#{dmdid}']/mets:mdWrap/mets:xmlData")
  end
end
971
+
972
+
973
# Find the texts in the XML document.
#
# Texts are three divs deep in the structMap; their @DMDIDs are read in
# structMap order and each is resolved to the +mets:xmlData+ of the
# matching +mets:dmdSec+.
#
# @param [Nokogiri::XML::Node] xml the parsed XML document
# @return [Array<Nokogiri::XML::Node>] an array of text nodes in the correct order
def find_texts(xml)
  dmdids = xml.xpath('/mets:mets/mets:structMap/mets:div/mets:div/mets:div/@DMDID').map { |attr| attr.text }
  dmdids.map do |dmdid|
    xml.xpath("/mets:mets/mets:dmdSec[@ID='#{dmdid}']/mets:mdWrap/mets:xmlData")
  end
end
987
+
988
##
# Find the page-level descriptions in the METS document.
#
# Pages are four divs deep in the structMap; their @DMDIDs are read in
# structMap order and the +mets:xmlData+ found for each is collected into
# one flat array.
#
# @param [Nokogiri::XML::Node] xml parsed XML of the METS document
# @return [Array<Nokogiri::XML::Node>] array of the page-level nodes
def find_pages(xml)
  dmdids = xml.xpath('/mets:mets/mets:structMap/mets:div/mets:div/mets:div/mets:div/@DMDID').map { |attr| attr.text }
  dmdids.flat_map do |dmdid|
    xml.xpath("/mets:mets/mets:dmdSec[@ID='#{dmdid}']/mets:mdWrap/mets:xmlData")
  end
end
1003
+
1004
# The date on which the source was last modified.
#
# DS METS records all share one fixed date, 2021-10-01.
#
# @return [String] "2021-10-01"
def source_modified
  '2021-10-01'
end
1010
+
1011
+ protected
1012
+
1013
# Builds the lookup key for an IIIF manifest from the holder's QID and
# the shelfmark.
#
# @param holder [String] the holder of the IIIF manifest
# @param shelfmark [String] the shelfmark of the IIIF manifest
# @return [String] the normalized key for the IIIF manifest
# @raise [DSError] if DS::Institutions has no QID for +holder+
def iiif_manifest_key(holder, shelfmark)
  qid = DS::Institutions.find_qid(holder)
  return normalize_key(qid, shelfmark) unless qid.blank?

  raise DSError, "No QID found for #{holder}"
end
1023
+
1024
+
1025
# Builds a normalized key: the strings are concatenated, downcased, and
# stripped of all whitespace.
#
# @param strings [Array<String>] the strings to join and normalize
# @return [String] the normalized key
def normalize_key(*strings)
  joined = strings.join
  joined.downcase.gsub(%r{\s+}, '')
end
1031
+
1032
# Cleans a list of notes.
#
# Each note is converted to a string, stripped, and has its whitespace
# runs collapsed to single spaces; duplicates are dropped; any note
# containing 'lang:' at a word boundary (case-insensitive) is skipped;
# the remaining notes are passed to +DS::Util.terminate+ with a '.'
# terminator (presumably appending a period where terminal punctuation
# is missing -- confirm against DS::Util).
#
# @param notes [Array<String>] the array of notes to be cleaned and processed
# @return [Array<String>] the cleaned and processed notes as an array
def clean_notes(notes)
  squeezed = notes.flat_map { |note| note.to_s.strip.gsub(%r{\s+}, ' ') }
  kept = squeezed.uniq.reject { |note| note.to_s =~ %r{\blang:\s*}i }
  kept.map { |note| DS::Util.terminate(note, terminator: '.', force: true) }
end
1048
+
1049
+ @@ark_cache = nil
1050
+
1051
##
# Rather than follow the ARK URLs to retrieve the locations, use a
# cache that maps the ARKs to the TIFF filenames.
#
# Cache file format (one 'ark|filename' pair per line):
#
#     http://nma.berkeley.edu/ark:/28722/bk00091894z|dummy_MoConA_0000068.tif
#     http://nma.berkeley.edu/ark:/28722/bk00091895h|dummy_MoConA_0000069.tif
#     http://nma.berkeley.edu/ark:/28722/bk000918b51|dummy_MoConA_0000070.tif
#     http://nma.berkeley.edu/ark:/28722/bk000918b6k|dummy_MoConA_0000071.tif
#
# On the first call the file is read into a hash that maps each URL to its
# file name; the hash is kept in +@@ark_cache+ and reused afterwards.
#
# @param [String] address the ark URL; e.g.,
#   +http://nma.berkeley.edu/ark:/28722/bk000919772+
# @return [String, nil] the filename associated with +address+ or +nil+
def search_ark_cache(address)
  if @@ark_cache.nil?
    STDERR.puts "Creating ARK cache"
    # The gem ships the cache at lib/ds/data/berkeley-arks.txt while this
    # extractor sits in lib/ds/extractor/, so the data directory is a
    # sibling of this file's directory. (Resolving '../data' against
    # __FILE__ pointed at lib/ds/extractor/data/, which does not exist.)
    path = File.expand_path '../data/berkeley-arks.txt', __dir__
    @@ark_cache = File.readlines(path).each_with_object({}) { |line, cache|
      ark, filename = line.strip.split '|'
      next if ark.nil? # blank line

      cache[ark] = filename
    }
  end
  @@ark_cache[address]
end
1078
+
1079
##
# Find the TIFF filename for a DS ARK URL (e.g.,
# +http://nma.berkeley.edu/ark:/28722/bk000855n2z+).
#
# The ARK cache ({#search_ark_cache}) is consulted first. On a miss the
# URL is requested: the image itself can't be fetched, but the filename
# can be read from the redirect 'location' header. As soon as a location
# ends in +.tif+ its basename is returned; otherwise the redirect is
# followed.
#
# The number of redirects is limited to 4 to prevent infinite recursion;
# the filename is expected to be available on the first call, so any
# recursive call logs a warning to STDERR.
#
# @param [String] address ARK address of an image file
# @param [Integer] limit decrementing count of recursive calls; stops
#   at +0+
# @return [String, nil] the basename of the first +.tif+ file encountered,
#   or +nil+ if the limit is reached or a response has no location header
def locate_filename(address, limit = 4)
  # Before hitting the web, try the ARK/URL to FILE cache
  cached = search_ark_cache(address)
  return cached if cached

  STDERR.puts "WARNING -- recursion: location='#{address}', limit=#{limit}" if limit < 4
  return if limit == 0

  response = Net::HTTP.get_response(URI(address))
  location = response['location']
  return if location.nil?
  return File.basename(URI(location).path) if location =~ %r{\.tif$}

  # not a TIFF yet: follow the redirect
  locate_filename(location, limit - 1)
end
1109
+ end
1110
+
1111
+ self.extend ClassMethods
1112
+ end
1113
+ end
1114
+ end