ds-convert 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +294 -0
- data/Rakefile +12 -0
- data/config/settings.yml +150 -0
- data/exe/ds-convert +149 -0
- data/exe/ds-recon +275 -0
- data/exe/ds-validate-csv +40 -0
- data/exe/marc-mrc-to-xml.rb +80 -0
- data/lib/ds/cli.rb +102 -0
- data/lib/ds/constants.rb +166 -0
- data/lib/ds/converter/converter.rb +124 -0
- data/lib/ds/converter/writer.rb +50 -0
- data/lib/ds/converter.rb +7 -0
- data/lib/ds/csv_util.rb +43 -0
- data/lib/ds/data/berkeley-arks.txt +4000 -0
- data/lib/ds/data/getty-aat-centuries.csv +71 -0
- data/lib/ds/data/iiif_manifests.csv +122 -0
- data/lib/ds/data/legacy-iiif-manifests.csv +77 -0
- data/lib/ds/ds_error.rb +1 -0
- data/lib/ds/extractor/base_record_locator.rb +24 -0
- data/lib/ds/extractor/base_term.rb +79 -0
- data/lib/ds/extractor/csv_record_locator.rb +13 -0
- data/lib/ds/extractor/ds_csv_extractor.rb +695 -0
- data/lib/ds/extractor/ds_mets_xml_extractor.rb +1114 -0
- data/lib/ds/extractor/genre.rb +45 -0
- data/lib/ds/extractor/language.rb +31 -0
- data/lib/ds/extractor/marc_xml_extractor.rb +1172 -0
- data/lib/ds/extractor/material.rb +12 -0
- data/lib/ds/extractor/name.rb +50 -0
- data/lib/ds/extractor/place.rb +11 -0
- data/lib/ds/extractor/subject.rb +58 -0
- data/lib/ds/extractor/tei_xml_extractor.rb +687 -0
- data/lib/ds/extractor/title.rb +52 -0
- data/lib/ds/extractor/xml_record_locator.rb +38 -0
- data/lib/ds/extractor.rb +24 -0
- data/lib/ds/institutions.rb +55 -0
- data/lib/ds/manifest/base_id_validator.rb +76 -0
- data/lib/ds/manifest/constants.rb +67 -0
- data/lib/ds/manifest/ds_csv_id_validator.rb +15 -0
- data/lib/ds/manifest/entry.rb +133 -0
- data/lib/ds/manifest/manifest.rb +74 -0
- data/lib/ds/manifest/manifest_validator.rb +256 -0
- data/lib/ds/manifest/simple_xml_id_validator.rb +42 -0
- data/lib/ds/manifest.rb +30 -0
- data/lib/ds/mapper/base_mapper.rb +221 -0
- data/lib/ds/mapper/ds_csv_mapper.rb +77 -0
- data/lib/ds/mapper/ds_mets_mapper.rb +85 -0
- data/lib/ds/mapper/marc_mapper.rb +87 -0
- data/lib/ds/mapper/tei_xml_mapper.rb +79 -0
- data/lib/ds/mapper.rb +13 -0
- data/lib/ds/recon/constants.rb +56 -0
- data/lib/ds/recon/ds_csv_enumerator.rb +16 -0
- data/lib/ds/recon/ds_mets_xml_enumerator.rb +14 -0
- data/lib/ds/recon/marc_xml_enumerator.rb +15 -0
- data/lib/ds/recon/recon_builder.rb +183 -0
- data/lib/ds/recon/recon_data.rb +37 -0
- data/lib/ds/recon/recon_manager.rb +92 -0
- data/lib/ds/recon/source_enumerator.rb +21 -0
- data/lib/ds/recon/tei_xml_enumerator.rb +14 -0
- data/lib/ds/recon/type/all_subjects.rb +18 -0
- data/lib/ds/recon/type/genres.rb +50 -0
- data/lib/ds/recon/type/languages.rb +38 -0
- data/lib/ds/recon/type/materials.rb +40 -0
- data/lib/ds/recon/type/named_subjects.rb +20 -0
- data/lib/ds/recon/type/names.rb +65 -0
- data/lib/ds/recon/type/places.rb +40 -0
- data/lib/ds/recon/type/recon_type.rb +136 -0
- data/lib/ds/recon/type/splits.rb +34 -0
- data/lib/ds/recon/type/subjects.rb +65 -0
- data/lib/ds/recon/type/titles.rb +38 -0
- data/lib/ds/recon/url_lookup.rb +52 -0
- data/lib/ds/recon.rb +292 -0
- data/lib/ds/source/base_source.rb +32 -0
- data/lib/ds/source/ds_csv.rb +18 -0
- data/lib/ds/source/ds_mets_xml.rb +20 -0
- data/lib/ds/source/marc_xml.rb +22 -0
- data/lib/ds/source/source_cache.rb +69 -0
- data/lib/ds/source/tei_xml.rb +22 -0
- data/lib/ds/source.rb +20 -0
- data/lib/ds/util/cache.rb +111 -0
- data/lib/ds/util/csv_validator.rb +209 -0
- data/lib/ds/util/csv_writer.rb +42 -0
- data/lib/ds/util/strings.rb +194 -0
- data/lib/ds/util.rb +37 -0
- data/lib/ds/version.rb +5 -0
- data/lib/ds.rb +237 -0
- metadata +246 -0
data/lib/ds/extractor/ds_mets_xml_extractor.rb
@@ -0,0 +1,1114 @@
require 'net/http'
require 'nokogiri'
require 'csv'

##
# Module with class methods for working with DS10 METS XML.
module DS
  module Extractor
    module DsMetsXmlExtractor
      module ClassMethods

        NS = {
          mods: 'http://www.loc.gov/mods/v3',
          mets: 'http://www.loc.gov/METS/',
        }

        def extract_cataloging_convention xml
          'ds-mets'
        end

        # Extracts the institution name from the given XML document.
        #
        # @param [Nokogiri::XML::Node] xml the XML document to extract the institution name from
        # @return [String] the extracted institution name
        def extract_institution_name xml
          extract_mets_creator(xml).first
        end

        # Extracts the creator information from the METS XML document.
        #
        # @param [Nokogiri::XML::Node] xml the XML document containing METS data
        # @return [Array<String>] an array of creator information
        def extract_mets_creator xml
          creator = xml.xpath('/mets:mets/mets:metsHdr/mets:agent[@ROLE="CREATOR" and @TYPE="ORGANIZATION"]/mets:name', NS).text
          creator.split %r{;;}
        end
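
        # Illustrative usage (a minimal sketch; the organization names here are
        # hypothetical): a mets:name value of
        # "University of Pennsylvania;;Schoenberg Center" splits on ';;', so
        #
        #   extract_mets_creator(xml)      # => ["University of Pennsylvania", "Schoenberg Center"]
        #   extract_institution_name(xml)  # => "University of Pennsylvania"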

        ##
        # Extract and format all the physical description values for the
        # manuscript and each part.
        #
        # # MS Note Phys desc
        #
        # - presentation -> Binding
        #
        # # MS Part phys description
        #
        # - support -- accounted for as support
        #
        # - marks - 'Watermarks'
        # - medium -> 'Music'
        # - physical description -> 'Other decoration'
        # - physical details -> 'Figurative details'
        # - script -> 'Script'
        # - technique -> 'Layout'
        #
        # @param [Nokogiri::XML::Node] xml the document's xml
        # @return [Array] the physical description values
        def extract_physical_description xml
          physdesc = []
          physdesc += extract_ms_phys_desc xml
          physdesc += extract_part_phys_desc xml
          physdesc.flatten!

          clean_notes physdesc
        end

        # Extracts the physical description notes from the given node based on the note type and optional tag.
        #
        # @param [Nokogiri::XML::Node] node the XML node to extract notes from
        # @param [Symbol] note_type the type of note to extract
        # @param [String] tag an optional tag to prepend to each extracted note
        # @return [Array<String>] an array of extracted notes
        def physdesc_note node, note_type, tag: nil
          if note_type == :none
            xpath = %q{mods:mods/mods:physicalDescription/mods:note[not(@type)]}
          else
            xpath = %Q{mods:mods/mods:physicalDescription/mods:note[@type = '#{note_type}']}
          end

          node.xpath(xpath).map { |x|
            tag.nil? ? x.text : "#{tag}: #{x.text}"
          }
        end
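
        # Illustrative sketch (values are hypothetical): for note_type 'script'
        # and tag 'Script, ff. 1-12', the XPath built above is
        #
        #   mods:mods/mods:physicalDescription/mods:note[@type = 'script']
        #
        # and each matching note is returned as "Script, ff. 1-12: <note text>".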



        def extract_ms_phys_desc xml
          ms = find_ms xml
          physdesc_note ms, 'presentation', tag: 'Binding'
        end

        # Extracts physical description notes from the given part object.
        #
        # @param [Nokogiri::XML::Node] part the XML node representing the part
        # @return [Array<String>] an array of extracted physical description notes
        def extract_pd_note part
          extent = extract_extent part

          xpath = %q{mods:mods/mods:physicalDescription/mods:note[@type = 'physical description']/text()}
          part.xpath(xpath).flat_map { |node|
            text = node.text
            notes = []
            if text =~ %r{;;}
              other_deco, num_scribes = text.split %r{;;+}
              notes << "Other decoration, #{extent}: #{other_deco}" unless other_deco.blank?
              notes << "Number of scribes, #{extent}: #{num_scribes}" unless num_scribes.blank?
            else
              notes << "Other decoration, #{extent}: #{text}" unless text.empty?
            end
            notes
          }
        end

        # Extracts physical description notes for each part in the XML.
        #
        # @param [Nokogiri::XML::Node] xml the XML node to extract parts from
        # @return [Array<String>] an array of extracted physical description notes
        def extract_part_phys_desc xml
          parts = find_parts xml
          parts.flat_map { |part|
            extent = extract_extent part
            notes = []

            tag = "Figurative details, #{extent}"
            notes += physdesc_note part, 'physical details', tag: tag
            notes += extract_pd_note part
            tag = "Script, #{extent}"
            notes += physdesc_note part, 'script', tag: tag
            tag = "Music, #{extent}"
            notes += physdesc_note part, 'medium', tag: tag
            tag = "Layout, #{extent}"
            notes += physdesc_note part, 'technique', tag: tag
            tag = "Watermarks, #{extent}"
            notes += physdesc_note part, 'marks', tag: tag
            notes
          }
        end

        ##
        # DS 1.0 METS note types:
        #
        # # MS Note types:
        #
        # Accounted for
        # - ownership -- accounted for, former owner
        # - action -- skip; administrative note: "Inputter ...."
        # - admin -- acknowledgments
        # - untyped -- 'Manuscript Note'
        # - bibliography -- 'Bibliography'
        # - source note -- skip; not present on DS legacy pages
        #
        #
        # # MS Note Phys desc
        #
        # - presentation -> Binding
        #
        # # Part note types:
        #
        # - date - already accounted for
        # - content - skip
        # - admin - Acknowledgments
        #
        # - untyped
        #
        # # MS Part phys description
        #
        # - support -- accounted for as support
        #
        # - marks - 'Watermarks'
        # - medium -> 'Music'
        # - physical description -> 'Other decoration'
        # - physical details -> 'Figurative details'
        # - script -> 'Script'
        # - technique -> 'Layout'
        #
        # # Text note types
        #
        # Accounted for
        # - admin - acknowledgments
        #
        # - condition -> 'Status of text'
        # - content -> handled as Text Incipit
        # - untyped -> 'Text note'
        #
        # # Page note types
        #
        # Accounted for
        # None
        #
        # - content -> Folio Incipit
        # - date -- skip
        # - untyped -> 'Folio note'
        #
        def note_by_type node, note_type, tag: nil
          if note_type == :none
            xpath = %q{mods:mods/mods:note[not(@type)]/text()}
          else
            xpath = %Q{mods:mods/mods:note[@type = '#{note_type}']/text()}
          end

          node.xpath(xpath).map { |x|
            tag.nil? ? x.text : "#{tag}: #{x.text}"
          }
        end

        # Extracts the extent from the given node.
        #
        # @param [Nokogiri::XML::Node] node the XML node to extract extent from
        # @return [String] the extracted extent
        def extract_extent node
          xpath = 'mods:mods/mods:physicalDescription/mods:extent'
          node.xpath(xpath).flat_map { |extent|
            extent.text.split(%r{;;}).first
          }.join ', '
        end
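
        # Illustrative sketch (the extent value is hypothetical): only the text
        # before ';;' in each mods:extent is kept, so an extent recorded as
        # "ff. 1-24;;24 leaves" yields
        #
        #   extract_extent(part)  # => "ff. 1-24"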

        # Extracts the material as recorded from the given record.
        #
        # @param [CSV::Row] record the record to extract material from
        # @return [String] the extracted material as recorded
        def extract_material_as_recorded record
          extract_materials(record).map(&:as_recorded).join '|'
        end

        # Extracts materials from the given record.
        #
        # @param [Object] record the record to extract materials from
        # @return [Array<DS::Extractor::Material>] an array of Material objects
        def extract_materials record
          find_parts(record).flat_map { |part|
            physdesc_note part, 'support'
          }.map { |s|
            s.downcase.chomp('.').strip
          }.uniq.map { |as_recorded|
            DS::Extractor::Material.new as_recorded: as_recorded
          }
        end

        # Extracts former owners as recorded from the given XML.
        #
        # @param [Nokogiri::XML::NodeSet] xml the parsed XML to extract former owners from
        # @param [Boolean] lookup_split whether to lookup split information or not
        # @return [Array<String>] the extracted former owners as recorded
        def extract_former_owners_as_recorded xml, lookup_split: true
          extract_former_owners(xml).map &:as_recorded
        end

        # Extracts former owners from the given record.
        #
        # @param [Nokogiri::XML::Node] record the XML node representing the record
        # @return [Array<DS::Extractor::Name>] an array of extracted former owners
        def extract_former_owners record
          xpath = "./descendant::mods:note[@type='ownership']/text()"
          notes = clean_notes(record.xpath(xpath).flat_map(&:text))

          notes.flat_map { |n|
            splits = Recon::Type::Splits._lookup_single(n, from_column: 'authorized_label')
            splits.present? ? splits.split('|') : n
          }.map { |n|
            DS::Extractor::Name.new as_recorded: DS.mark_long(n), role: 'former owner'
          }
        end

        # Extracts authors from the given record.
        #
        # @param [Object] record the record to extract authors from
        # @return [Array<DS::Extractor::Name>] an array of extracted authors
        def extract_authors record
          DS::Extractor::DsMetsXmlExtractor.extract_name record, *%w{ author [author] }
        end

        # Extracts authors as recorded from the given record.
        #
        # @param [Object] record the record to extract authors from
        # @return [Array<String>] the extracted authors as recorded
        def extract_authors_as_recorded record
          extract_authors(record).map &:as_recorded
        end

        # Extracts artists as recorded from the given record.
        #
        # @param [Object] record the record to extract artists from
        def extract_artists_as_recorded record
          extract_artists(record).map &:as_recorded
        end

        # Extracts artists from the given record using the specified type and role.
        #
        # @param [Object] record the record to extract artists from
        # @return [Array<DS::Extractor::Name>] an array of extracted artists
        def extract_artists record
          DS::Extractor::DsMetsXmlExtractor.extract_name record, *%w{ artist [artist] illuminator }
        end

        # Extracts scribes as recorded from the given record.
        #
        # @param [Object] record the record to extract scribes from
        # @return [Array<String>] the extracted scribes as recorded
        def extract_scribes_as_recorded record
          extract_scribes(record).map &:as_recorded
        end

        # Extract scribes from the given record.
        #
        # @param record [Object] the record to extract scribes from
        # @return [Array<String>] the extracted scribes
        def extract_scribes record
          DS::Extractor::DsMetsXmlExtractor.extract_name record, *%w{ scribe [scribe] }
        end

        # Extract other names as recorded from the given record.
        #
        # @param record [Object] the record to extract other names from
        # @return [Array<String>] the extracted other names as recorded
        def extract_other_names_as_recorded record
          extract_associated_agents(record).map &:as_recorded
        end

        # Extract other names from the given record.
        #
        # @param record [Object] the record to extract other names from
        # @return [Array<String>] the extracted other names
        def extract_associated_agents record
          DS::Extractor::DsMetsXmlExtractor.extract_name record, 'other'
        end

        ##
        # Return a list of unique languages from the text-level <mods:note>s
        # that start with "lang:" (case-insensitive), joined with separator;
        # so, "Latin", rather than "Latin|Latin|Latin", etc.
        #
        # @return [String]
        def extract_languages_as_recorded record
          extract_languages(record).map &:as_recorded
        end

        # Extract languages from the given record.
        #
        # @param record [Object] the record to extract languages from
        # @return [Array<DS::Extractor::Language>] the extracted languages
        def extract_languages record
          # /mets:mets/mets:dmdSec/mets:mdWrap/mets:xmlData/mods:mods/mods:note
          # Can be Lang: or lang: or ???, so down case the text with translate()
          xpath = './descendant::mods:note[starts-with(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "lang:")]'
          find_texts(record).flat_map { |text|
            text.xpath(xpath).map { |note| note.text.sub(%r{^lang:\s*}i, '') }
          }.uniq.map { |as_recorded|
            DS::Extractor::Language.new as_recorded: as_recorded
          }
        end
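
        # Illustrative sketch (the note values are hypothetical): text-level
        # notes "Lang: Latin" and "lang: Latin" both match the case-folded
        # starts-with() test, have the prefix stripped, and collapse to one
        # language, so
        #
        #   extract_languages_as_recorded(record)  # => ["Latin"]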

        # Extract name from the given node based on the provided roles.
        #
        # @param node [Object] the node to extract name from
        # @param roles [Array<String>] the roles to search for
        # @return [Array<DS::Extractor::Name>] the extracted names
        def extract_name node, *roles
          # Roles have different cases: Author, author, etc.
          # Xpath 1.0 has no lower-case function, so use translate()
          translate = "translate(./mods:role/mods:roleTerm/text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')"
          props = roles.map { |r| "#{translate} = '#{r}'" }.join ' or '
          xpath = "./descendant::mods:name[#{props}]"
          node.xpath(xpath).flat_map { |name|
            name.xpath('mods:namePart').text.split %r{\s*;\s*}
          }.uniq.map { |as_recorded|
            DS::Extractor::Name.new as_recorded: as_recorded, role: roles.first
          }
        end
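
        # Illustrative sketch: for roles 'author' and '[author]', the predicate
        # built above lower-cases each mods:roleTerm with translate() and tests
        # it against each role, so role terms recorded as "Author" or "[Author]"
        # still match:
        #
        #   extract_name(record, 'author', '[author]').map(&:as_recorded)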

        # Extract titles as recorded from the given record.
        #
        # @param record [Object] the record to extract titles from
        # @return [Array<String>] the extracted titles as recorded
        def extract_titles_as_recorded record
          extract_titles(record).map &:as_recorded
        end

        # Extract titles from the given record.
        #
        # @param record [Object] the record to extract titles from
        # @return [Array<DS::Extractor::Title>] the extracted titles
        def extract_titles record
          xpath = 'mods:mods/mods:titleInfo/mods:title'
          find_texts(record).flat_map { |text|
            text.xpath(xpath).map(&:text)
          }.reject {
            |t| t == '[Title not supplied]'
          }.map { |as_recorded|
            DS::Extractor::Title.new as_recorded: as_recorded
          }
        end

        # Extract production places as recorded from the given XML.
        #
        # @param xml [Object] the XML to extract production places from
        # @return [Array<String>] the extracted production places as recorded
        def extract_production_places_as_recorded xml
          extract_places(xml).map &:as_recorded
        end

        ##
        # Extract the places of production for reconciliation CSV output.
        #
        # Returns a two-dimensional array; each row is a place and has one
        # column, the place name; for example:
        #
        #   [["Austria"],
        #    ["Germany"],
        #    ["France (?)"]]
        #
        # @param [Nokogiri::XML:Node] xml a +<METS_XML>+ node
        # @return [Array<Array>] an array of arrays of values
        def extract_recon_places xml
          extract_places(xml).map &:to_a
        end

        # Extract reconciliation titles from the given XML.
        #
        # @param xml [Nokogiri::XML::Node] a +<METS_XML>+ node
        # @return [Array<String>] an array of titles for reconciliation
        def extract_recon_titles xml
          extract_titles(xml).to_a
        end

        # Extract reconciliation names from the given XML.
        #
        # @param xml [Nokogiri::XML::Node] a +<METS_XML>+ node
        # @return [Array<Array>] an array of arrays of names for reconciliation
        def extract_recon_names xml
          data = extract_authors(xml).map &:to_a
          data += extract_artists(xml).map &:to_a
          data += extract_scribes(xml).map &:to_a
          data += extract_former_owners(xml).map &:to_a
          data += extract_associated_agents(xml).map &:to_a
          data
        end

        ##
        # Extract acknowledgments, notes, physical descriptions, and
        # former owners; return all strings that start with SPLIT:,
        # remove 'SPLIT: ' and return an array of arrays that can
        # be treated as rows by Recon::Type::Splits
        def extract_recon_splits xml
          data = []
          data += DS::Extractor::DsMetsXmlExtractor.extract_former_owners_as_recorded xml, lookup_split: false
          data.flatten.select { |d| d.to_s.size >= 400 }.map { |d| [d.strip] }
        end

        ##
        # For the legacy DS METS, the value of
        # +mods:identifier[@type="local"]+ is the shelf mark. If there are other
        # ID types, we can't distinguish them from shelfmarks.
        #
        # @param [Nokogiri::XML:Node] xml a +<METS_XML>+ node
        # @return [String] the shelfmark
        def extract_shelfmark xml
          ms = find_ms xml
          ms.xpath('mods:mods/mods:identifier[@type="local"]/text()').text
        end

        def extract_genres xml
          []
        end

        ##
        # See the note for [Recon::Type::Subjects]: Each source subject extraction
        # method should return a two-dimensional array:
        #
        #   [["Islamic law--Early works to 1800", ""],
        #    ["Malikites--Early works to 1800", ""],
        #    ["Islamic law", ""],
        #    ["Malikites", ""],
        #    ["Arabic language--Grammar--Early works to 1800", ""],
        #    ["Arabic language--Grammar", ""],
        #    ...
        #   ]
        #
        # The second value is for those cases where the source provides an
        # authority URI. The METS records don't give a URI, so this method always
        # returns the empty string for the second value.
        #
        # @param [Nokogiri::XML:Node] xml a +<METS_XML>+ node
        # @return [Array<String,String>] a two-dimensional array of subject and URI
        def extract_recon_subjects xml
          extract_subjects(xml).map &:to_a
        end

        ##
        # Extract subjects, the `mods:originInfo/mods:edition` values for each
        # text. For example,
        #
        #   <mods:originInfo>
        #     <mods:edition>Alexander, de Villa Dei.</mods:edition>
        #     <mods:edition>Latin language--Grammar.</mods:edition>
        #     <mods:edition>Latin poetry, Medieval and modern.</mods:edition>
        #     <mods:edition>Manuscripts, Medieval--Connecticut--New Haven.</mods:edition>
        #   </mods:originInfo>
        #
        # @param [Nokogiri::XML:Node] xml a +<METS_XML>+ node
        # @return [Array<String>] an array of subjects
        def extract_subjects_as_recorded xml
          extract_subjects(xml).map(&:as_recorded)
        end

        # Extract all subjects as recorded from the given XML.
        #
        # @param xml [Nokogiri::XML::Node] the XML to extract subjects from
        # @return [Array<String>] the extracted subjects as recorded
        def extract_all_subjects_as_recorded xml
          extract_subjects_as_recorded xml
        end

        # Extract link to institution record from the given XML.
        #
        # @param xml [Nokogiri::XML::Node] the XML to extract the link from
        # @return [String] the extracted link to the institution record
        def extract_link_to_inst_record xml
          ms = find_ms xml
          # xpath mods:mods/mods:relatedItem/mods:location/mods:url
          xpath = "mods:mods/mods:relatedItem/mods:location/mods:url"
          ms.xpath(xpath).map(&:text).join '|'
        end

        # Determines if the XML document is dated by a scribe.
        #
        # @param [Nokogiri::XML:Node] xml the XML document to check
        # @return [Boolean] true if the document is dated by a scribe, false otherwise
        def dated_by_scribe? xml
          parts = find_parts xml
          # mods:mods/mods:note
          xpath = 'mods:mods/mods:note[@type="date"]'
          parts.any? { |part|
            part.xpath(xpath).text.upcase == 'Y'
          }
        end

        ##
        # Return as a single string all the date values for the manuscript. This
        # is a concatenation of the values returned by DS10.extract_date_created,
        # DS10.extract_assigned_date, DS10.extract_date_range.
        #
        # @param [Nokogiri::XML:Node] xml the parsed METS xml document
        # @return [Array<String>] the concatenated date values
        def extract_production_date_as_recorded xml
          find_parts(xml).map { |part|
            date_created = extract_date_created part
            assigned = extract_assigned_date part
            range = extract_date_range_for_part(part).join '-'
            [date_created, assigned, range].uniq.reject(&:empty?).join '; '
          }.reject { |date| date.to_s.strip.empty? }
        end

        ##
        # Extract ranges from `mods:dateCreated` elements where a @point is
        # defined, thus:
        #
        #   <mods:dateCreated point="start" encoding="iso8601">1300</mods:dateCreated>
        #   <mods:dateCreated point="end" encoding="iso8601">1399</mods:dateCreated>
        #
        # @param [Nokogiri::XML:Node] xml the parsed METS xml document
        # @return [Array<String>] the joined start-end date range for each part
        def extract_date_range xml, range_sep:
          find_parts(xml).map { |part|
            extract_date_range_for_part(part).join range_sep
          }
        end

        DATE_START_XPATH = 'mods:mods/mods:originInfo/mods:dateCreated[@point="start"]'
        DATE_END_XPATH = 'mods:mods/mods:originInfo/mods:dateCreated[@point="end"]'

        ##
        # Extract ranges from `mods:dateCreated` elements where a @point is
        # start and end
        #
        # @param [Nokogiri::XML:Node] part a part-level node
        # @return [Array<Integer>] the start and end dates as an array of integers
        def extract_date_range_for_part part
          start_date = part.xpath(DATE_START_XPATH).text
          end_date = part.xpath(DATE_END_XPATH).text
          [start_date, end_date].reject(&:empty?).map(&:to_i)
        end
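
        # Illustrative sketch, using the dateCreated example above: a part with
        # point="start" 1300 and point="end" 1399 yields
        #
        #   extract_date_range_for_part(part)        # => [1300, 1399]
        #   extract_date_range(xml, range_sep: '-')  # => ["1300-1399"]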

        ##
        # Return any date not found in `dateOther` or in a dateCreated date
        # range (see #extract_date_range); thus:
        #
        #   <mods:dateCreated>1537</mods:dateCreated>
        #   <mods:dateCreated>1531</mods:dateCreated>
        #   <mods:dateCreated>14??, October 21</mods:dateCreated>
        #   <mods:dateCreated>1462, July 23</mods:dateCreated>
        #   <mods:dateCreated>1549, November</mods:dateCreated>
        #
        # These values commonly give the date for "dated" manuscripts.
        #
        # @param [Nokogiri::XML:Node] part a part-level node
        # @return [String] the content of any dateCreated without '@point'
        #     defined
        def extract_date_created part
          xpath = 'mods:mods/mods:originInfo/mods:dateCreated[not(@point)]'
          part.xpath(xpath).map(&:text).join ', '
        end

        ##
        # Return dates found in the `dateOther` element, reformatting them as
        # needed. These examples are taken from several METS files.
        #
        #   <mods:dateOther>[ca. 1410]</mods:dateOther>
        #   <mods:dateOther>[between 1100 and 1200]</mods:dateOther>
        #   <mods:dateOther>[between 1450 and 1460]</mods:dateOther>
        #   <mods:dateOther>[between 1450 and 1500]</mods:dateOther>
        #   <mods:dateOther>s. XV#^3/4#</mods:dateOther>
        #   <mods:dateOther>s. XV</mods:dateOther>
        #   <mods:dateOther>s. XVI#^4/4#</mods:dateOther>
        #   <mods:dateOther>s. XVIII#^2/4#</mods:dateOther>
        #   <mods:dateOther>s. XV#^in#</mods:dateOther>
        #
        # Most dateOther values have the format:
        #
        #   s. XVII#^2#
        #
        # The notation #^<VAL># encodes a portion of the string that was presented
        # as superscript on the Berkeley DS site. DS 2.0 doesn't use the
        # superscripts; thus, when it occurs, this portion of the string is
        # reformatted `(<VAL>)`:
        #
        #   s. XVII#^2#  => s. XVII(2)
        #   s. XV#^ex#   => s. XV(ex)
        #   s. XVI#^in#  => s. XVI(in)
        #   s. X#^med#   => s. X(med)
        #   s. XII#^med# => s. XII(med)
        #
        # @param [Nokogiri::XML:Node] part a part-level node
        # @return [String] the date string reformatted as described above
        def extract_assigned_date part
          xpath = 'mods:mods/mods:originInfo/mods:dateOther'
          part.xpath(xpath).text.gsub %r{#\^?([\w/]+)(\^|#)}, '(\1)'
        end
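
        # Illustrative sketch, using the dateOther examples above: the gsub
        # rewrites the superscript notation in place, e.g.
        #
        #   "s. XVII#^2#".gsub(%r{#\^?([\w/]+)(\^|#)}, '(\1)')  # => "s. XVII(2)"
        #   "s. XV#^in#".gsub(%r{#\^?([\w/]+)(\^|#)}, '(\1)')   # => "s. XV(in)"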


        # Extracts acknowledgments from the given XML document.
        #
        # @param [Nokogiri::XML::Node] xml the XML document to extract acknowledgments from
        # @return [Array<String>] the extracted acknowledgments
        def extract_acknowledgments xml
          notes = []
          notes += find_ms(xml).flat_map { |ms| note_by_type ms, 'admin' }

          notes += find_parts(xml).flat_map { |part|
            extent = extract_extent part
            note_by_type part, 'admin', tag: extent
          }

          notes += find_texts(xml).flat_map { |text|
            extent = extract_extent text
            note_by_type text, 'admin', tag: extent
          }

          notes += find_pages(xml).flat_map { |page|
            extent = extract_extent page
            note_by_type page, 'admin', tag: extent
          }

          clean_notes notes
        end

        ##
        # Extract the filename for page. This will be either:
        #
        # * the values for +mods:identifier+ with +@type='filename'+; or
        #
        # * the filenames pointed to by the linked +mets:fptr+ in the
        #   +mets:fileGrp+ with +@USE='image/master'+
        #
        # * an array containing +['NO_FILE']+, if no files are associated with
        #   the page
        #
        # There will almost always be one file, but at least one manuscript has
        # a page with two associated images. Thus, we return an array.
        #
        # @param [Nokogiri::XML::Node] page the +mets:dmdSec+ node for the page
        # @return [Array<String>] array of all the filenames for +page+
        def extract_filenames page
          # mods:mods/mods:identifier[@type='filename']
          xpath = 'mods:mods/mods:identifier[@type="filename"]'
          filenames = page.xpath(xpath).map(&:text)
          return filenames unless filenames.empty?

          # no filename; find the ARK URL for the master image for this page
          extract_master_mets_file page
        end

        # Extracts the folio number from the given page node.
        #
        # @param [Nokogiri::XML::Node] page the XML node representing the page
        # @return [String] the extracted folio number
        def extract_folio_num page
          # mods:mods/mods:physicalDescription/mods:extent
          xpath = 'mods:mods/mods:physicalDescription/mods:extent'
          page.xpath(xpath).map(&:text).join '|'
        end

        ##
        # In some METS files each page has a list of mets:fptr elements; we need
        # to get the @FILEID for the master image, but we don't know which one is
        # for the master. Thus we get all the @FILEIDs.
        #
        #   <mets:structMap>
        #     <mets:div TYPE="text" LABEL="[No Title for Display]" ADMID="RMD1" DMDID="DM1">
        #       <mets:div TYPE="item" LABEL="[No Title for Display]" DMDID="DM2">
        #         <mets:div TYPE="item" LABEL="[No Title for Display]" DMDID="DM3">
        #           <mets:div TYPE="item" LABEL="Music extending into right margin, upper right column." DMDID="DM4">
        #             <mets:fptr FILEID="FID1"/>
        #             <mets:fptr FILEID="FID3"/>
        #             <mets:fptr FILEID="FID5"/>
        #             <mets:fptr FILEID="FID7"/>
        #             <mets:fptr FILEID="FID9"/>
        #           </mets:div>
        #           <!-- snip -->
        #         </mets:div>
        #       </mets:div>
        #     </mets:div>
        #   </mets:structMap>
        #
        # Using the FILEIDs, find the corresponding mets:file in the
        # mets:fileGrp with @USE='image/master'.
        #
        #   <mets:fileGrp USE="image/master">
        #     <mets:file ID="FID1" MIMETYPE="image/tiff" SEQ="1" CREATED="2010-11-08T10:26:20.3" ADMID="ADM1 ADM4" GROUPID="GID1">
        #       <mets:FLocat xlink:href="http://nma.berkeley.edu/ark:/28722/bk0008v1k7q" LOCTYPE="URL"/>
        #     </mets:file>
        #     <mets:file ID="FID2" MIMETYPE="image/tiff" SEQ="2" CREATED="2010-11-08T10:26:20.393" ADMID="ADM1 ADM5" GROUPID="GID2">
        #       <mets:FLocat xlink:href="http://nma.berkeley.edu/ark:/28722/bk0008v1k88" LOCTYPE="URL"/>
        #     </mets:file>
        #   </mets:fileGrp>
        #
        # We then follow the +xlink:href+ to get the filename from the 'location'
        # HTTP header.
        #
        # @param [Nokogiri::XML::Node] page the +mets:dmdSec+ node for the page
        # @return [Array<String>] array of all the filenames for +page+
        def extract_master_mets_file page
          dmdid = page['ID']
          # all the mets:fptr @FILEIDs for this page
          xpath = %Q{//mets:structMap/descendant::mets:div[@DMDID='#{dmdid}']/mets:fptr/@FILEID}

          # create an OR query because we don't know which FILEID is for the
          # master mets:file:
          # "@ID = 'FID1' or @ID = 'FID3' or @ID = 'FID5' ... etc."
          id_query = page.xpath(xpath).map(&:text).map { |id| "@ID='#{id}'" }.join ' or '
          return ['NO_FILE'] if id_query.strip.empty? # there is no associated mets:fptr

          # the @xlink:href is the Berkeley ARK address; e.g., http://nma.berkeley.edu/ark:/28722/bk0008v1k88
          xpath = "//mets:fileGrp[@USE='image/master']/mets:file[#{id_query}]/mets:FLocat/@xlink:href"
          fptr_addresses = page.xpath(xpath).map &:text
          return ['NO_FILE'] if fptr_addresses.empty? # I don't know if this happens, but just in case...

          # for each ARK address, find the TIFF filename
          fptr_addresses.map { |address| locate_filename address }
        end
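
        # Illustrative sketch, following the structMap example above: a page
        # whose fptrs are FID1, FID3, and FID5 produces the query
        #
        #   "@ID='FID1' or @ID='FID3' or @ID='FID5'"
        #
        # which selects the matching mets:file in the image/master fileGrp; its
        # @xlink:href ARK is then resolved to a TIFF basename by locate_filename.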

        # Extracts the manuscript note from the given XML.
        #
        # @param [Nokogiri::XML::Node] xml the XML node to extract manuscript note from
        # @return [Array<String>] an array of manuscript notes
        def extract_ms_note xml
          notes = []
          ms = find_ms xml
          notes += note_by_type ms, :none, tag: 'Manuscript note'
          notes += note_by_type ms, 'bibliography', tag: 'Bibliography'
          notes
        end

        # Extracts notes for each part in the given XML.
        #
        # @param [Nokogiri::XML::Node] xml the XML node to extract notes from
        # @return [Array<String>] an array of extracted notes
        def extract_part_note xml
          find_parts(xml).flat_map { |part|
            extent = extract_extent part
            note_by_type part, :none, tag: extent
          }
        end

        # Extracts explicit information from the given node based on the provided tag.
        #
        # @param [Nokogiri::XML::Node] node the XML node to extract information from
        # @param [String] tag the tag to prepend to each extracted value
        # @return [Array<String>] an array of extracted information
        def extract_explicit node, tag:
          node.xpath('mods:mods/mods:abstract/text()').map { |n|
            "#{tag}: #{n.text}"
          }
        end

        # Extracts text notes from the given XML document.
        #
        # @param [Nokogiri::XML::Node] xml the XML document to extract text notes from
        # @return [Array<String>] the extracted text notes
        def extract_text_note xml
          find_texts(xml).flat_map { |text|
            extent = extract_extent text
            notes = []
            notes += note_by_type text, :none, tag: extent
            notes += note_by_type text, 'condition', tag: "Status of text, #{extent}"
            notes += note_by_type text, 'content', tag: "Incipit, #{extent}"
            notes += extract_explicit text, tag: "Explicit, #{extent}"
            notes
          }
        end

        # Extracts notes for each page in the given XML.
        #
        # @param [Nokogiri::XML::Node] xml the XML node to extract notes from
        # @return [Array<String>] an array of extracted notes
        def extract_page_note xml
          find_pages(xml).flat_map { |page|
            extent = extract_extent page
            notes = []
            notes += note_by_type page, :none, tag: extent
            notes += note_by_type page, 'content', tag: "Incipit, #{extent}"
            notes += extract_explicit page, tag: "Explicit, #{extent}"
            notes
          }
        end

        ##
        # Extract the notes at all levels from the +xml+, and return
        # an array of strings
        #
        # @param [Nokogiri::XML::Node] xml the document's xml
        # @return [Array<String>] the note values
        def extract_notes xml
          notes = []
          # get all notes that don't have @type
          xpath = %q{//mods:note[not(@type)]/text()}
          notes += extract_ms_note xml
          notes += extract_part_note xml
          notes += extract_text_note xml
          notes += extract_docket xml
          notes += extract_page_note xml

          clean_notes notes
        end

        ##
        # **If** the +mods:mods+ element has a
        # <tt><mods:titleInfo type="alternative"></tt> element **and** a
        # <tt><mods:abstract[not(@displayLabel)]></tt>, **then** the content of
        # the <tt><mods:abstract[not(@displayLabel)]></tt> is an incipit; XPath:
        #
        #   //mods:mods[./mods:titleInfo[@type="alternative"] and ./mods:abstract[not(@displayLabel)]]
        #
        #   //mods:mods[./mods:titleInfo[@type="alternative"]]/mods:abstract[not(@displayLabel)]/text()
        #
        # **If** the `mods:mods` element has a `mods:titleInfo type="alternative"`
        # element **and** a `<mods:note type="content">`, **then** the content of
        # the `<mods:note type="content">` is an explicit; XPath:
        #
        #   //mods:mods[./mods:titleInfo[@type="alternative"] and ./mods:note[@type="content"]]
        #
        #   //mods:mods[./mods:titleInfo[@type="alternative"]]/mods:note[@type="content"]/text()
        #
        def extract_incipit_explicit xml
          # ./descendant::mods:physicalDescription
          # mods:mods/mods:originInfo/mods:place/mods:placeTerm
          # find any mods:mods containing an incipit or explicit
          xpath = %q{//mods:mods[./mods:titleInfo[@type="alternative"] and
                       (./mods:abstract[not(@displayLabel)] or
                        ./mods:note[@type="content"])]}

          find_texts(xml).flat_map { |node|
            # return an array of formatted incipits and explicits for this manuscript
            extent = node.xpath('./descendant::mods:physicalDescription/mods:extent/text()', NS).text
            node.xpath('./descendant::mods:abstract[not(@displayLabel)]/text()').map { |inc|
              "Incipit, #{extent}: #{inc}"
            } + node.xpath('./descendant::mods:note[@type="content"]/text()').map { |exp|
              "Explicit, #{extent}: #{exp}"
            }
          }
        end

        ##
        # DS METS can have +mods:abstract+ elements with +@displayLabel="docket"+.
        # Extract these values and return as an array.
        #
        # @param [Nokogiri::XML::Node] xml the document xml
        # @return [Array<String>] the note values
        def extract_docket xml
          xpath = %q{//mods:abstract[@displayLabel = 'docket']/text()}
          xml.xpath(xpath, NS).map { |docket|
            "Docket: #{docket.text}"
          }
        end

        ###
        # Recon extractor
        ###

        # Extracts places from the given record.
        #
        # @param [Object] record the record to extract places from
        # @return [Array<DS::Extractor::Place>] the extracted places
        def extract_places record
          parts = find_parts record
          xpath = 'mods:mods/mods:originInfo/mods:place/mods:placeTerm'
          parts.flat_map { |node|
            node.xpath(xpath).map { |place|
              DS::Extractor::Place.new as_recorded: place.text.split(%r{;;}).join(', ')
            }
          }
        end

        # Extracts all subjects from the given record.
        #
        # @note method returns {#extract_subjects} to fulfill the
        #   DS::Extractor contract
        #
        # @param [Object] record the record to extract subjects from
        # @return [Array<DS::Extractor::Subject>] the extracted subjects
        def extract_all_subjects record
          extract_subjects record
        end

        # Extracts subjects from the given record.
        #
        # @param [Object] record the record to extract subjects from
        # @return [Array<DS::Extractor::Subject>] the extracted subjects
        def extract_subjects record
          xpath = '//mods:originInfo/mods:edition'
          find_texts(record).flat_map { |text|
            text.xpath(xpath).map { |subj|
              as_recorded = subj.text.strip.gsub(/\s+/, ' ')
              DS::Extractor::Subject.new as_recorded: as_recorded, vocab: 'ds-subject'
            }
          }
        end
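
        # Illustrative sketch, using the originInfo example above: the edition
        # value "Latin language--Grammar." becomes
        #
        #   DS::Extractor::Subject.new as_recorded: 'Latin language--Grammar.',
        #                              vocab: 'ds-subject'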

        ###
        # METS structMap extraction
        #
        # Extract mods:mods elements by catalog description level:
        # manuscript, manuscript part, text, page, image
        ###

        def find_ms xml
          # the manuscript is one div deep in the structMap
          # /mets:mets/mets:structMap/mets:div/@DMDID
          xpath = '/mets:mets/mets:structMap/mets:div/@DMDID'
          id = xml.xpath(xpath).first.text
          xml.xpath "/mets:mets/mets:dmdSec[@ID='#{id}']/mets:mdWrap/mets:xmlData"
        end

        # Find the manuscript parts in the XML document.
        #
        # @param [Nokogiri::XML::Node] xml the parsed XML document
        # @return [Array<Nokogiri::XML::Node>] an array of manuscript parts in the correct order
        def find_parts xml
          # /mets:mets/mets:structMap/mets:div/mets:div/@DMDID
          # manuscript parts are two divs deep in the structMap
          # We need to get the IDs in order
          xpath = '/mets:mets/mets:structMap/mets:div/mets:div/@DMDID'
          ids = xml.xpath(xpath).map &:text
          # We can't count on the order or the numbering of the mets:dmdSec
          # elements outside of the structMap. Thus, we have to return an
          # array with the parts mets:dmdSec in the correct order.
          ids.map { |id|
            xml.xpath "/mets:mets/mets:dmdSec[@ID='#{id}']/mets:mdWrap/mets:xmlData"
          }
        end


        # Find the texts in the XML document.
        #
        # @param [Nokogiri::XML::Node] xml the parsed XML document
        # @return [Array<Nokogiri::XML::Node>] an array of text nodes in the correct order
        def find_texts xml
          # /mets:mets/mets:structMap/mets:div/mets:div/mets:div/@DMDID
          # texts are three divs deep in the structMap
          # We need to get the IDs in order
          xpath = '/mets:mets/mets:structMap/mets:div/mets:div/mets:div/@DMDID'
          ids = xml.xpath(xpath).map &:text
          ids.map { |id|
            xml.xpath "/mets:mets/mets:dmdSec[@ID='#{id}']/mets:mdWrap/mets:xmlData"
          }
        end

        ##
        # @param [Nokogiri::XML::Node] xml parsed XML of the METS document
        # @return [Array<Nokogiri::XML::Node>] array of the page-level +mets:dmdSec+
        #     nodes
        def find_pages xml
          # /mets:mets/mets:structMap/mets:div/mets:div/mets:div/mets:div/@DMDID
          # the pages are four divs deep in the structMap
          # We need the IDs in order
          xpath = '/mets:mets/mets:structMap/mets:div/mets:div/mets:div/mets:div/@DMDID'
          ids = xml.xpath(xpath).map &:text
          # collect dmdSec's for all the page IDs
          ids.flat_map { |id|
            xml.xpath "/mets:mets/mets:dmdSec[@ID='#{id}']/mets:mdWrap/mets:xmlData"
          }
        end

        # A method to return the date when the source was last modified.
        # For DS METS we have chosen the date 2021-10-01.
        # @return [String] "2021-10-01"
        def source_modified
          "2021-10-01"
        end

        protected

        # Returns a key for the IIIF manifest based on the holder and shelfmark.
        #
        # @param holder [String] the holder of the IIIF manifest
        # @param shelfmark [String] the shelfmark of the IIIF manifest
        # @return [String] the normalized key for the IIIF manifest
        def iiif_manifest_key holder, shelfmark
          qid = DS::Institutions.find_qid holder
          raise DSError, "No QID found for #{holder}" if qid.blank?
          normalize_key qid, shelfmark
        end


        # Returns a normalized key by joining and downcasing the input strings
        # and removing whitespace.
        #
        # @param strings [Array<String>] the strings to join and normalize
        # @return [String] the normalized key
        def normalize_key *strings
          strings.join.downcase.gsub(%r{\s+}, '')
        end
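
        # Illustrative sketch (the QID and shelfmark are hypothetical):
        #
        #   normalize_key 'Q1234', 'MS Codex 123'  # => "q1234mscodex123"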

        # A method to clean and process notes by removing whitespace, skipping
        # notes with specific prefixes, and adding periods to notes without
        # terminal punctuation.
        #
        # @param notes [Array<String>] the array of notes to be cleaned and processed
        # @return [Array<String>] the cleaned and processed notes as an array
        def clean_notes notes
          notes.flat_map { |note|
            # get node text and clean whitespace
            note.to_s.strip.gsub(%r{\s+}, ' ')
          }.uniq.reject { |note|
            # skip notes with prefixes like 'lang: '
            note.to_s =~ %r{\blang:\s*}i
          }.map { |note|
            # add period to any note without terminal punctuation: .,;:? or !
            DS::Util.terminate(note, terminator: '.', force: true)
          }
        end
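
        # Illustrative sketch (the notes are hypothetical): duplicate and
        # language notes drop out, whitespace collapses, and a terminal period
        # is added:
        #
        #   clean_notes ["Stained  throughout", "Stained throughout", "lang: Latin"]
        #   # => ["Stained throughout."]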

        @@ark_cache = nil

        ##
        # Rather than follow the ARK URLs to retrieve the locations, use a
        # cache that maps the arks to the TIFF filenames.
        #
        # Cache format:
        #
        #   http://nma.berkeley.edu/ark:/28722/bk00091894z|dummy_MoConA_0000068.tif
        #   http://nma.berkeley.edu/ark:/28722/bk00091895h|dummy_MoConA_0000069.tif
        #   http://nma.berkeley.edu/ark:/28722/bk000918b51|dummy_MoConA_0000070.tif
        #   http://nma.berkeley.edu/ark:/28722/bk000918b6k|dummy_MoConA_0000071.tif
        #
        # This method lazily initializes a hash that maps the URL to the file name.
        #
        # @param [String] address the ark URL; e.g.,
        #     +http://nma.berkeley.edu/ark:/28722/bk000919772+
        # @return [String] the filename associated with +address+ or +nil+
        def search_ark_cache address
          if @@ark_cache.nil?
            STDERR.puts "Creating ARK cache"
            path = File.expand_path '../data/berkeley-arks.txt', __FILE__
            @@ark_cache = File.readlines(path).inject({}) { |h, line|
              ark, filename = line.strip.split '|'
              h.update({ ark => filename })
            }
          end
          @@ark_cache[address]
        end
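
        # Illustrative sketch, using the cache lines shown above:
        #
        #   search_ark_cache 'http://nma.berkeley.edu/ark:/28722/bk00091894z'
        #   # => "dummy_MoConA_0000068.tif"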

        ##
        # Extract filename by following DS ARK URL (e.g.,
        # +http://nma.berkeley.edu/ark:/28722/bk000855n2z+). We can't get
        # the image, but we can get the filename from the redirect location
        # header. As soon as we get a location that ends in +.tif+, we extract
        # the basename and return it.
        #
        # We limit the number of redirects to 4 to prevent infinite recursion
        # following redirects. We should always get the filename in the first
        # call.
        #
        # @param [String] address ARK address of an image file
        # @param [Integer] limit decrementing count of recursive calls; stops
        #     at +0+
        # @return [String] the basename of the first +.tif+ file encountered
        def locate_filename address, limit = 4
          # Before hitting the web, try the ARK/URL to FILE cache
          return search_ark_cache address if search_ark_cache address

          STDERR.puts "WARNING -- recursion: location='#{address}', limit=#{limit}" if limit < 4
          return if limit == 0

          resp = Net::HTTP.get_response URI address
          location = resp['location']
          return if location.nil?
          # recurse if location isn't a TIFF file
          return locate_filename location, limit - 1 unless location =~ %r{\.tif$}

          File.basename URI(location).path
        end
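
        # Illustrative sketch (the redirect target is hypothetical): if the ARK
        # is not in the cache and its response redirects to a location ending in
        # some_image_0001.tif, then
        #
        #   locate_filename 'http://nma.berkeley.edu/ark:/28722/bk000855n2z'
        #   # => "some_image_0001.tif"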
      end

      self.extend ClassMethods
    end
  end
end