ds-convert 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +294 -0
- data/Rakefile +12 -0
- data/config/settings.yml +150 -0
- data/exe/ds-convert +149 -0
- data/exe/ds-recon +275 -0
- data/exe/ds-validate-csv +40 -0
- data/exe/marc-mrc-to-xml.rb +80 -0
- data/lib/ds/cli.rb +102 -0
- data/lib/ds/constants.rb +166 -0
- data/lib/ds/converter/converter.rb +124 -0
- data/lib/ds/converter/writer.rb +50 -0
- data/lib/ds/converter.rb +7 -0
- data/lib/ds/csv_util.rb +43 -0
- data/lib/ds/data/berkeley-arks.txt +4000 -0
- data/lib/ds/data/getty-aat-centuries.csv +71 -0
- data/lib/ds/data/iiif_manifests.csv +122 -0
- data/lib/ds/data/legacy-iiif-manifests.csv +77 -0
- data/lib/ds/ds_error.rb +1 -0
- data/lib/ds/extractor/base_record_locator.rb +24 -0
- data/lib/ds/extractor/base_term.rb +79 -0
- data/lib/ds/extractor/csv_record_locator.rb +13 -0
- data/lib/ds/extractor/ds_csv_extractor.rb +695 -0
- data/lib/ds/extractor/ds_mets_xml_extractor.rb +1114 -0
- data/lib/ds/extractor/genre.rb +45 -0
- data/lib/ds/extractor/language.rb +31 -0
- data/lib/ds/extractor/marc_xml_extractor.rb +1172 -0
- data/lib/ds/extractor/material.rb +12 -0
- data/lib/ds/extractor/name.rb +50 -0
- data/lib/ds/extractor/place.rb +11 -0
- data/lib/ds/extractor/subject.rb +58 -0
- data/lib/ds/extractor/tei_xml_extractor.rb +687 -0
- data/lib/ds/extractor/title.rb +52 -0
- data/lib/ds/extractor/xml_record_locator.rb +38 -0
- data/lib/ds/extractor.rb +24 -0
- data/lib/ds/institutions.rb +55 -0
- data/lib/ds/manifest/base_id_validator.rb +76 -0
- data/lib/ds/manifest/constants.rb +67 -0
- data/lib/ds/manifest/ds_csv_id_validator.rb +15 -0
- data/lib/ds/manifest/entry.rb +133 -0
- data/lib/ds/manifest/manifest.rb +74 -0
- data/lib/ds/manifest/manifest_validator.rb +256 -0
- data/lib/ds/manifest/simple_xml_id_validator.rb +42 -0
- data/lib/ds/manifest.rb +30 -0
- data/lib/ds/mapper/base_mapper.rb +221 -0
- data/lib/ds/mapper/ds_csv_mapper.rb +77 -0
- data/lib/ds/mapper/ds_mets_mapper.rb +85 -0
- data/lib/ds/mapper/marc_mapper.rb +87 -0
- data/lib/ds/mapper/tei_xml_mapper.rb +79 -0
- data/lib/ds/mapper.rb +13 -0
- data/lib/ds/recon/constants.rb +56 -0
- data/lib/ds/recon/ds_csv_enumerator.rb +16 -0
- data/lib/ds/recon/ds_mets_xml_enumerator.rb +14 -0
- data/lib/ds/recon/marc_xml_enumerator.rb +15 -0
- data/lib/ds/recon/recon_builder.rb +183 -0
- data/lib/ds/recon/recon_data.rb +37 -0
- data/lib/ds/recon/recon_manager.rb +92 -0
- data/lib/ds/recon/source_enumerator.rb +21 -0
- data/lib/ds/recon/tei_xml_enumerator.rb +14 -0
- data/lib/ds/recon/type/all_subjects.rb +18 -0
- data/lib/ds/recon/type/genres.rb +50 -0
- data/lib/ds/recon/type/languages.rb +38 -0
- data/lib/ds/recon/type/materials.rb +40 -0
- data/lib/ds/recon/type/named_subjects.rb +20 -0
- data/lib/ds/recon/type/names.rb +65 -0
- data/lib/ds/recon/type/places.rb +40 -0
- data/lib/ds/recon/type/recon_type.rb +136 -0
- data/lib/ds/recon/type/splits.rb +34 -0
- data/lib/ds/recon/type/subjects.rb +65 -0
- data/lib/ds/recon/type/titles.rb +38 -0
- data/lib/ds/recon/url_lookup.rb +52 -0
- data/lib/ds/recon.rb +292 -0
- data/lib/ds/source/base_source.rb +32 -0
- data/lib/ds/source/ds_csv.rb +18 -0
- data/lib/ds/source/ds_mets_xml.rb +20 -0
- data/lib/ds/source/marc_xml.rb +22 -0
- data/lib/ds/source/source_cache.rb +69 -0
- data/lib/ds/source/tei_xml.rb +22 -0
- data/lib/ds/source.rb +20 -0
- data/lib/ds/util/cache.rb +111 -0
- data/lib/ds/util/csv_validator.rb +209 -0
- data/lib/ds/util/csv_writer.rb +42 -0
- data/lib/ds/util/strings.rb +194 -0
- data/lib/ds/util.rb +37 -0
- data/lib/ds/version.rb +5 -0
- data/lib/ds.rb +237 -0
- metadata +246 -0
@@ -0,0 +1,687 @@
|
|
1
|
+
module DS
|
2
|
+
module Extractor
|
3
|
+
module TeiXml
|
4
|
+
|
5
|
+
# Values of +<resp>+ that identify people involved in owning or making the
# manuscript.
RESP_FORMER_OWNER = 'former owner'
RESP_SCRIBE = 'scribe'
RESP_ARTIST = 'artist'

# Resps collected by +extract_recon_names+. Frozen so that no caller can
# mutate the shared list.
MS_CREATOR_RESPS = [
  RESP_FORMER_OWNER,
  RESP_SCRIBE,
  RESP_ARTIST
].freeze

# Values of +<resp>+ that are reported as acknowledgments.
RESP_CATALOGER = 'cataloger'
RESP_CONTRIBUTOR = 'contributor'

# Resps collected by +extract_acknowledgments+. Frozen for the same reason
# as MS_CREATOR_RESPS.
ACKNOWLEDGMENT_RESPS = [
  RESP_CATALOGER,
  RESP_CONTRIBUTOR
].freeze
|
20
|
+
|
21
|
+
module ClassMethods
|
22
|
+
|
23
|
+
|
24
|
+
############################################################
|
25
|
+
# SOURCE METADATA
|
26
|
+
############################################################
|
27
|
+
# Name the cataloging convention of a TEI XML source.
#
# @param [Nokogiri::XML::Node] record the TEI XML record; not consulted
# @return [String] always +'tei-xml'+
def extract_cataloging_convention(record)
  'tei-xml'
end
|
30
|
+
|
31
|
+
############################################################
|
32
|
+
# NAMES
|
33
|
+
############################################################
|
34
|
+
|
35
|
+
# Extracts authors from the given XML record.
#
# One Name with the role 'author' is built for each
# +//msContents/msItem/author+ element, except for elements whose text
# mentions "Free Library of Philadelphia", which are skipped.
#
# The recorded name is the text of the first +name+/+persName+ child that
# is not of type "vernacular"; when there is no such child, the text of the
# +author+ element itself is used. The authority ref is taken from that
# name child and falls back to the +@ref+ of the +author+ element, so an
# author-level ref is not lost when the name child carries none.
#
# @param [Nokogiri::XML::Node] xml the XML record to extract authors from
# @return [Array<DS::Extractor::Name>] list of extracted author names
def extract_authors xml
  author_nodes = xml.xpath('//msContents/msItem/author').reject { |node|
    node.text =~ /Free Library of Philadelphia/
  }

  author_nodes.map { |node|
    name_node = node.at_xpath('(name|persName)[not(@type = "vernacular")]')
    vern_name = node.at_xpath('(persName|name)[@type = "vernacular"]')
    prenormal = name_node ? name_node.text : node.text

    DS::Extractor::Name.new(
      as_recorded: DS::Util.normalize_string(prenormal),
      # prefer the ref on the name element; keep the <author> ref otherwise
      ref: (name_node && name_node['ref']) || node['ref'],
      role: 'author',
      vernacular: vern_name && DS::Util.normalize_string(vern_name.text)
    )
  }
end
|
64
|
+
|
65
|
+
# List the recorded form of every author in the record.
#
# @param [Nokogiri::XML::Node] xml a TEI XML record
# @return [Array<String>] the +as_recorded+ value of each author, in the
#   order returned by +extract_authors+
def extract_authors_as_recorded(xml)
  extract_authors(xml).map { |author| author.as_recorded }
end
|
72
|
+
|
73
|
+
# List the vernacular form of every author in the record.
#
# @param [Nokogiri::XML::Node] xml a TEI XML record
# @return [Array<String, nil>] the +vernacular+ value of each author, in the
#   order returned by +extract_authors+
def extract_authors_as_recorded_agr(xml)
  extract_authors(xml).map { |author| author.vernacular }
end
|
80
|
+
|
81
|
+
|
82
|
+
##
# Find all respStmts whose +resp+ text contains one of the given
# +resp_names+ (e.g., 'artist'), compared case-insensitively, and return the
# values as Name instances.
#
# There are a variety of respStmt patterns; for example:
#
#     <respStmt>
#       <resp>former owner</resp>
#       <persName type="authority">Jamali, Yusuf ibn Shaykh Muhammad</persName>
#       <persName type="vernacular">...</persName>
#     </respStmt>
#
#     <respStmt>
#       <resp>former owner</resp>
#       <persName>Jamali, Yusuf ibn Shaykh Muhammad</persName>
#     </respStmt>
#
#     <respStmt>
#       <resp>former owner</resp>
#       <name>Jamali, Yusuf ibn Shaykh Muhammad</name>
#     </respStmt>
#
# For each match, +as_recorded+ and +ref+ come from the first
# +persName+/+name+ child that is not of type "vernacular", +vernacular+
# from the first child that is, and +role+ is the down-cased, stripped
# +resp+ text. Values with no matching child are +nil+.
#
# @param [Nokogiri::XML::Node] xml the parsed TEI XML
# @param [Array<#to_s>] resp_names the resp values to search for
# @return [Array<DS::Extractor::Name>] empty when no +resp_names+ are given
def extract_resps xml, *resp_names
  # Without any resp the predicate would be empty ("//respStmt[]"), which is
  # not valid XPath. Nothing was asked for, so nothing is returned.
  return [] if resp_names.empty?

  resp_query = resp_names.map { |resp_name|
    %Q{contains(translate(./resp/text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '#{resp_name.to_s.strip.downcase}')}
  }.join(' or ')

  xml.xpath("//respStmt[#{resp_query}]").map { |node|
    auth_name = node.at_xpath('(persName|name)[not(@type = "vernacular")]')
    vern_name = node.at_xpath('(persName|name)[@type = "vernacular"]')
    resp = node.at_xpath('resp/text()').to_s

    DS::Extractor::Name.new(
      as_recorded: auth_name && DS::Util.normalize_string(auth_name.text),
      ref: auth_name && auth_name['ref'],
      role: resp.downcase.strip,
      vernacular: vern_name && DS::Util.normalize_string(vern_name.text)
    )
  }
end
|
136
|
+
|
137
|
+
##
# Collect every name used for reconciliation: the authors followed by the
# names whose resp is one of MS_CREATOR_RESPS (former owner, scribe, artist).
#
# Each name is converted with +to_a+, so the result is a two-dimensional
# array with one row per name. The row layout is whatever
# +DS::Extractor::Name#to_a+ produces; the original documentation of this
# method lists it as name as recorded, role, name in vernacular script and
# ref (authority URL), with missing values as +nil+:
#
#     [
#       ["Horace", "author", nil, "https://viaf.org/viaf/100227522/"],
#       ["Hodossy, Imre", "former owner", nil, nil]
#     ]
#
# @param [Nokogiri::XML::Node] xml the parsed TEI XML
# @return [Array<Array>] one row per extracted name
def extract_recon_names(xml)
  author_rows = extract_authors(xml).map { |name| name.to_a }
  creator_rows = extract_resps(xml, *MS_CREATOR_RESPS).map { |name| name.to_a }

  author_rows + creator_rows
end
|
165
|
+
|
166
|
+
# List the recorded form of every artist in the record.
#
# @param [Nokogiri::XML::Node] xml the parsed TEI XML
# @return [Array<String>] the +as_recorded+ value of each artist
def extract_artists_as_recorded(xml)
  extract_artists(xml).map { |artist| artist.as_recorded }
end
|
173
|
+
|
174
|
+
# List the vernacular form of every artist in the record.
#
# @param [Nokogiri::XML::Node] xml the parsed TEI XML
# @return [Array<String, nil>] the +vernacular+ value of each artist
def extract_artists_as_recorded_agr(xml)
  extract_artists(xml).map { |artist| artist.vernacular }
end
|
181
|
+
|
182
|
+
# Extract the names whose resp is RESP_ARTIST.
#
# @param [Nokogiri::XML::Node] xml the parsed TEI XML
# @return [Array<DS::Extractor::Name>] the artists, as built by
#   +extract_resps+
def extract_artists(xml)
  artists = extract_resps(xml, RESP_ARTIST)
  artists
end
|
189
|
+
|
190
|
+
# List the recorded form of every scribe in the record.
#
# @param [Nokogiri::XML::Node] xml the parsed TEI XML
# @return [Array<String>] the +as_recorded+ value of each scribe
def extract_scribes_as_recorded(xml)
  extract_scribes(xml).map { |scribe| scribe.as_recorded }
end
|
197
|
+
|
198
|
+
# List the vernacular form of every scribe in the record.
#
# @param [Nokogiri::XML::Node] xml the parsed TEI XML
# @return [Array<String, nil>] the +vernacular+ value of each scribe
def extract_scribes_as_recorded_agr(xml)
  extract_scribes(xml).map { |scribe| scribe.vernacular }
end
|
205
|
+
|
206
|
+
# Extract the names whose resp is RESP_SCRIBE.
#
# @param [Nokogiri::XML::Node] xml the parsed TEI XML
# @return [Array<DS::Extractor::Name>] the scribes, as built by
#   +extract_resps+
def extract_scribes(xml)
  scribes = extract_resps(xml, RESP_SCRIBE)
  scribes
end
|
213
|
+
|
214
|
+
# List the recorded form of every former owner in the record.
#
# @param [Nokogiri::XML::Node] xml the parsed TEI XML
# @return [Array<String>] the +as_recorded+ value of each former owner
def extract_former_owners_as_recorded(xml)
  extract_former_owners(xml).map { |owner| owner.as_recorded }
end
|
221
|
+
|
222
|
+
# List the vernacular form of every former owner in the record.
#
# @param [Nokogiri::XML::Node] xml the parsed TEI XML
# @return [Array<String, nil>] the +vernacular+ value of each former owner
def extract_former_owners_as_recorded_agr(xml)
  extract_former_owners(xml).map { |owner| owner.vernacular }
end
|
229
|
+
|
230
|
+
# Extract the names whose resp is RESP_FORMER_OWNER.
#
# @param [Nokogiri::XML::Node] xml the parsed TEI XML
# @return [Array<DS::Extractor::Name>] the former owners, as built by
#   +extract_resps+
def extract_former_owners(xml)
  former_owners = extract_resps(xml, RESP_FORMER_OWNER)
  former_owners
end
|
237
|
+
|
238
|
+
# Extract associated agents from the given XML record.
#
# NB: Associated agents are not extracted from TEI XML, so the record is
# never inspected and the result is always empty.
#
# @param [Nokogiri::XML::Node] xml the parsed TEI XML; not consulted
# @return [Array] a new, empty array
def extract_associated_agents(xml)
  Array.new
end
|
248
|
+
|
249
|
+
#########################################################################
|
250
|
+
# Miscellaneous authority values
|
251
|
+
#########################################################################
|
252
|
+
|
253
|
+
# Return the recorded form of the first material in the record.
#
# @param [Nokogiri::XML::Node] record the TEI XML record
# @return [String, nil] the first material as recorded; +nil+ when
#   +extract_materials+ finds none
def extract_material_as_recorded(record)
  recorded = extract_materials(record).map { |material| material.as_recorded }
  recorded.first
end
|
260
|
+
|
261
|
+
# Build a Material for each +support/p+ element of the manuscript's
# +supportDesc+.
#
# @param [Nokogiri::XML::Node] record the TEI XML record
# @return [Array<DS::Extractor::Material>] the extracted materials
def extract_materials(record)
  support_xpath = '/TEI/teiHeader/fileDesc/sourceDesc/msDesc/physDesc/objectDesc/supportDesc/support/p'
  extract_normalized_strings(record, support_xpath).map do |as_recorded|
    DS::Extractor::Material.new(as_recorded: as_recorded)
  end
end
|
271
|
+
|
272
|
+
# List the recorded form of every language in the record.
#
# @param [Nokogiri::XML::Node] xml the XML node containing language information
# @param [String] separator accepted but not used by this method
# @return [Array<String>] the +as_recorded+ value of each language
def extract_languages_as_recorded(xml, separator: '|')
  extract_languages(xml).map { |language| language.as_recorded }
end
|
280
|
+
|
281
|
+
##
# Collect the language codes of every language in the record.
#
# The result holds one entry per language returned by +extract_languages+:
# that language's +codes+ value. The codes are not joined into a string.
#
# @param [Nokogiri::XML::Node] xml the TEI xml
# @param [String] separator accepted but not used by this method
# @return [Array] the +codes+ value of each language
def extract_language_codes(xml, separator: '|')
  extract_languages(xml).map { |language| language.codes }
end
|
290
|
+
|
291
|
+
# Extracts the languages from the given TEI XML record.
#
# One Language is built for each +msContents/textLang+ element. Its codes
# are the +@mainLang+ value followed by the whitespace-separated values of
# +@otherLangs+ (the TEI attribute name). The misspelled +@otherLang+,
# which earlier versions of this method read, is still honoured as a
# fallback. A missing or blank +@mainLang+ contributes no code.
#
# The language as recorded is the element's text; when the text is blank
# the codes joined with '|' are used instead.
#
# @param [Nokogiri::XML::Node] record the TEI XML record
# @return [Array<DS::Extractor::Language>] list of Language objects
def extract_languages record
  xpath = '/TEI/teiHeader/fileDesc/sourceDesc/msDesc/msContents/textLang'
  record.xpath(xpath).map { |text_lang|
    codes = Set.new

    main_lang = text_lang['mainLang']
    # a nil code would otherwise surface as an empty segment, e.g. "|grc"
    codes << main_lang unless main_lang.to_s.strip.empty?

    other_langs = text_lang['otherLangs'] || text_lang['otherLang']
    codes.merge other_langs.to_s.split

    as_recorded = text_lang.text.present? ? text_lang.text : codes.to_a.join('|')

    DS::Extractor::Language.new as_recorded: as_recorded, codes: codes
  }
end
|
311
|
+
|
312
|
+
#########################################################################
|
313
|
+
# Genres and subjects
|
314
|
+
#########################################################################
|
315
|
+
|
316
|
+
# Build the reconciliation rows for the form/genre terms of the record.
#
# Each +term+ under +keywords[@n="form/genre"]+ becomes one row holding the
# normalized term text, the vocabulary 'openn-form/genre', and the term's
# +@target+ attribute.
#
# @param [Nokogiri::XML::Node] record the TEI XML record
# @return [Array<Array>] one +[value, vocabulary, target]+ row per term
def extract_recon_genres(record)
  terms = record.xpath('/TEI/teiHeader/profileDesc/textClass/keywords[@n="form/genre"]/term')
  terms.map do |term|
    [DS::Util.normalize_string(term.text), 'openn-form/genre', term['target']]
  end
end
|
329
|
+
|
330
|
+
# Build the reconciliation rows for the subject terms of the record.
#
# Each +term+ under +keywords[@n="subjects"]+ or +keywords[@n="keywords"]+
# becomes one row holding the normalized term text, +nil+ in the subfield
# codes position, the vocabulary "openn-" plus the parent's +@n+, and the
# term's +@target+ attribute.
#
# @param [Nokogiri::XML::Node] xml the TEI XML record
# @return [Array<Array>] one +[value, nil, vocabulary, target]+ row per term
def extract_recon_subjects(xml)
  terms = xml.xpath('/TEI/teiHeader/profileDesc/textClass/keywords[@n="subjects" or @n="keywords"]/term')
  terms.map { |term|
    [DS::Util.normalize_string(term.text), nil, "openn-#{term.parent['n']}", term['target']]
  }
end
|
344
|
+
|
345
|
+
# List the recorded form of every genre in the record.
#
# @param [Nokogiri::XML::Node] xml the TEI XML record
# @return [Array<String>] the +as_recorded+ value of each genre
def extract_genres_as_recorded(xml)
  extract_genres(xml).map { |genre| genre.as_recorded }
end
|
352
|
+
|
353
|
+
# Build a Genre for each form/genre term of the record.
#
# Every Genre carries the normalized term text, the vocabulary
# 'openn-form/genre', and the term's +@target+ attribute as its source
# authority URI.
#
# @param [Nokogiri::XML::Node] xml the TEI XML record
# @return [Array<DS::Extractor::Genre>] the extracted genres
def extract_genres(xml)
  terms = xml.xpath('/TEI/teiHeader/profileDesc/textClass/keywords[@n="form/genre"]/term')
  terms.map do |term|
    DS::Extractor::Genre.new(
      as_recorded: DS::Util.normalize_string(term.text),
      vocab: 'openn-form/genre',
      source_authority_uri: term['target']
    )
  end
end
|
367
|
+
|
368
|
+
# List the recorded form of every subject in the record.
#
# @param [Nokogiri::XML::Node] xml the TEI XML record
# @return [Array<String>] the +as_recorded+ value of each subject
def extract_subjects_as_recorded(xml)
  extract_subjects(xml).map { |subject| subject.as_recorded }
end
|
375
|
+
|
376
|
+
# List the recorded form of all subjects in the record. For TEI XML this is
# the same list that +extract_subjects_as_recorded+ returns.
#
# @param [Nokogiri::XML::Node] xml the TEI XML record
# @return [Array<String>] the extracted subjects as recorded
def extract_all_subjects_as_recorded(xml)
  extract_subjects_as_recorded(xml)
end
|
383
|
+
|
384
|
+
# Extract all subjects of the record. For TEI XML this is the same list
# that +extract_subjects+ returns.
#
# @param [Nokogiri::XML::Node] xml the TEI XML record
# @return [Array<DS::Extractor::Subject>] the extracted subjects
def extract_all_subjects(xml)
  extract_subjects(xml)
end
|
387
|
+
|
388
|
+
# Build a Subject for each subject term of the record.
#
# Terms are read from +keywords[@n="subjects"]+ and
# +keywords[@n="keywords"]+. Every Subject carries the normalized term text
# and the vocabulary "openn-" plus the parent's +@n+.
#
# @param [Nokogiri::XML::Node] xml the TEI XML record
# @return [Array<DS::Extractor::Subject>] the extracted subjects
def extract_subjects(xml)
  terms = xml.xpath('/TEI/teiHeader/profileDesc/textClass/keywords[@n="subjects" or @n="keywords"]/term')
  terms.map do |term|
    vocab = "openn-#{term.parent['n']}"
    DS::Extractor::Subject.new(
      as_recorded: DS::Util.normalize_string(term.text),
      vocab: vocab
    )
  end
end
|
400
|
+
|
401
|
+
#########################################################################
|
402
|
+
# Place of production
|
403
|
+
#########################################################################
|
404
|
+
|
405
|
+
# List the recorded form of every place of production in the record.
#
# @param [Nokogiri::XML::Node] record the TEI XML record
# @return [Array<String>] the +as_recorded+ value of each place
def extract_production_places_as_recorded(record)
  extract_places(record).map { |place| place.as_recorded }
end
|
412
|
+
|
413
|
+
# Build a Place for each +origPlace+ element of the record.
#
# @param [Nokogiri::XML::Node] record the TEI XML record
# @return [Array<DS::Extractor::Place>] the extracted places
def extract_places(record)
  extract_normalized_strings(record, '//origPlace').map do |as_recorded|
    DS::Extractor::Place.new(as_recorded: as_recorded)
  end
end
|
423
|
+
|
424
|
+
##
# Extract the places of production for reconciliation CSV output.
#
# Returns a two-dimensional array with one row per text node found under
# +origPlace+; each row has a single column, the normalized place name.
# For example:
#
#     [["Austria"],
#      ["Germany"],
#      ["France (?)"]]
#
# @param [Nokogiri::XML::Node] xml a +<TEI>+ node
# @return [Array<Array>] an array of single-value rows
def extract_recon_places(xml)
  place_names = extract_normalized_strings(xml, '//origPlace/text()')
  place_names.map { |place_name| [place_name] }
end
|
440
|
+
|
441
|
+
#########################################################################
|
442
|
+
# Date of production
|
443
|
+
#########################################################################
|
444
|
+
|
445
|
+
# Return the production dates of the record, formatted as date ranges by
# +extract_date_range+.
#
# @param [Nokogiri::XML::Node] xml the TEI XML record
# @param [String] range_sep the separator for the date range
# @return [Array<String>] the extracted dates of production
def extract_production_date_as_recorded(xml, range_sep: '-')
  date_ranges = extract_date_range(xml, range_sep: range_sep)
  date_ranges
end
|
453
|
+
|
454
|
+
# Format the date range of each +origDate+ element in the record.
#
# The +@notBefore+ and +@notAfter+ attributes of an +origDate+ are converted
# with +to_i+, sorted in ascending order and joined with +range_sep+. An
# +origDate+ with neither attribute yields an empty string.
#
# @param [Nokogiri::XML::Node] record the TEI XML record
# @param [String] range_sep the separator for the date range
# @return [Array<String>] one formatted range per +origDate+
def extract_date_range(record, range_sep:)
  record.xpath('//origDate').map do |orig_date|
    years = orig_date.xpath('@notBefore|@notAfter').map { |bound| bound.text.to_i }
    years.sort.join(range_sep)
  end
end
|
464
|
+
|
465
|
+
#########################################################################
|
466
|
+
# Titles
|
467
|
+
#########################################################################
|
468
|
+
|
469
|
+
##
# Return an array of Title instances equal in number to the number of
# non-vernacular titles.
#
# This is a bit of a hack. Titles are listed serially, and Roman-character
# and vernacular script titles are not paired. Thus:
#
#     <msItem>
#       <title>Qatr al-nada wa-ball al-sada.</title>
#       <title type="vernacular">...</title>
#       <title>Second title</title>
#       <author>
#       <!-- ... -->
#     </msItem>
#
# We assume that, when there is a vernacular title, it follows its Roman
# equivalent. This method runs through all +<title>+ elements matched by
# +//msItem[1]/title+ and creates a Title for each title where
#
#     @type != 'vernacular'
#
# When +@type+ is 'vernacular' it sets the +vernacular+ of the previous
# Title instance to that value. A vernacular title that comes before any
# non-vernacular title has no Title to attach to and is skipped; previously
# this case raised a NoMethodError on +nil+.
#
# @param [Nokogiri::XML::Node] record the TEI record
# @return [Array<DS::Extractor::Title>]
def extract_titles record
  record.xpath('//msItem[1]/title').each_with_object([]) do |title, titles|
    text = DS::Util.normalize_string title.text

    if title[:type] == 'vernacular'
      # nothing to pair an orphaned vernacular title with
      titles.last.vernacular = text if titles.last
    else
      titles << DS::Extractor::Title.new(as_recorded: text)
    end
  end
end
|
508
|
+
|
509
|
+
# List the recorded form of every title in the record.
#
# @param [Nokogiri::XML::Node] record the TEI record
# @return [Array<String>] the +as_recorded+ value of each title
def extract_titles_as_recorded(record)
  extract_titles(record).map(&:as_recorded)
end
|
516
|
+
|
517
|
+
# List the vernacular form of every title in the record.
#
# @param [Nokogiri::XML::Node] record the TEI record
# @return [Array<String, nil>] the +vernacular+ value of each title
def extract_titles_as_recorded_agr(record)
  extract_titles(record).map(&:vernacular)
end
|
524
|
+
|
525
|
+
# Build the reconciliation rows for the titles of the record by converting
# each Title with +to_a+.
#
# @param [Nokogiri::XML::Node] xml the TEI record
# @return [Array<Array>] one row per title
def extract_recon_titles(xml)
  extract_titles(xml).map(&:to_a)
end
|
532
|
+
|
533
|
+
#########################################################################
|
534
|
+
# Physical description
|
535
|
+
#########################################################################
|
536
|
+
##
# Return the extent and support concatenated.
#
# The first +supportDesc/extent+ text, prefixed with "Extent: ", and the
# first +supportDesc/support/p+ text are joined with '; ' (blank values are
# left out) and the joined string is passed through +String#capitalize+.
#
# NOTE(review): +capitalize+ also down-cases every character after the
# first, so upper-case letters inside the extent or support are lowered --
# confirm that this is intended.
#
# @param [Nokogiri::XML::Node] xml the TEI xml
# @return [Array<String>] a one-element array holding the description; the
#   element is an empty string when neither value is present
def extract_physical_description(xml)
  extent_xpath = '/TEI/teiHeader/fileDesc/sourceDesc/msDesc/physDesc/objectDesc/supportDesc/extent/text()'
  support_xpath = '/TEI/teiHeader/fileDesc/sourceDesc/msDesc/physDesc/objectDesc/supportDesc/support/p/text()'

  extent = extract_normalized_strings(xml, extent_xpath).first
  extent = "Extent: #{extent}" unless extent.blank?
  support = extract_normalized_strings(xml, support_xpath).first

  parts = [extent, support].reject { |part| part.blank? }
  [parts.join('; ').capitalize]
end
|
551
|
+
|
552
|
+
#########################################################################
|
553
|
+
# Notes
|
554
|
+
#########################################################################
|
555
|
+
# XPaths of the note sources read by +extract_notes+. The strings are
# frozen so that the shared constants cannot be modified in place.

# Untyped notes from the notesStmt
SIMPLE_NOTE_XPATH = '/TEI/teiHeader/fileDesc/notesStmt/note[not(@type)]/text()'.freeze
# Binding description paragraphs
BINDING_XPATH = '/TEI/teiHeader/fileDesc/sourceDesc/msDesc/physDesc/bindingDesc/binding/p/text()'.freeze
# Layout descriptions
LAYOUT_XPATH = '/TEI/teiHeader/fileDesc/sourceDesc/msDesc/physDesc/objectDesc/layoutDesc/layout/text()'.freeze
# Script notes
SCRIPT_XPATH = '/TEI/teiHeader/fileDesc/sourceDesc/msDesc/physDesc/scriptDesc/scriptNote/text()'.freeze
# Decoration notes without an @n attribute
DECO_XPATH = '/TEI/teiHeader/fileDesc/sourceDesc/msDesc/physDesc/decoDesc/decoNote[not(@n)]/text()'.freeze
# Notes of type "relatedResource" from the notesStmt
RESOURCE_XPATH = '/TEI/teiHeader/fileDesc/notesStmt/note[@type = "relatedResource"]/text()'.freeze
# Provenance statements
PROVENANCE_XPATH = '/TEI/teiHeader/fileDesc/sourceDesc/msDesc/history/provenance/text()'.freeze
|
562
|
+
|
563
|
+
##
# Create an array of notes. Untyped notes are returned as they are;
# physical description notes, such as Binding and Layout, are returned as
# prefixed notes:
#
#     Binding: The binding note.
#     Layout: The layout note.
#
# The sources are read in this order: simple notes, binding, layout,
# script, decoration, related resource, provenance.
#
# @param [Nokogiri::XML::Node] xml the TEI xml
# @return [Array<String>]
def extract_notes(xml)
  note_sources = [
    [SIMPLE_NOTE_XPATH, nil],
    [BINDING_XPATH, "Binding"],
    [LAYOUT_XPATH, "Layout"],
    [SCRIPT_XPATH, "Script"],
    [DECO_XPATH, "Decoration"],
    [RESOURCE_XPATH, "Related resource"],
    [PROVENANCE_XPATH, "Provenance"]
  ]

  note_sources.flat_map { |xpath, prefix| build_notes(xml, xpath, prefix: prefix) }
end
|
585
|
+
|
586
|
+
# Matches a run of one or more whitespace characters.
WHITESPACE_RE = /\s+/
# Matches a pipe character together with any whitespace around it.
MEDIAL_PIPE_RE = /\s*\|\s*/
|
588
|
+
|
589
|
+
##
# Collect the normalized note text found at +xpath+, optionally labelled
# with a prefix. A non-blank prefix is prepended as:
#
#     "#{prefix}: Note text"
#
# @param [Nokogiri::XML::Node] xml the TEI xml
# @param [String] xpath the xpath for the note(s)
# @param [String] prefix value to prepend to the note; default: +nil+
# @return [Array<String>]
def build_notes(xml, xpath, prefix: nil)
  label = if prefix.blank?
            ''
          else
            "#{prefix}: "
          end

  note_texts = extract_normalized_strings(xml, xpath)
  note_texts.map { |note_text| "#{label}#{note_text}" }
end
|
605
|
+
|
606
|
+
#########################################################################
|
607
|
+
# Holding information
|
608
|
+
#########################################################################
|
609
|
+
|
610
|
+
# Return the name of the holding institution: the normalized text of the
# first +institution+ or +repository+ found under an +msIdentifier+.
#
# @param [Nokogiri::XML::Node] record the TEI XML record
# @return [String, nil] the holding institution; +nil+ when none is found
def extract_holding_institution(record)
  institutions = extract_normalized_strings(
    record, '(//msIdentifier/institution|//msIdentifier/repository)[1]'
  )
  institutions.first
end
|
618
|
+
|
619
|
+
# Return the holding institution's id number for the record: the
# normalized text of the first +idno+ of an +altIdentifier+ of type "bibid".
#
# The method name is spelled "nummber"; it is kept as is because callers
# use that name.
#
# @param [Nokogiri::XML::Node] record the TEI XML record
# @return [String, nil] the id number; +nil+ when none is found
def extract_holding_institution_id_nummber(record)
  id_numbers = extract_normalized_strings(
    record, '/TEI/teiHeader/fileDesc/sourceDesc/msDesc/msIdentifier/altIdentifier[@type="bibid"]/idno'
  )
  id_numbers.first
end
|
627
|
+
|
628
|
+
# Return the shelfmark of the record: the normalized text of the first
# +msIdentifier/idno+ of type "call-number".
#
# @param [Nokogiri::XML::Node] record the TEI XML record
# @return [String, nil] the shelfmark; +nil+ when none is found
def extract_shelfmark(record)
  shelfmarks = extract_normalized_strings(
    record, '/TEI/teiHeader/fileDesc/sourceDesc/msDesc/msIdentifier/idno[@type="call-number"]'
  )
  shelfmarks.first
end
|
636
|
+
|
637
|
+
# Return the link to the record: the normalized text of the first +idno+
# matched under an +altIdentifier+ of type "resource".
#
# @param [Nokogiri::XML::Node] record the TEI XML record
# @return [String, nil] the link to the record; +nil+ when none is found
def extract_link_to_record(record)
  links = extract_normalized_strings(record, '//altIdentifier[@type="resource"][1]/idno')
  links.first
end
|
645
|
+
|
646
|
+
#########################################################################
|
647
|
+
# Acknowledgments
|
648
|
+
#########################################################################
|
649
|
+
|
650
|
+
# List the funders named in the +titleStmt+ of the record, each formatted
# as "Funder: NAME".
#
# @param [Nokogiri::XML::Node] record the TEI XML record
# @return [Array<String>] one "Funder: ..." string per +funder+ element
def extract_funder(record)
  funders = extract_normalized_strings(record, '/TEI/teiHeader/fileDesc/titleStmt/funder')
  funders.map do |funder|
    "Funder: #{funder}"
  end
end
|
658
|
+
|
659
|
+
# List the acknowledgments of the record.
#
# The names whose resp is one of ACKNOWLEDGMENT_RESPS come first, each
# formatted as "ROLE: NAME" with the role capitalized; the funders returned
# by +extract_funder+ follow.
#
# @param [Nokogiri::XML::Node] record the TEI XML record
# @return [Array<String>] the acknowledgments extracted from the record
def extract_acknowledgments(record)
  credited = extract_resps(record, *ACKNOWLEDGMENT_RESPS)
  credit_lines = credited.map do |name|
    "#{name.role.capitalize}: #{name.as_recorded}"
  end

  credit_lines + extract_funder(record)
end
|
669
|
+
|
670
|
+
#########################################################################
|
671
|
+
# Utility methods
|
672
|
+
#########################################################################
|
673
|
+
|
674
|
+
# Evaluate +xpath+ against the record and return the text of every matched
# node, passed through +DS::Util.normalize_string+.
#
# @param [Nokogiri::XML::Node] record the record to query
# @param [String] xpath the xpath locating the strings in the record
# @return [Array<String>] one normalized string per matched node
def extract_normalized_strings(record, xpath)
  matched_nodes = record.xpath(xpath)
  matched_nodes.map do |matched_node|
    DS::Util.normalize_string(matched_node.text)
  end
end
|
682
|
+
end
|
683
|
+
|
684
|
+
self.extend ClassMethods
|
685
|
+
end
|
686
|
+
end
|
687
|
+
end
|