ds-convert 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +294 -0
  3. data/Rakefile +12 -0
  4. data/config/settings.yml +150 -0
  5. data/exe/ds-convert +149 -0
  6. data/exe/ds-recon +275 -0
  7. data/exe/ds-validate-csv +40 -0
  8. data/exe/marc-mrc-to-xml.rb +80 -0
  9. data/lib/ds/cli.rb +102 -0
  10. data/lib/ds/constants.rb +166 -0
  11. data/lib/ds/converter/converter.rb +124 -0
  12. data/lib/ds/converter/writer.rb +50 -0
  13. data/lib/ds/converter.rb +7 -0
  14. data/lib/ds/csv_util.rb +43 -0
  15. data/lib/ds/data/berkeley-arks.txt +4000 -0
  16. data/lib/ds/data/getty-aat-centuries.csv +71 -0
  17. data/lib/ds/data/iiif_manifests.csv +122 -0
  18. data/lib/ds/data/legacy-iiif-manifests.csv +77 -0
  19. data/lib/ds/ds_error.rb +1 -0
  20. data/lib/ds/extractor/base_record_locator.rb +24 -0
  21. data/lib/ds/extractor/base_term.rb +79 -0
  22. data/lib/ds/extractor/csv_record_locator.rb +13 -0
  23. data/lib/ds/extractor/ds_csv_extractor.rb +695 -0
  24. data/lib/ds/extractor/ds_mets_xml_extractor.rb +1114 -0
  25. data/lib/ds/extractor/genre.rb +45 -0
  26. data/lib/ds/extractor/language.rb +31 -0
  27. data/lib/ds/extractor/marc_xml_extractor.rb +1172 -0
  28. data/lib/ds/extractor/material.rb +12 -0
  29. data/lib/ds/extractor/name.rb +50 -0
  30. data/lib/ds/extractor/place.rb +11 -0
  31. data/lib/ds/extractor/subject.rb +58 -0
  32. data/lib/ds/extractor/tei_xml_extractor.rb +687 -0
  33. data/lib/ds/extractor/title.rb +52 -0
  34. data/lib/ds/extractor/xml_record_locator.rb +38 -0
  35. data/lib/ds/extractor.rb +24 -0
  36. data/lib/ds/institutions.rb +55 -0
  37. data/lib/ds/manifest/base_id_validator.rb +76 -0
  38. data/lib/ds/manifest/constants.rb +67 -0
  39. data/lib/ds/manifest/ds_csv_id_validator.rb +15 -0
  40. data/lib/ds/manifest/entry.rb +133 -0
  41. data/lib/ds/manifest/manifest.rb +74 -0
  42. data/lib/ds/manifest/manifest_validator.rb +256 -0
  43. data/lib/ds/manifest/simple_xml_id_validator.rb +42 -0
  44. data/lib/ds/manifest.rb +30 -0
  45. data/lib/ds/mapper/base_mapper.rb +221 -0
  46. data/lib/ds/mapper/ds_csv_mapper.rb +77 -0
  47. data/lib/ds/mapper/ds_mets_mapper.rb +85 -0
  48. data/lib/ds/mapper/marc_mapper.rb +87 -0
  49. data/lib/ds/mapper/tei_xml_mapper.rb +79 -0
  50. data/lib/ds/mapper.rb +13 -0
  51. data/lib/ds/recon/constants.rb +56 -0
  52. data/lib/ds/recon/ds_csv_enumerator.rb +16 -0
  53. data/lib/ds/recon/ds_mets_xml_enumerator.rb +14 -0
  54. data/lib/ds/recon/marc_xml_enumerator.rb +15 -0
  55. data/lib/ds/recon/recon_builder.rb +183 -0
  56. data/lib/ds/recon/recon_data.rb +37 -0
  57. data/lib/ds/recon/recon_manager.rb +92 -0
  58. data/lib/ds/recon/source_enumerator.rb +21 -0
  59. data/lib/ds/recon/tei_xml_enumerator.rb +14 -0
  60. data/lib/ds/recon/type/all_subjects.rb +18 -0
  61. data/lib/ds/recon/type/genres.rb +50 -0
  62. data/lib/ds/recon/type/languages.rb +38 -0
  63. data/lib/ds/recon/type/materials.rb +40 -0
  64. data/lib/ds/recon/type/named_subjects.rb +20 -0
  65. data/lib/ds/recon/type/names.rb +65 -0
  66. data/lib/ds/recon/type/places.rb +40 -0
  67. data/lib/ds/recon/type/recon_type.rb +136 -0
  68. data/lib/ds/recon/type/splits.rb +34 -0
  69. data/lib/ds/recon/type/subjects.rb +65 -0
  70. data/lib/ds/recon/type/titles.rb +38 -0
  71. data/lib/ds/recon/url_lookup.rb +52 -0
  72. data/lib/ds/recon.rb +292 -0
  73. data/lib/ds/source/base_source.rb +32 -0
  74. data/lib/ds/source/ds_csv.rb +18 -0
  75. data/lib/ds/source/ds_mets_xml.rb +20 -0
  76. data/lib/ds/source/marc_xml.rb +22 -0
  77. data/lib/ds/source/source_cache.rb +69 -0
  78. data/lib/ds/source/tei_xml.rb +22 -0
  79. data/lib/ds/source.rb +20 -0
  80. data/lib/ds/util/cache.rb +111 -0
  81. data/lib/ds/util/csv_validator.rb +209 -0
  82. data/lib/ds/util/csv_writer.rb +42 -0
  83. data/lib/ds/util/strings.rb +194 -0
  84. data/lib/ds/util.rb +37 -0
  85. data/lib/ds/version.rb +5 -0
  86. data/lib/ds.rb +237 -0
  87. metadata +246 -0
@@ -0,0 +1,687 @@
1
+ module DS
2
+ module Extractor
3
+ module TeiXml
4
+
5
# Responsibility (+resp+) labels for agents tied to the manuscript itself.
RESP_FORMER_OWNER = 'former owner'.freeze
RESP_SCRIBE = 'scribe'.freeze
RESP_ARTIST = 'artist'.freeze

# Resp labels gathered by +extract_recon_names+. Frozen so that the shared
# list cannot be mutated by a caller.
MS_CREATOR_RESPS = [
  RESP_FORMER_OWNER,
  RESP_SCRIBE,
  RESP_ARTIST
].freeze

# Responsibility labels for people credited with producing the record.
RESP_CATALOGER = 'cataloger'.freeze
RESP_CONTRIBUTOR = 'contributor'.freeze

# Resp labels gathered by +extract_acknowledgments+.
ACKNOWLEDGMENT_RESPS = [
  RESP_CATALOGER,
  RESP_CONTRIBUTOR
].freeze
20
+
21
+ module ClassMethods
22
+
23
+
24
+ ############################################################
25
+ # SOURCE METADATA
26
+ ############################################################
27
# Cataloging convention reported for TEI sources.
#
# @param [Object] record ignored; the value is the same for every record
# @return [String] always +'tei-xml'+
def extract_cataloging_convention(record)
  'tei-xml'
end
30
+
31
+ ############################################################
32
+ # NAMES
33
+ ############################################################
34
+
35
# Extracts authors from the given XML record.
#
# One Name is built per +//msContents/msItem/author+ element, except for
# elements whose text mentions "Free Library of Philadelphia", which are
# skipped. The recorded name is taken from the first non-vernacular
# +name+/+persName+ child when there is one, otherwise from the text of
# the +author+ element itself.
#
# The authority +ref+ is read from the name child first and falls back to
# the +author+ element, so an author-level +@ref+ is kept when the name
# child carries none.
#
# @param [Nokogiri::XML:Node] xml the XML record to extract authors from
# @return [Array<DS::Extractor::Name>] list of extracted author names
def extract_authors(xml)
  xml.xpath('//msContents/msItem/author').each_with_object([]) do |node, names|
    next if node.text.include?('Free Library of Philadelphia')

    name_node = node.at_xpath('(name|persName)[not(@type = "vernacular")]')
    vern_node = node.at_xpath('(persName|name)[@type = "vernacular"]')

    names << DS::Extractor::Name.new(
      as_recorded: DS::Util.normalize_string(name_node ? name_node.text : node.text),
      ref: (name_node && name_node['ref']) || node['ref'],
      role: 'author',
      vernacular: vern_node && DS::Util.normalize_string(vern_node.text)
    )
  end
end
64
+
65
# Recorded (non-vernacular) form of every author in the record.
#
# @param [Nokogiri::XML:Node] xml a TEI XML record
# @return [Array<String>] the +as_recorded+ value of each extracted author
def extract_authors_as_recorded(xml)
  authors = extract_authors(xml)
  authors.map { |author| author.as_recorded }
end
72
+
73
# Vernacular-script form of every author in the record.
#
# @param [Nokogiri::XML:Node] xml a TEI XML record
# @return [Array] the +vernacular+ value of each extracted author, in the
#   same order as +extract_authors+
def extract_authors_as_recorded_agr(xml)
  authors = extract_authors(xml)
  authors.map { |author| author.vernacular }
end
80
+
81
+
82
##
# Find every +respStmt+ whose +resp+ text contains one of the given
# responsibility names and return the values as Name instances. The
# comparison is case-insensitive for ASCII letters and is a substring
# match (XPath +contains+).
#
# There are a variety of respStmt patterns; for example:
#
#     <respStmt>
#       <resp>former owner</resp>
#       <persName type="authority">Jamālī, Yūsuf ibn Shaykh Muḥammad</persName>
#       <persName type="vernacular">يوسف بن شيخ محمد الجمالي.</persName>
#     </respStmt>
#
#     <respStmt>
#       <resp>former owner</resp>
#       <persName>Jamālī, Yūsuf ibn Shaykh Muḥammad</persName>
#     </respStmt>
#
#     <respStmt>
#       <resp>former owner</resp>
#       <name>Jamālī, Yūsuf ibn Shaykh Muḥammad</name>
#     </respStmt>
#
# The first non-vernacular +persName+/+name+ supplies +as_recorded+ and
# +ref+; the first +@type="vernacular"+ one supplies +vernacular+. Either
# may be absent, in which case the corresponding values are +nil+. The
# role is the +resp+ text, downcased and stripped.
#
# @param [Nokogiri::XML::Node] xml the parsed TEI XML
# @param [Array<#to_s>] resp_names the responsibility names to match
#   (e.g., 'artist'); when none are given an empty array is returned
# @return [Array<DS::Extractor::Name>]
def extract_resps(xml, *resp_names)
  # With no names the predicate would be empty ("//respStmt[]"), which is
  # not a valid XPath expression; nothing can match, so return early.
  return [] if resp_names.empty?

  conditions = resp_names.map do |resp_name|
    %Q{contains(translate(./resp/text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '#{resp_name.to_s.strip.downcase}')}
  end

  xml.xpath("//respStmt[#{conditions.join(' or ')}]").map do |node|
    auth_name = node.at_xpath('(persName|name)[not(@type = "vernacular")]')
    vern_name = node.at_xpath('(persName|name)[@type = "vernacular"]')
    resp = node.at_xpath('resp/text()').to_s

    DS::Extractor::Name.new(
      as_recorded: auth_name && DS::Util.normalize_string(auth_name.text),
      ref: auth_name && auth_name['ref'],
      role: resp.downcase.strip,
      vernacular: vern_name && DS::Util.normalize_string(vern_name.text)
    )
  end
end
136
+
137
##
# Rows for the names reconciliation output: all authors followed by all
# names whose resp is one of MS_CREATOR_RESPS (former owner, scribe,
# artist). Each Name is converted with +to_a+, giving a two-dimensional
# array. Per the original notes each row holds:
#
# * name as recorded
# * role (author, former owner, etc.)
# * name in vernacular script
# * ref (authority URL)
#
# with missing values as +nil+, for example:
#
#     [
#       ["Horace", "author", nil, "https://viaf.org/viaf/100227522/"],
#       ["Hodossy, Imre", "former owner", nil, nil]
#     ]
#
# NOTE(review): the column order is determined by Name#to_a, which is
# defined elsewhere -- confirm against that class.
#
# @param [Nokogiri::XML::Node] xml the parsed TEI XML
# @return [Array<Array>] one row per name
def extract_recon_names(xml)
  author_rows = extract_authors(xml).map { |name| name.to_a }
  creator_rows = extract_resps(xml, *MS_CREATOR_RESPS).map { |name| name.to_a }
  author_rows + creator_rows
end
165
+
166
# Recorded form of every artist name in the record.
#
# @param [Nokogiri::XML::Node] xml the parsed TEI XML
# @return [Array] the +as_recorded+ value of each extracted artist
def extract_artists_as_recorded(xml)
  artists = extract_artists(xml)
  artists.map { |artist| artist.as_recorded }
end
173
+
174
# Vernacular-script form of every artist name in the record.
#
# @param [Nokogiri::XML::Node] xml the parsed TEI XML
# @return [Array] the +vernacular+ value of each extracted artist
def extract_artists_as_recorded_agr(xml)
  artists = extract_artists(xml)
  artists.map { |artist| artist.vernacular }
end
181
+
182
# Names from respStmts whose resp matches RESP_ARTIST.
#
# @param [Nokogiri::XML::Node] xml the parsed TEI XML
# @return [Array] the result of +extract_resps+ for the artist resp
def extract_artists(xml)
  artist_resp = RESP_ARTIST
  extract_resps xml, artist_resp
end
189
+
190
# Recorded form of every scribe name in the record.
#
# @param [Nokogiri::XML::Node] xml the parsed TEI XML
# @return [Array] the +as_recorded+ value of each extracted scribe
def extract_scribes_as_recorded(xml)
  scribes = extract_scribes(xml)
  scribes.map { |scribe| scribe.as_recorded }
end
197
+
198
# Vernacular-script form of every scribe name in the record.
#
# @param [Nokogiri::XML::Node] xml the parsed TEI XML
# @return [Array] the +vernacular+ value of each extracted scribe
def extract_scribes_as_recorded_agr(xml)
  scribes = extract_scribes(xml)
  scribes.map { |scribe| scribe.vernacular }
end
205
+
206
# Names from respStmts whose resp matches RESP_SCRIBE.
#
# @param [Nokogiri::XML::Node] xml the parsed TEI XML
# @return [Array] the result of +extract_resps+ for the scribe resp
def extract_scribes(xml)
  scribe_resp = RESP_SCRIBE
  extract_resps xml, scribe_resp
end
213
+
214
# Recorded form of every former-owner name in the record.
#
# @param [Nokogiri::XML::Node] xml the parsed TEI XML
# @return [Array] the +as_recorded+ value of each extracted former owner
def extract_former_owners_as_recorded(xml)
  owners = extract_former_owners(xml)
  owners.map { |owner| owner.as_recorded }
end
221
+
222
# Vernacular-script form of every former-owner name in the record.
#
# @param [Nokogiri::XML::Node] xml the parsed TEI XML
# @return [Array] the +vernacular+ value of each extracted former owner
def extract_former_owners_as_recorded_agr(xml)
  owners = extract_former_owners(xml)
  owners.map { |owner| owner.vernacular }
end
229
+
230
# Names from respStmts whose resp matches RESP_FORMER_OWNER.
#
# @param [Nokogiri::XML::Node] xml the parsed TEI XML
# @return [Array] the result of +extract_resps+ for the former owner resp
def extract_former_owners(xml)
  owner_resp = RESP_FORMER_OWNER
  extract_resps xml, owner_resp
end
237
+
238
# Associated agents for the record.
#
# NB: Associated agents are not extracted from TEI XML, so the argument
# is ignored and the result is always empty.
#
# @param [Nokogiri::XML::Node] xml the parsed TEI XML (unused)
# @return [Array] a new empty array
def extract_associated_agents(xml)
  []
end
248
+
249
+ #########################################################################
250
+ # Miscellaneous authority values
251
+ #########################################################################
252
+
253
# Recorded value of the first material found in the record.
#
# @param [Nokogiri::XML::Node] record the TEI XML record
# @return [String, nil] the first material as recorded, or +nil+ when
#   +extract_materials+ yields nothing
def extract_material_as_recorded(record)
  recorded = extract_materials(record).map { |material| material.as_recorded }
  recorded.first
end
260
+
261
# Builds a Material for each +support/p+ element under the manuscript's
# +supportDesc+, using the normalized element text as the recorded value.
#
# @param [Nokogiri::XML::Node] record the TEI XML record
# @return [Array<DS::Extractor::Material>] the extracted materials
def extract_materials(record)
  support_xpath = '/TEI/teiHeader/fileDesc/sourceDesc/msDesc/physDesc/objectDesc/supportDesc/support/p'
  recorded_values = extract_normalized_strings(record, support_xpath)
  recorded_values.map do |recorded|
    DS::Extractor::Material.new(as_recorded: recorded)
  end
end
271
+
272
# Recorded value of each language extracted from the record.
#
# @param [Nokogiri::XML::Node] xml the XML node containing language information
# @param [String] separator accepted for interface compatibility; this
#   method does not use it
# @return [Array<String>] the +as_recorded+ value of each extracted language
def extract_languages_as_recorded(xml, separator: '|')
  languages = extract_languages(xml)
  languages.map { |language| language.as_recorded }
end
280
+
281
##
# Language codes of each language extracted from the record.
#
# The result is an array with one entry per extracted language, each
# entry being that language's +codes+ value; the values are not joined
# into a single string here.
#
# @param [Nokogiri::XML::Node] xml the TEI xml
# @param [String] separator accepted for interface compatibility; this
#   method does not use it
# @return [Array] the +codes+ value of each extracted language
def extract_language_codes(xml, separator: '|')
  languages = extract_languages(xml)
  languages.map { |language| language.codes }
end
290
+
291
# Extracts the languages from the given TEI XML record.
#
# Each +msContents/textLang+ element becomes one Language. Its codes are
# the +@mainLang+ value followed by the whitespace-separated values of
# +@otherLangs+ (the TEI attribute name). The singular +@otherLang+ is
# still read as a fallback, in case a source relied on that spelling.
# A missing or blank +@mainLang+ is skipped rather than stored as +nil+.
#
# The recorded value is the element's text when present; otherwise the
# codes joined with '|'.
#
# @param [Nokogiri::XML::Node] record the TEI XML record
# @return [Array<DS::Extractor::Language>] list of Language objects
def extract_languages(record)
  xpath = '/TEI/teiHeader/fileDesc/sourceDesc/msDesc/msContents/textLang'
  record.xpath(xpath).map do |text_lang|
    codes = Set.new

    main_lang = text_lang['mainLang'].to_s.strip
    codes << main_lang unless main_lang.empty?

    other_langs = text_lang['otherLangs'] || text_lang['otherLang']
    codes.merge(other_langs.to_s.split)

    as_recorded = text_lang.text.present? ? text_lang.text : codes.join('|')

    DS::Extractor::Language.new as_recorded: as_recorded, codes: codes
  end
end
311
+
312
+ #########################################################################
313
+ # Genres and subjects
314
+ #########################################################################
315
+
316
# Rows for the genres reconciliation output.
#
# One row per +term+ under +keywords[@n="form/genre"]+: the normalized
# term text, the fixed vocabulary 'openn-form/genre', and the term's
# +@target+ attribute (+nil+ when absent).
#
# @param [Nokogiri::XML::Node] record the TEI XML record
# @return [Array<Array>] rows of +[value, vocabulary, number]+
def extract_recon_genres(record)
  genre_terms = record.xpath('/TEI/teiHeader/profileDesc/textClass/keywords[@n="form/genre"]/term')
  genre_terms.map do |genre_term|
    [DS::Util.normalize_string(genre_term.text), 'openn-form/genre', genre_term['target']]
  end
end
329
+
330
# Rows for the subjects reconciliation output.
#
# One row per +term+ under +keywords+ with +@n+ of "subjects" or
# "keywords": the normalized term text, +nil+ for subfield codes, a
# vocabulary of "openn-" plus the parent's +@n+, and the term's +@target+.
#
# @param [Nokogiri::XML::Node] xml the TEI XML record
# @return [Array<Array>] rows of +[value, subfield_codes, vocabulary, number]+
def extract_recon_subjects(xml)
  subject_terms = xml.xpath('/TEI/teiHeader/profileDesc/textClass/keywords[@n="subjects" or @n="keywords"]/term')
  subject_terms.map { |subject_term|
    [
      DS::Util.normalize_string(subject_term.text),
      nil,
      "openn-#{subject_term.parent['n']}",
      subject_term['target']
    ]
  }
end
344
+
345
# Recorded value of each genre extracted from the record.
#
# @param [Nokogiri::XML::Node] xml the TEI XML record
# @return [Array<String>] the +as_recorded+ value of each extracted genre
def extract_genres_as_recorded(xml)
  genres = extract_genres(xml)
  genres.map { |genre| genre.as_recorded }
end
352
+
353
# Builds a Genre for each +term+ under +keywords[@n="form/genre"]+.
#
# The recorded value is the normalized term text, the vocabulary is
# always 'openn-form/genre', and the source authority URI is the term's
# +@target+ attribute.
#
# @param [Nokogiri::XML::Node] xml the TEI XML record
# @return [Array<DS::Extractor::Genre>] the extracted genres
def extract_genres(xml)
  genre_terms = xml.xpath('/TEI/teiHeader/profileDesc/textClass/keywords[@n="form/genre"]/term')
  genre_terms.map do |genre_term|
    DS::Extractor::Genre.new(
      as_recorded: DS::Util.normalize_string(genre_term.text),
      vocab: 'openn-form/genre',
      source_authority_uri: genre_term['target']
    )
  end
end
367
+
368
# Recorded value of each subject extracted from the record.
#
# @param [Nokogiri::XML::Node] xml the TEI XML record
# @return [Array<String>] the +as_recorded+ value of each extracted subject
def extract_subjects_as_recorded(xml)
  subjects = extract_subjects(xml)
  subjects.map { |subject| subject.as_recorded }
end
375
+
376
# Recorded value of all subjects in the record. For TEI this simply
# delegates to +extract_subjects_as_recorded+.
#
# @param [Nokogiri::XML::Node] xml the TEI XML record
# @return [Array<String>] the extracted subjects as recorded
def extract_all_subjects_as_recorded(xml)
  extract_subjects_as_recorded(xml)
end
383
+
384
# All subjects in the record. For TEI this simply delegates to
# +extract_subjects+.
#
# @param [Nokogiri::XML::Node] xml the TEI XML record
# @return [Array] whatever +extract_subjects+ returns for the record
def extract_all_subjects(xml)
  extract_subjects(xml)
end
387
+
388
# Builds a Subject for each +term+ under +keywords+ with +@n+ of
# "subjects" or "keywords".
#
# The vocabulary is "openn-" followed by the parent +keywords+ element's
# +@n+ value; the recorded value is the normalized term text.
#
# @param [Nokogiri::XML::Node] xml the TEI XML record
# @return [Array<DS::Extractor::Subject>] the extracted subjects
def extract_subjects(xml)
  subject_terms = xml.xpath('/TEI/teiHeader/profileDesc/textClass/keywords[@n="subjects" or @n="keywords"]/term')
  subject_terms.map do |subject_term|
    vocabulary = "openn-#{subject_term.parent['n']}"
    recorded = DS::Util.normalize_string(subject_term.text)
    DS::Extractor::Subject.new(as_recorded: recorded, vocab: vocabulary)
  end
end
400
+
401
+ #########################################################################
402
+ # Place of production
403
+ #########################################################################
404
+
405
# Recorded value of each place of production in the record.
#
# @param [Nokogiri::XML::Node] record the TEI XML record
# @return [Array<String>] the +as_recorded+ value of each extracted place
def extract_production_places_as_recorded(record)
  places = extract_places(record)
  places.map { |place| place.as_recorded }
end
412
+
413
# Builds a Place for each +origPlace+ element anywhere in the record,
# using the normalized element text as the recorded value.
#
# @param [Nokogiri::XML::Node] record the TEI XML record
# @return [Array<DS::Extractor::Place>] the extracted places
def extract_places(record)
  recorded_places = extract_normalized_strings(record, '//origPlace')
  recorded_places.map do |recorded|
    DS::Extractor::Place.new(as_recorded: recorded)
  end
end
423
+
424
##
# Rows of places of production for the reconciliation CSV output.
#
# The result is a two-dimensional array with one single-column row per
# text node found directly under an +origPlace+ element; for example:
#
#     [["Austria"],
#      ["Germany"],
#      ["France (?)"]]
#
# @param [Nokogiri::XML:Node] xml a +<TEI>+ node
# @return [Array<Array>] an array of one-element arrays
def extract_recon_places(xml)
  place_names = extract_normalized_strings(xml, '//origPlace/text()')
  place_names.map do |place_name|
    [place_name]
  end
end
440
+
441
+ #########################################################################
442
+ # Date of production
443
+ #########################################################################
444
+
445
# Dates of production as recorded; delegates to +extract_date_range+.
#
# @param [Nokogiri::XML::Node] xml the TEI XML record
# @param [String] range_sep the separator placed between the ends of a range
# @return [Array<String>] one formatted date or date range per +origDate+
def extract_production_date_as_recorded(xml, range_sep: '-')
  separator = range_sep
  extract_date_range xml, range_sep: separator
end
453
+
454
# Formats the date range of every +origDate+ element in the record.
#
# For each +origDate+ the +@notBefore+ and +@notAfter+ values are
# converted with +to_i+, sorted ascending, and joined with +range_sep+.
# An element with only one of the attributes yields a single number; an
# element with neither yields an empty string.
#
# @param [Nokogiri::XML::Node] record the TEI XML record
# @param [String] range_sep the separator for the date range
# @return [Array<String>] an array of formatted date ranges
def extract_date_range(record, range_sep:)
  record.xpath('//origDate').map do |orig_date|
    years = orig_date.xpath('@notBefore|@notAfter').map { |attribute| attribute.text.to_i }
    years.sort.join(range_sep)
  end
end
464
+
465
+ #########################################################################
466
+ # Titles
467
+ #########################################################################
468
+
469
##
# Return an array of Title instances equal in number to
# the number of non-vernacular titles.
#
# This is a bit of a hack. Titles are listed serially, and Roman-
# character and vernacular script titles are not paired. Thus:
#
#     <msItem>
#       <title>Qaṭr al-nadā wa-ball al-ṣadā.</title>
#       <title type="vernacular">قطر الندا وبل الصدا</title>
#       <title>Second title</title>
#       <author>
#         <!-- ... -->
#     </msItem>
#
# We assume that, when there is a vernacular title, it follows
# its Roman equivalent. This method runs through all +<title>+
# elements and creates a Title for each title where
#
#     @type != 'vernacular'
#
# When +@type+ is 'vernacular' it sets the +vernacular+ value of
# the previous Title instance to that title's text.
#
# A vernacular title that appears before any non-vernacular title has
# nothing to attach to and is skipped, so the method no longer raises
# NoMethodError on such records.
#
# @param [Nokogiri::XML::Node] record the TEI record
# @return [Array<DS::Extractor::Title>]
def extract_titles(record)
  record.xpath('//msItem[1]/title').each_with_object([]) do |title, titles|
    text = DS::Util.normalize_string(title.text)

    if title[:type] == 'vernacular'
      # Guard against an orphan vernacular title: +titles.last+ is nil here.
      next if titles.empty?

      titles.last.vernacular = text
    else
      titles << DS::Extractor::Title.new(as_recorded: text)
    end
  end
end
508
+
509
# Recorded value of each title extracted from the record.
#
# @param [Nokogiri::XML::Node] record the TEI record
# @return [Array<String>] the +as_recorded+ value of each extracted title
def extract_titles_as_recorded(record)
  extract_titles(record).map(&:as_recorded)
end
516
+
517
# Vernacular-script value of each title extracted from the record.
#
# @param [Nokogiri::XML::Node] record the TEI record
# @return [Array] the +vernacular+ value of each extracted title, in the
#   same order as +extract_titles+
def extract_titles_as_recorded_agr(record)
  extract_titles(record).map(&:vernacular)
end
524
+
525
# Rows for the titles reconciliation output: each extracted title
# converted with +to_a+.
#
# @param [Nokogiri::XML::Node] xml the TEI record
# @return [Array<Array>] list of titles converted to arrays
def extract_recon_titles(xml)
  extract_titles(xml).map(&:to_a)
end
532
+
533
+ #########################################################################
534
+ # Physical description
535
+ #########################################################################
536
##
# Return the extent and support concatenated.
#
# The first +supportDesc/extent+ text node is prefixed with "Extent: "
# and the first +supportDesc/support/p+ text node follows it, joined by
# '; '. Blank parts are left out. The joined string is passed through
# String#capitalize, which upcases the first character and downcases the
# rest.
#
# @param [Nokogiri::XML::Node] xml the TEI xml
# @return [Array<String>] a one-element array holding the description
#   (an empty string when neither part is present)
def extract_physical_description(xml)
  extent_xpath = '/TEI/teiHeader/fileDesc/sourceDesc/msDesc/physDesc/objectDesc/supportDesc/extent/text()'
  extent = extract_normalized_strings(xml, extent_xpath).first

  support_xpath = '/TEI/teiHeader/fileDesc/sourceDesc/msDesc/physDesc/objectDesc/supportDesc/support/p/text()'
  support = extract_normalized_strings(xml, support_xpath).first

  parts = []
  parts << "Extent: #{extent}" unless extent.blank?
  parts << support unless support.blank?

  [parts.join('; ').capitalize]
end
551
+
552
+ #########################################################################
553
+ # Notes
554
+ #########################################################################
555
# XPaths for the note text gathered by +extract_notes+. Frozen so that
# the shared strings cannot be mutated by a caller.
SIMPLE_NOTE_XPATH = '/TEI/teiHeader/fileDesc/notesStmt/note[not(@type)]/text()'.freeze
BINDING_XPATH = '/TEI/teiHeader/fileDesc/sourceDesc/msDesc/physDesc/bindingDesc/binding/p/text()'.freeze
LAYOUT_XPATH = '/TEI/teiHeader/fileDesc/sourceDesc/msDesc/physDesc/objectDesc/layoutDesc/layout/text()'.freeze
SCRIPT_XPATH = '/TEI/teiHeader/fileDesc/sourceDesc/msDesc/physDesc/scriptDesc/scriptNote/text()'.freeze
DECO_XPATH = '/TEI/teiHeader/fileDesc/sourceDesc/msDesc/physDesc/decoDesc/decoNote[not(@n)]/text()'.freeze
RESOURCE_XPATH = '/TEI/teiHeader/fileDesc/notesStmt/note[@type = "relatedResource"]/text()'.freeze
PROVENANCE_XPATH = '/TEI/teiHeader/fileDesc/sourceDesc/msDesc/history/provenance/text()'.freeze
562
+
563
##
# Create an array of notes. Simple notes are returned as they are;
# physical description and related notes are returned with a prefix:
#
#     Binding: The binding note.
#     Layout: The layout note.
#
# Notes are gathered in this order: simple notes, Binding, Layout,
# Script, Decoration, Related resource, Provenance.
#
# @param [Nokogiri::XML::Node] xml the TEI xml
# @return [Array<String>]
def extract_notes(xml)
  note_sources = [
    [SIMPLE_NOTE_XPATH, nil],
    [BINDING_XPATH, "Binding"],
    [LAYOUT_XPATH, "Layout"],
    [SCRIPT_XPATH, "Script"],
    [DECO_XPATH, "Decoration"],
    [RESOURCE_XPATH, "Related resource"],
    [PROVENANCE_XPATH, "Provenance"]
  ]

  note_sources.flat_map do |note_xpath, label|
    build_notes(xml, note_xpath, prefix: label)
  end
end
585
+
586
# One or more whitespace characters.
WHITESPACE_RE = /\s+/
# A pipe together with any whitespace on either side of it.
MEDIAL_PIPE_RE = /\s*\|\s*/
588
+
589
##
# Collect the normalized strings found at +xpath+ and optionally
# prepend a prefix to each one. With a non-blank prefix every note is
# returned as:
#
#     "#{prefix}: Note text"
#
# @param [Nokogiri::XML::Node] xml the TEI xml
# @param [String] xpath the xpath for the note(s)
# @param [String] prefix value to prepend to the note; default: +nil+
# @return [Array<String>]
def build_notes(xml, xpath, prefix: nil)
  label = ''
  label = "#{prefix}: " unless prefix.blank?

  note_texts = extract_normalized_strings(xml, xpath)
  note_texts.map do |note_text|
    "#{label}#{note_text}"
  end
end
605
+
606
+ #########################################################################
607
+ # Holding information
608
+ #########################################################################
609
+
610
# Name of the holding institution: the normalized text of the first
# +msIdentifier/institution+ or +msIdentifier/repository+ element
# selected by the XPath.
#
# @param [Nokogiri::XML::Node] record the TEI XML record
# @return [String, nil] the holding institution, or +nil+ if none is found
def extract_holding_institution(record)
  institution_xpath = '(//msIdentifier/institution|//msIdentifier/repository)[1]'
  matches = extract_normalized_strings(record, institution_xpath)
  matches.first
end
618
+
619
# Holding institution's identifier for the record: the normalized text
# of the first +altIdentifier[@type="bibid"]/idno+ under +msIdentifier+.
#
# NOTE: the method name is spelled "nummber"; the spelling is kept
# unchanged here so that existing callers keep working.
#
# @param [Nokogiri::XML::Node] record the TEI XML record
# @return [String, nil] the institution's id number, or +nil+ if none is found
def extract_holding_institution_id_nummber(record)
  bibid_xpath = '/TEI/teiHeader/fileDesc/sourceDesc/msDesc/msIdentifier/altIdentifier[@type="bibid"]/idno'
  matches = extract_normalized_strings(record, bibid_xpath)
  matches.first
end
627
+
628
# Shelfmark of the manuscript: the normalized text of the first
# +msIdentifier/idno[@type="call-number"]+ element.
#
# @param [Nokogiri::XML::Node] record the TEI XML record
# @return [String, nil] the shelfmark, or +nil+ if none is found
def extract_shelfmark(record)
  call_number_xpath = '/TEI/teiHeader/fileDesc/sourceDesc/msDesc/msIdentifier/idno[@type="call-number"]'
  matches = extract_normalized_strings(record, call_number_xpath)
  matches.first
end
636
+
637
# Link to the institution's record: the normalized text of the first
# +idno+ matched under an +altIdentifier[@type="resource"]+ element.
#
# @param [Nokogiri::XML::Node] record the TEI XML record
# @return [String, nil] the link to the record, or +nil+ if none is found
def extract_link_to_record(record)
  resource_xpath = '//altIdentifier[@type="resource"][1]/idno'
  matches = extract_normalized_strings(record, resource_xpath)
  matches.first
end
645
+
646
+ #########################################################################
647
+ # Acknowledgments
648
+ #########################################################################
649
+
650
# Funder statements from +titleStmt/funder+, each prefixed with
# "Funder: ".
#
# @param [Nokogiri::XML::Node] record the TEI XML record
# @return [Array<String>] one "Funder: <name>" string per funder element
def extract_funder(record)
  funder_names = extract_normalized_strings(record, '/TEI/teiHeader/fileDesc/titleStmt/funder')
  funder_names.map do |funder_name|
    "Funder: #{funder_name}"
  end
end
658
+
659
# Acknowledgments for the record: one "<Role>: <name>" string for each
# name whose resp is in ACKNOWLEDGMENT_RESPS (cataloger, contributor),
# followed by the funder statements from +extract_funder+.
#
# @param [Nokogiri::XML::Node] record the TEI XML record
# @return [Array<String>] an array of acknowledgments extracted from the record
def extract_acknowledgments(record)
  credited = extract_resps(record, *ACKNOWLEDGMENT_RESPS)
  credit_lines = credited.map do |agent|
    "#{agent.role.capitalize}: #{agent.as_recorded}"
  end
  funder_lines = extract_funder(record)
  credit_lines + funder_lines
end
669
+
670
+ #########################################################################
671
+ # Utility methods
672
+ #########################################################################
673
+
674
# Evaluates +xpath+ against the record and returns the text of every
# matched node after passing it through +DS::Util.normalize_string+.
#
# @param [Nokogiri::XML::Node] record the record to query
# @param [String] xpath the xpath locating the nodes of interest
# @return [Array<String>] the normalized text of each matched node
def extract_normalized_strings(record, xpath)
  matched_nodes = record.xpath(xpath)
  matched_nodes.map do |matched|
    DS::Util.normalize_string(matched.text)
  end
end
682
+ end
683
+
684
# Expose the ClassMethods extractors as module-level methods.
extend ClassMethods
685
+ end
686
+ end
687
+ end