ds-convert 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +294 -0
  3. data/Rakefile +12 -0
  4. data/config/settings.yml +150 -0
  5. data/exe/ds-convert +149 -0
  6. data/exe/ds-recon +275 -0
  7. data/exe/ds-validate-csv +40 -0
  8. data/exe/marc-mrc-to-xml.rb +80 -0
  9. data/lib/ds/cli.rb +102 -0
  10. data/lib/ds/constants.rb +166 -0
  11. data/lib/ds/converter/converter.rb +124 -0
  12. data/lib/ds/converter/writer.rb +50 -0
  13. data/lib/ds/converter.rb +7 -0
  14. data/lib/ds/csv_util.rb +43 -0
  15. data/lib/ds/data/berkeley-arks.txt +4000 -0
  16. data/lib/ds/data/getty-aat-centuries.csv +71 -0
  17. data/lib/ds/data/iiif_manifests.csv +122 -0
  18. data/lib/ds/data/legacy-iiif-manifests.csv +77 -0
  19. data/lib/ds/ds_error.rb +1 -0
  20. data/lib/ds/extractor/base_record_locator.rb +24 -0
  21. data/lib/ds/extractor/base_term.rb +79 -0
  22. data/lib/ds/extractor/csv_record_locator.rb +13 -0
  23. data/lib/ds/extractor/ds_csv_extractor.rb +695 -0
  24. data/lib/ds/extractor/ds_mets_xml_extractor.rb +1114 -0
  25. data/lib/ds/extractor/genre.rb +45 -0
  26. data/lib/ds/extractor/language.rb +31 -0
  27. data/lib/ds/extractor/marc_xml_extractor.rb +1172 -0
  28. data/lib/ds/extractor/material.rb +12 -0
  29. data/lib/ds/extractor/name.rb +50 -0
  30. data/lib/ds/extractor/place.rb +11 -0
  31. data/lib/ds/extractor/subject.rb +58 -0
  32. data/lib/ds/extractor/tei_xml_extractor.rb +687 -0
  33. data/lib/ds/extractor/title.rb +52 -0
  34. data/lib/ds/extractor/xml_record_locator.rb +38 -0
  35. data/lib/ds/extractor.rb +24 -0
  36. data/lib/ds/institutions.rb +55 -0
  37. data/lib/ds/manifest/base_id_validator.rb +76 -0
  38. data/lib/ds/manifest/constants.rb +67 -0
  39. data/lib/ds/manifest/ds_csv_id_validator.rb +15 -0
  40. data/lib/ds/manifest/entry.rb +133 -0
  41. data/lib/ds/manifest/manifest.rb +74 -0
  42. data/lib/ds/manifest/manifest_validator.rb +256 -0
  43. data/lib/ds/manifest/simple_xml_id_validator.rb +42 -0
  44. data/lib/ds/manifest.rb +30 -0
  45. data/lib/ds/mapper/base_mapper.rb +221 -0
  46. data/lib/ds/mapper/ds_csv_mapper.rb +77 -0
  47. data/lib/ds/mapper/ds_mets_mapper.rb +85 -0
  48. data/lib/ds/mapper/marc_mapper.rb +87 -0
  49. data/lib/ds/mapper/tei_xml_mapper.rb +79 -0
  50. data/lib/ds/mapper.rb +13 -0
  51. data/lib/ds/recon/constants.rb +56 -0
  52. data/lib/ds/recon/ds_csv_enumerator.rb +16 -0
  53. data/lib/ds/recon/ds_mets_xml_enumerator.rb +14 -0
  54. data/lib/ds/recon/marc_xml_enumerator.rb +15 -0
  55. data/lib/ds/recon/recon_builder.rb +183 -0
  56. data/lib/ds/recon/recon_data.rb +37 -0
  57. data/lib/ds/recon/recon_manager.rb +92 -0
  58. data/lib/ds/recon/source_enumerator.rb +21 -0
  59. data/lib/ds/recon/tei_xml_enumerator.rb +14 -0
  60. data/lib/ds/recon/type/all_subjects.rb +18 -0
  61. data/lib/ds/recon/type/genres.rb +50 -0
  62. data/lib/ds/recon/type/languages.rb +38 -0
  63. data/lib/ds/recon/type/materials.rb +40 -0
  64. data/lib/ds/recon/type/named_subjects.rb +20 -0
  65. data/lib/ds/recon/type/names.rb +65 -0
  66. data/lib/ds/recon/type/places.rb +40 -0
  67. data/lib/ds/recon/type/recon_type.rb +136 -0
  68. data/lib/ds/recon/type/splits.rb +34 -0
  69. data/lib/ds/recon/type/subjects.rb +65 -0
  70. data/lib/ds/recon/type/titles.rb +38 -0
  71. data/lib/ds/recon/url_lookup.rb +52 -0
  72. data/lib/ds/recon.rb +292 -0
  73. data/lib/ds/source/base_source.rb +32 -0
  74. data/lib/ds/source/ds_csv.rb +18 -0
  75. data/lib/ds/source/ds_mets_xml.rb +20 -0
  76. data/lib/ds/source/marc_xml.rb +22 -0
  77. data/lib/ds/source/source_cache.rb +69 -0
  78. data/lib/ds/source/tei_xml.rb +22 -0
  79. data/lib/ds/source.rb +20 -0
  80. data/lib/ds/util/cache.rb +111 -0
  81. data/lib/ds/util/csv_validator.rb +209 -0
  82. data/lib/ds/util/csv_writer.rb +42 -0
  83. data/lib/ds/util/strings.rb +194 -0
  84. data/lib/ds/util.rb +37 -0
  85. data/lib/ds/version.rb +5 -0
  86. data/lib/ds.rb +237 -0
  87. metadata +246 -0
@@ -0,0 +1,695 @@
1
+ require 'csv'
2
+
3
module DS
  module Extractor
    ##
    # Extraction methods for rows of a DS CSV file.
    #
    # Every method takes a +record+ that is indexed by column header
    # (+record[header]+) and returns values read from the columns listed in
    # {COLUMN_MAPPINGS}. The methods are defined in {ClassMethods} and made
    # available on this module itself via +extend+.
    #
    # Cells may hold several values separated by pipe characters (+|+). For
    # names and titles, the as-recorded and vernacular forms of a single value
    # are separated by a double semicolon (+;;+).
    module DsCsvExtractor
      # Maps each property name to the CSV column header, or list of headers,
      # that its values are read from.
      COLUMN_MAPPINGS = {
        ds_id: "DS ID",
        holding_institution_as_recorded: "Holding Institution",
        source_type: "Source Type",
        cataloging_convention: "Cataloging Convention",
        holding_institution_id_number: "Holding Institution Identifier",
        holding_institution_shelfmark: "Shelfmark",
        fragment_num_disambiguator: "Fragment Number or Disambiguator",
        link_to_holding_institution_record: "Link to Institutional Record",
        link_to_iiif_manifest: "IIIF Manifest",
        production_places_as_recorded: "Production Place(s)",
        production_date_as_recorded: "Date Description",
        production_date_start: "Production Date START",
        production_date_end: "Production Date END",
        dated: "Dated",
        uniform_titles_as_recorded: "Uniform Title(s)",
        titles_as_recorded: "Title(s)",
        genres_as_recorded: "Genre/Form",
        all_subjects: [
          "Subject(s)",
          "Named Subject(s)",
        ].freeze,
        subjects_as_recorded: "Subject(s)",
        named_subjects_as_recorded: "Named Subject(s)",
        authors_as_recorded: "Author Name(s)",
        artists_as_recorded: "Artist Name(s)",
        scribes_as_recorded: "Scribe Name(s)",
        former_owners_as_recorded: "Former Owner Name(s)",
        languages_as_recorded: "Language(s)",
        material_as_recorded: "Materials Description",
        extent: "Extent",
        dimensions: "Dimensions",
        notes: [
          "Layout",
          "Script",
          "Decoration",
          "Binding",
          "Physical Description Miscellaneous",
          "Provenance Notes",
          "Note 1",
          "Note 2"
        ].freeze,
        acknowledgments: "Acknowledgements",
        date_source_modified: "Date Updated by Contributor",
      }.freeze

      # Prefix added by {ClassMethods#mark_long} to over-long values.
      LONG_STRING_WARNING = 'TEXT_EXCEEDS_400_CHARACTERS'.freeze

      # Longest value (in characters) that {ClassMethods#mark_long} returns
      # unmarked.
      MAX_STRING_LENGTH = 400

      module ClassMethods

        # Extracts the DS ID value from the given record.
        #
        # @param [CSV::Row] record the record to extract the DS ID from
        # @return [String, nil] the first DS ID value; nil if the cell is blank
        def extract_dsid(record)
          extract_values_for(property: :ds_id, record: record).first
        end

        # Extracts the source type value from the given record.
        #
        # @param [CSV::Row] record the record to extract the source type from
        # @return [String, nil] the first source type value
        def extract_source_type(record)
          extract_values_for(property: :source_type, record: record).first
        end

        # Extracts the cataloging convention value from the given record.
        #
        # @param [CSV::Row] record the record to extract the cataloging convention from
        # @return [String, nil] the first cataloging convention value
        def extract_cataloging_convention(record)
          extract_values_for(property: :cataloging_convention, record: record).first
        end

        # Extracts the holding institution name from the given record.
        #
        # @param [CSV::Row] record the record to extract the holding institution from
        # @return [String, nil] the first holding institution value
        def extract_holding_institution_as_recorded(record)
          extract_values_for(property: :holding_institution_as_recorded, record: record).first
        end

        # Extracts the institutional identifier (e.g., BibID) from the given record.
        #
        # @param [CSV::Row] record the record to extract the identifier from
        # @return [String, nil] the institutional identifier for the manuscript
        def extract_holding_institution_id_number(record)
          extract_values_for(property: :holding_institution_id_number, record: record).first
        end

        # Extracts the holding institution shelfmark from the given record.
        #
        # @param [CSV::Row] record the record to extract the shelfmark from
        # @return [String, nil] the first shelfmark value
        def extract_holding_institution_shelfmark(record)
          extract_values_for(property: :holding_institution_shelfmark, record: record).first
        end

        # Extracts the fragment number or disambiguator value from the given record.
        #
        # @param [CSV::Row] record the record to extract the fragment number or disambiguator from
        # @return [String, nil] the first fragment number or disambiguator value
        def extract_fragment_num_disambiguator(record)
          extract_values_for(property: :fragment_num_disambiguator, record: record).first
        end

        # Extracts the link to the holding institution record from the given record.
        #
        # @param [CSV::Row] record the record to extract the link from
        # @return [String, nil] the first link to the holding institution record
        def extract_link_to_holding_institution_record(record)
          extract_values_for(property: :link_to_holding_institution_record, record: record).first
        end

        # Extracts the link to the IIIF manifest from the given record.
        #
        # @param [CSV::Row] record the record to extract the link from
        # @return [String, nil] the first link to the IIIF manifest
        def extract_link_to_iiif_manifest(record)
          extract_values_for(property: :link_to_iiif_manifest, record: record).first
        end

        # Extracts the production date as recorded from the given record. When
        # the date description column is blank, falls back to the start and
        # end dates joined with a hyphen.
        #
        # @param [CSV::Row] record the record to extract the production date from
        # @return [Array<String>] the extracted production dates
        def extract_production_date_as_recorded(record)
          dar = extract_values_for(property: :production_date_as_recorded, record: record)
          return dar if dar.present?

          extract_date_range(record, range_sep: '-')
        end

        # Extracts the date range from the given record using the specified
        # separator. A lone start or end date is returned on its own.
        #
        # @param [CSV::Row] record the record to extract the date range from
        # @param [String] range_sep the separator to be used in the date range
        # @return [Array<String>] a one-element array holding the range, or an
        #   empty array if neither date is present
        def extract_date_range(record, range_sep:)
          range = [
            extract_production_date_start(record),
            extract_production_date_end(record)
          ].select(&:present?)
          return [] if range.blank?

          [range.join(range_sep)]
        end

        # Extracts the production date start value from the given record.
        #
        # @param [CSV::Row] record the record to extract the production date start from
        # @return [String, nil] the first production date start value
        def extract_production_date_start(record)
          extract_values_for(property: :production_date_start, record: record).first
        end

        # Extracts the production date end value from the given record.
        #
        # @param [CSV::Row] record the record to extract the production date end from
        # @return [String, nil] the first production date end value
        def extract_production_date_end(record)
          extract_values_for(property: :production_date_end, record: record).first
        end

        # Extracts the dated value from the given record.
        #
        # @note Returns +nil+, not +false+, for any value other than 'true';
        #   the return value is kept as-is for existing callers.
        #
        # @param [CSV::Row] record the record to extract the dated value from
        # @return [true, nil] true if the dated value is 'true' (ignoring case
        #   and surrounding whitespace), nil otherwise
        def extract_dated(record)
          dated = extract_values_for(property: :dated, record: record)
          true if dated.join.strip.downcase == 'true'
        end

        # Extracts the physical description from the given record by joining
        # the extent, material and dimensions values.
        #
        # @param [CSV::Row] record the record to extract the physical description from
        # @return [Array<String>] a one-element array holding the description,
        #   or an empty array if no values are present
        def extract_physical_description(record)
          extent     = extract_values_for(property: :extent, record: record)
          material   = extract_values_for(property: :material_as_recorded, record: record)
          dimensions = extract_dimensions(record)

          # Pipe-splitting preserves empty values (e.g. "parchment|"); drop
          # them so the joined description has no empty "; ;" segments.
          desc = [extent, material, dimensions].flatten.select(&:present?)
          return [] if desc.empty?

          ["Extent: #{desc.join('; ')}"]
        end

        # Extracts the dimensions from the given record.
        #
        # @param [CSV::Row] record the record to extract the dimensions from
        # @return [Array<String>] the extracted dimensions
        def extract_dimensions(record)
          extract_values_for(property: :dimensions, record: record)
        end

        # Extracts authors as recorded from the given record.
        #
        # @param [CSV::Row] record the record to extract authors from
        # @return [Array<String>] the extracted authors as recorded
        def extract_authors_as_recorded(record)
          extract_authors(record).map(&:as_recorded)
        end

        # Extracts the vernacular forms of the authors from the given record.
        #
        # @param [CSV::Row] record the record to extract authors from
        # @return [Array<String, nil>] the vernacular form of each author; nil
        #   where a name has none
        def extract_authors_as_recorded_agr(record)
          extract_authors(record).map(&:vernacular)
        end

        # Extracts authors from the given record, with the role 'author'.
        #
        # @param [CSV::Row] record the record to extract authors from
        # @return [Array<DS::Extractor::Name>] the extracted authors
        def extract_authors(record)
          extract_names(record, :authors_as_recorded, 'author')
        end

        # Extracts artists as recorded from the given record.
        #
        # @param [CSV::Row] record the record to extract artists from
        # @return [Array<String>] the extracted artists as recorded
        def extract_artists_as_recorded(record)
          extract_artists(record).map(&:as_recorded)
        end

        # Extracts the vernacular forms of the artists from the given record.
        #
        # @param [CSV::Row] record the record to extract artists from
        # @return [Array<String, nil>] the vernacular form of each artist; nil
        #   where a name has none
        def extract_artists_as_recorded_agr(record)
          extract_artists(record).map(&:vernacular)
        end

        # Extracts artists from the given record, with the role 'artist'.
        #
        # @param [CSV::Row] record the record to extract artists from
        # @return [Array<DS::Extractor::Name>] the extracted artists
        def extract_artists(record)
          extract_names(record, :artists_as_recorded, 'artist')
        end

        # Extracts scribes as recorded from the given record.
        #
        # @param [CSV::Row] record the record to extract scribes from
        # @return [Array<String>] the extracted scribes as recorded
        def extract_scribes_as_recorded(record)
          extract_scribes(record).map(&:as_recorded)
        end

        # Extracts the vernacular forms of the scribes from the given record.
        #
        # @param [CSV::Row] record the record to extract scribes from
        # @return [Array<String, nil>] the vernacular form of each scribe; nil
        #   where a name has none
        def extract_scribes_as_recorded_agr(record)
          extract_scribes(record).map(&:vernacular)
        end

        # Extracts scribes from the given record, with the role 'scribe'.
        #
        # @param [CSV::Row] record the record to extract scribes from
        # @return [Array<DS::Extractor::Name>] the extracted scribes
        def extract_scribes(record)
          extract_names(record, :scribes_as_recorded, 'scribe')
        end

        # Extracts former owners as recorded from the given record.
        #
        # @param [CSV::Row] record the record to extract former owners from
        # @return [Array<String>] the extracted former owners as recorded
        def extract_former_owners_as_recorded(record)
          extract_former_owners(record).map(&:as_recorded)
        end

        # Extracts the vernacular forms of the former owners from the given record.
        #
        # @param [CSV::Row] record the record to extract former owners from
        # @return [Array<String, nil>] the vernacular form of each former
        #   owner; nil where a name has none
        def extract_former_owners_as_recorded_agr(record)
          extract_former_owners(record).map(&:vernacular)
        end

        # Extracts former owners from the given record, with the role
        # 'former owner'.
        #
        # @param [CSV::Row] record the record to extract former owners from
        # @return [Array<DS::Extractor::Name>] the extracted former owners
        def extract_former_owners(record)
          extract_names(record, :former_owners_as_recorded, 'former owner')
        end

        # Extracts associated agents from the given record.
        #
        # @note Method to fulfill DS::Extractor contract; returns an empty array
        #
        # @param [CSV::Row] record the record (unused)
        # @return [Array<String>] an empty array
        def extract_associated_agents(record)
          []
        end

        # Extracts languages as recorded from the given record.
        #
        # @param [CSV::Row] record the record to extract languages from
        # @return [Array<String>] the extracted languages as recorded
        def extract_languages_as_recorded(record)
          extract_languages(record).map(&:as_recorded)
        end

        # Extracts languages from the given record.
        #
        # @param [CSV::Row] record the record to extract languages from
        # @return [Array<DS::Extractor::Language>] the extracted languages
        def extract_languages(record)
          extract_values_for(property: :languages_as_recorded, record: record).map { |lang|
            DS::Extractor::Language.new(as_recorded: lang)
          }
        end

        # Extracts material as recorded from the given record.
        #
        # @param [CSV::Row] record the record to extract material from
        # @return [String] the material values joined with pipe characters;
        #   an empty string if there are none
        def extract_material_as_recorded(record)
          extract_materials(record).map(&:as_recorded).join('|')
        end

        # Extracts materials from the given record.
        #
        # @param [CSV::Row] record the record to extract materials from
        # @return [Array<DS::Extractor::Material>] the extracted materials
        def extract_materials(record)
          extract_values_for(property: :material_as_recorded, record: record).map { |as_recorded|
            DS::Extractor::Material.new(as_recorded: as_recorded)
          }
        end

        # Extracts titles as recorded from the given record.
        #
        # @param [CSV::Row] record the record to extract titles from
        # @return [Array<String>] the extracted titles as recorded
        # @raise [ArgumentError] see {#extract_titles}
        def extract_titles_as_recorded(record)
          extract_titles(record).map(&:as_recorded)
        end

        # Extracts the vernacular forms of the titles from the given record.
        #
        # @param [CSV::Row] record the record to extract titles from
        # @return [Array<String, nil>] the vernacular form of each title; nil
        #   where a title has none
        # @raise [ArgumentError] see {#extract_titles}
        def extract_titles_as_recorded_agr(record)
          extract_titles(record).map(&:vernacular)
        end

        # Extracts uniform titles as recorded from the given record.
        #
        # @param [CSV::Row] record the record to extract uniform titles from
        # @return [Array<String>] the extracted uniform titles as recorded
        def extract_uniform_titles_as_recorded(record)
          extract_uniform_titles(record).map(&:uniform_title)
        end

        # Extracts the vernacular forms of the uniform titles from the given record.
        #
        # @param [CSV::Row] record the record to extract uniform titles from
        # @return [Array<String, nil>] the vernacular form of each uniform
        #   title; nil where a title has none
        def extract_uniform_titles_as_recorded_agr(record)
          extract_uniform_titles(record).map(&:uniform_title_vernacular)
        end

        ##
        # Return titles as an array of DS::Extractor::Title instances.
        # Title as recorded and vernacular values are in single columns:
        #
        #     Uniform Title(s)
        #     Al-Hajj;;الجزء التاسع
        #
        # Titles are divided by pipe characters and as recorded and
        # vernacular forms of a title are separated by double semicolons:
        # +;;+.
        #
        # Titles and uniform titles are paired by position. If the record has
        # no titles, a single empty title is used so that a lone uniform title
        # can still be paired.
        #
        # @param [CSV::Row] record a CSV row with headers
        # @return [Array<DS::Extractor::Title>] the titles as a list
        # @raise [ArgumentError] if uniform titles are present and their
        #   number differs from the number of titles
        def extract_titles(record)
          as_recorded_titles = extract_values_for(property: :titles_as_recorded, record: record)
          uniform_titles     = extract_values_for(property: :uniform_titles_as_recorded, record: record)
          as_recorded_titles << '' if as_recorded_titles.blank?

          unless balanced_titles?(as_recorded_titles, uniform_titles)
            raise ArgumentError, "Unbalanced number of titles and uniform titles (titles: #{as_recorded_titles.inspect}, uniform titles: #{uniform_titles.inspect})"
          end

          as_recorded_titles.zip(uniform_titles).map { |as_rec, uniform|
            as_recorded, vernacular = as_rec.split(';;', 2)
            # uniform is nil when the record has no uniform titles
            uniform_title, uniform_title_vernacular = uniform.to_s.split(';;', 2)
            DS::Extractor::Title.new(
              as_recorded:              as_recorded,
              vernacular:               vernacular,
              uniform_title:            uniform_title,
              uniform_title_vernacular: uniform_title_vernacular
            )
          }
        end

        # Return true if there are no uniform titles or if the as_recorded and
        # uniform titles are of equal length.
        #
        # @param [Array<String>] as_recorded_titles
        # @param [Array<String>] uniform_titles
        # @return [Boolean]
        def balanced_titles?(as_recorded_titles, uniform_titles)
          return true if uniform_titles.blank?

          as_recorded_titles.size == uniform_titles.size
        end

        ##
        # Extracts uniform titles from the given record.
        #
        # Note: BaseTerm implementations require +as_recorded+; for DS
        # CSV we don't assume that the Title(s) and Uniform Titles(s)
        # are paralleled so they're handled separately, and +as_recorded+
        # is set to nil here.
        #
        # @todo: Find out whether we should enforce that Titles and
        #   Uniform Titles be evenly paired.
        #
        # @param [CSV::Row] record the record to extract uniform titles from
        # @return [Array<DS::Extractor::Title>] the extracted uniform titles
        def extract_uniform_titles(record)
          extract_values_for(property: :uniform_titles_as_recorded, record: record).map { |title|
            uniform_title, uniform_title_vernacular = title.to_s.split(';;', 2)
            DS::Extractor::Title.new(
              as_recorded:              nil,
              uniform_title:            uniform_title,
              uniform_title_vernacular: uniform_title_vernacular
            )
          }
        end

        ##
        # Return names as an array of DS::Extractor::Name instances. Name
        # as recorded and vernacular values are in single columns:
        #
        #     Author Name(s)
        #     An author;;An author in original script|Another author
        #
        # Names are divided by pipe characters and as recorded and
        # vernacular forms of a name are separated by double semicolons:
        # +;;+.
        #
        # @param [CSV::Row] record a CSV row with headers
        # @param [Symbol] property a valid property name; e.g., +:artists_as_recorded+
        # @param [String] role the name role; e.g., +artist+
        # @return [Array<DS::Extractor::Name>] the names as a list
        def extract_names(record, property, role)
          extract_values_for(property: property, record: record).map { |name|
            as_recorded, vernacular = name.to_s.split(';;', 2)
            DS::Extractor::Name.new(as_recorded: as_recorded, vernacular: vernacular, role: role)
          }
        end

        # Extracts production places as recorded from the given record.
        #
        # @param [CSV::Row] record the record to extract production places from
        # @return [Array<String>] the extracted production places as recorded
        def extract_production_places_as_recorded(record)
          extract_places(record, :production_places_as_recorded).map(&:as_recorded)
        end

        # Extracts places from the given record using the specified property.
        #
        # @param [CSV::Row] record the record to extract places from
        # @param [Symbol] property the property to extract places from the record
        # @return [Array<DS::Extractor::Place>] the extracted places
        def extract_places(record, property = :production_places_as_recorded)
          extract_values_for(property: property, record: record).map { |place|
            DS::Extractor::Place.new(as_recorded: place)
          }
        end

        # Extracts genres as recorded from the given record.
        #
        # @param [CSV::Row] record the record to extract genres from
        # @return [Array<String>] the extracted genres as recorded
        def extract_genres_as_recorded(record)
          extract_genres(record).map(&:as_recorded)
        end

        # Extracts genres from the given record, with the vocabulary 'ds-genre'.
        #
        # @param [CSV::Row] record the record to extract genres from
        # @return [Array<DS::Extractor::Genre>] the extracted genres
        def extract_genres(record)
          extract_terms(record, :genres_as_recorded, DS::Extractor::Genre, vocab: 'ds-genre')
        end

        # Extracts subjects as recorded from the given record.
        #
        # @param [CSV::Row] record the record to extract subjects from
        # @return [Array<String>] the extracted subjects as recorded
        def extract_subjects_as_recorded(record)
          extract_subjects(record).map(&:as_recorded)
        end

        # Extracts all subjects as recorded from the given record.
        #
        # @param [CSV::Row] record the record to extract all subjects from
        # @return [Array<String>] the subjects followed by the named subjects,
        #   as recorded
        def extract_all_subjects_as_recorded(record)
          extract_all_subjects(record).map(&:as_recorded)
        end

        # Extracts all subjects from the given record: the subjects followed
        # by the named subjects.
        #
        # @param [CSV::Row] record the record to extract all subjects from
        # @return [Array<DS::Extractor::Subject>] the extracted subjects
        def extract_all_subjects(record)
          extract_subjects(record) + extract_named_subjects(record)
        end

        # Extracts subjects from the given record, with the vocabulary
        # 'ds-subject'.
        #
        # @param [CSV::Row] record the record to extract subjects from
        # @return [Array<DS::Extractor::Subject>] the extracted subjects
        def extract_subjects(record)
          extract_terms(record, :subjects_as_recorded, DS::Extractor::Subject, vocab: 'ds-subject')
        end

        # Extracts named subjects as recorded from the given record.
        #
        # @param [CSV::Row] record the record to extract named subjects from
        # @return [Array<String>] the extracted named subjects as recorded
        def extract_named_subjects_as_recorded(record)
          extract_named_subjects(record).map(&:as_recorded)
        end

        # Extracts named subjects from the given record, with the vocabulary
        # 'ds-subject'.
        #
        # @param [CSV::Row] record the record to extract named subjects from
        # @return [Array<DS::Extractor::Subject>] the extracted named subjects
        def extract_named_subjects(record)
          extract_terms(record, :named_subjects_as_recorded, DS::Extractor::Subject, vocab: 'ds-subject')
        end

        # Extracts terms of a specific type from the given record using the
        # specified property.
        #
        # @param [CSV::Row] record the record to extract terms from
        # @param [Symbol] property the property to extract terms from the record
        # @param [Class] term_type the class to instantiate for each value; it
        #   is called with +as_recorded:+ and +vocab:+ keywords
        # @param [String, nil] vocab the vocabulary passed to each new term
        # @return [Array<term_type>] the extracted terms
        def extract_terms(record, property, term_type, vocab: nil)
          extract_values_for(property: property, record: record).map { |term|
            term_type.new(as_recorded: term, vocab: vocab)
          }
        end

        # Extracts acknowledgments from the given record.
        #
        # @param [CSV::Row] record the record to extract acknowledgments from
        # @return [Array<String>] the extracted acknowledgments
        def extract_acknowledgments(record)
          extract_values_for(property: :acknowledgments, record: record)
        end

        # Extracts recon places from the given record.
        #
        # @param [CSV::Row] record the record to extract recon places from
        # @return [Array<Array>] the +to_a+ form of each production place
        def extract_recon_places(record)
          extract_places(record, :production_places_as_recorded).map(&:to_a)
        end

        # Extracts recon titles from the given record.
        #
        # @param [CSV::Row] record the record to extract recon titles from
        # @return [Array<Array>] the +to_a+ form of each title
        # @raise [ArgumentError] see {#extract_titles}
        def extract_recon_titles(record)
          extract_titles(record).map(&:to_a)
        end

        # Extracts recon subjects from the given record; includes both
        # subjects and named subjects.
        #
        # @param [CSV::Row] record the record to extract recon subjects from
        # @return [Array<Array>] the +to_a+ form of each subject
        def extract_recon_subjects(record)
          extract_all_subjects(record).map(&:to_a)
        end

        # Extracts recon genres from the given record.
        #
        # @param [CSV::Row] record the record to extract recon genres from
        # @return [Array<Array>] the +to_a+ form of each genre
        def extract_recon_genres(record)
          extract_genres(record).map(&:to_a)
        end

        # Extracts recon names from the given record: authors, artists,
        # scribes and former owners, in that order.
        #
        # @param [CSV::Row] record the record to extract recon names from
        # @return [Array<Array>] the +to_a+ form of each name
        def extract_recon_names(record)
          [
            extract_authors(record),
            extract_artists(record),
            extract_scribes(record),
            extract_former_owners(record)
          ].flat_map { |names| names.map(&:to_a) }
        end

        # Extracts values for a specific property from a record. Values from
        # all of the property's columns are combined, and each value is passed
        # through {#mark_long}.
        #
        # @param [Symbol] property the property to extract values for
        # @param [CSV::Row] record the record containing the values
        # @return [Array<String>] the extracted values
        # @raise [RuntimeError] if +property+ is not a key of COLUMN_MAPPINGS
        def extract_values_for(property:, record:)
          raise "Unknown property: #{property}" unless known_property?(property)

          headers = Array(COLUMN_MAPPINGS[property.to_sym])
          values  = headers.flat_map { |header|
            # blank cells yield nil; contribute nothing for them
            extract_values_for_header(header: header, record: record) || []
          }
          values.map { |value| mark_long(value) }
        end

        # Extracts the values for a specific header from a record, splitting
        # on '|' and stripping whitespace.
        #
        # @param [CSV::Row] record the record containing the values
        # @param [String] header the header to extract values for
        # @return [Array<String>, nil] the extracted values; nil if the cell
        #   is blank
        def extract_values_for_header(header:, record:)
          return unless record[header].present?

          # use split -1 to preserve empty values
          record[header].to_s.split('|', -1).map(&:strip)
        end

        # Determines if a method name maps to a property.
        #
        # @param [String, Symbol] method_name the method name to check
        # @return [Boolean, nil] true if the method name corresponds to a
        #   known property, false if it does not; nil if the name does not
        #   begin with +extract_+
        def maps_to_property?(method_name)
          prop_name = get_property_name(method_name)
          return unless prop_name

          known_property?(prop_name)
        end

        # Determines if a property is known, that is, a key of COLUMN_MAPPINGS.
        #
        # @param [Symbol, String] property the property to check
        # @return [Boolean] true if the property is known, false otherwise
        def known_property?(property)
          COLUMN_MAPPINGS.key?(property.to_sym)
        end

        # Determines the property name extracted from the method name; e.g.,
        # +extract_notes+ gives +notes+.
        #
        # @param [String, Symbol] method_name the method name to extract the
        #   property name from
        # @return [String, nil] the extracted property name or nil if the
        #   method name does not begin with +extract_+
        def get_property_name(method_name)
          name = method_name.to_s
          # \A anchors at the start of the string; ^ would also match after a
          # newline
          return unless name.match?(/\Aextract_\w+/)

          name.delete_prefix('extract_')
        end

        # Extracts notes from the given record. Values from the 'Note' and
        # 'Physical Description' columns are returned unchanged; provenance
        # values are prefixed with 'Provenance: '; values from any other notes
        # column are prefixed with the column header. Each note is passed
        # through {#mark_long}.
        #
        # @param [CSV::Row] record the record to extract notes from
        # @return [Array<String>] the extracted notes
        def extract_notes(record)
          notes = COLUMN_MAPPINGS[:notes].flat_map { |header|
            vals = extract_values_for_header(header: header, record: record)
            next [] unless vals

            case header
            when /^(Note|Physical description)/i
              vals
            when /^Provenance/
              vals.map { |v| "Provenance: #{v}" }
            else
              vals.map { |v| "#{header}: #{v}" }
            end
          }
          notes.map { |note| mark_long(note) }
        end

        # Prefixes a string with LONG_STRING_WARNING if it is longer than
        # MAX_STRING_LENGTH characters.
        #
        # @param [String, nil] s the string to check
        # @return [String, nil] +s+ unchanged if it is blank or short enough;
        #   otherwise the warning, a colon and +s+
        def mark_long(s)
          return s if s.blank? || s.length <= MAX_STRING_LENGTH

          "#{LONG_STRING_WARNING}: #{s}"
        end

      end

      extend ClassMethods
    end
  end
end