ds-convert 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +294 -0
  3. data/Rakefile +12 -0
  4. data/config/settings.yml +150 -0
  5. data/exe/ds-convert +149 -0
  6. data/exe/ds-recon +275 -0
  7. data/exe/ds-validate-csv +40 -0
  8. data/exe/marc-mrc-to-xml.rb +80 -0
  9. data/lib/ds/cli.rb +102 -0
  10. data/lib/ds/constants.rb +166 -0
  11. data/lib/ds/converter/converter.rb +124 -0
  12. data/lib/ds/converter/writer.rb +50 -0
  13. data/lib/ds/converter.rb +7 -0
  14. data/lib/ds/csv_util.rb +43 -0
  15. data/lib/ds/data/berkeley-arks.txt +4000 -0
  16. data/lib/ds/data/getty-aat-centuries.csv +71 -0
  17. data/lib/ds/data/iiif_manifests.csv +122 -0
  18. data/lib/ds/data/legacy-iiif-manifests.csv +77 -0
  19. data/lib/ds/ds_error.rb +1 -0
  20. data/lib/ds/extractor/base_record_locator.rb +24 -0
  21. data/lib/ds/extractor/base_term.rb +79 -0
  22. data/lib/ds/extractor/csv_record_locator.rb +13 -0
  23. data/lib/ds/extractor/ds_csv_extractor.rb +695 -0
  24. data/lib/ds/extractor/ds_mets_xml_extractor.rb +1114 -0
  25. data/lib/ds/extractor/genre.rb +45 -0
  26. data/lib/ds/extractor/language.rb +31 -0
  27. data/lib/ds/extractor/marc_xml_extractor.rb +1172 -0
  28. data/lib/ds/extractor/material.rb +12 -0
  29. data/lib/ds/extractor/name.rb +50 -0
  30. data/lib/ds/extractor/place.rb +11 -0
  31. data/lib/ds/extractor/subject.rb +58 -0
  32. data/lib/ds/extractor/tei_xml_extractor.rb +687 -0
  33. data/lib/ds/extractor/title.rb +52 -0
  34. data/lib/ds/extractor/xml_record_locator.rb +38 -0
  35. data/lib/ds/extractor.rb +24 -0
  36. data/lib/ds/institutions.rb +55 -0
  37. data/lib/ds/manifest/base_id_validator.rb +76 -0
  38. data/lib/ds/manifest/constants.rb +67 -0
  39. data/lib/ds/manifest/ds_csv_id_validator.rb +15 -0
  40. data/lib/ds/manifest/entry.rb +133 -0
  41. data/lib/ds/manifest/manifest.rb +74 -0
  42. data/lib/ds/manifest/manifest_validator.rb +256 -0
  43. data/lib/ds/manifest/simple_xml_id_validator.rb +42 -0
  44. data/lib/ds/manifest.rb +30 -0
  45. data/lib/ds/mapper/base_mapper.rb +221 -0
  46. data/lib/ds/mapper/ds_csv_mapper.rb +77 -0
  47. data/lib/ds/mapper/ds_mets_mapper.rb +85 -0
  48. data/lib/ds/mapper/marc_mapper.rb +87 -0
  49. data/lib/ds/mapper/tei_xml_mapper.rb +79 -0
  50. data/lib/ds/mapper.rb +13 -0
  51. data/lib/ds/recon/constants.rb +56 -0
  52. data/lib/ds/recon/ds_csv_enumerator.rb +16 -0
  53. data/lib/ds/recon/ds_mets_xml_enumerator.rb +14 -0
  54. data/lib/ds/recon/marc_xml_enumerator.rb +15 -0
  55. data/lib/ds/recon/recon_builder.rb +183 -0
  56. data/lib/ds/recon/recon_data.rb +37 -0
  57. data/lib/ds/recon/recon_manager.rb +92 -0
  58. data/lib/ds/recon/source_enumerator.rb +21 -0
  59. data/lib/ds/recon/tei_xml_enumerator.rb +14 -0
  60. data/lib/ds/recon/type/all_subjects.rb +18 -0
  61. data/lib/ds/recon/type/genres.rb +50 -0
  62. data/lib/ds/recon/type/languages.rb +38 -0
  63. data/lib/ds/recon/type/materials.rb +40 -0
  64. data/lib/ds/recon/type/named_subjects.rb +20 -0
  65. data/lib/ds/recon/type/names.rb +65 -0
  66. data/lib/ds/recon/type/places.rb +40 -0
  67. data/lib/ds/recon/type/recon_type.rb +136 -0
  68. data/lib/ds/recon/type/splits.rb +34 -0
  69. data/lib/ds/recon/type/subjects.rb +65 -0
  70. data/lib/ds/recon/type/titles.rb +38 -0
  71. data/lib/ds/recon/url_lookup.rb +52 -0
  72. data/lib/ds/recon.rb +292 -0
  73. data/lib/ds/source/base_source.rb +32 -0
  74. data/lib/ds/source/ds_csv.rb +18 -0
  75. data/lib/ds/source/ds_mets_xml.rb +20 -0
  76. data/lib/ds/source/marc_xml.rb +22 -0
  77. data/lib/ds/source/source_cache.rb +69 -0
  78. data/lib/ds/source/tei_xml.rb +22 -0
  79. data/lib/ds/source.rb +20 -0
  80. data/lib/ds/util/cache.rb +111 -0
  81. data/lib/ds/util/csv_validator.rb +209 -0
  82. data/lib/ds/util/csv_writer.rb +42 -0
  83. data/lib/ds/util/strings.rb +194 -0
  84. data/lib/ds/util.rb +37 -0
  85. data/lib/ds/version.rb +5 -0
  86. data/lib/ds.rb +237 -0
  87. metadata +246 -0
@@ -0,0 +1,256 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'csv'
4
+ require 'uri'
5
+ require 'date'
6
+ require_relative './constants'
7
+
8
+ module DS
9
+ module Manifest
10
+ ##
11
+ # Validate a DS input manifest.
12
+ #
13
+ # Validation does the following:
14
+ #
15
+ # - Confirms all required columns are present
16
+ # - Confirms all required values are present
17
+ # - Confirms all column values are the correct type
18
+ # - Confirms all listed input files are present
19
+ # - Confirms all listed input files match the record
20
+ # identifier provided in the manifest
21
+ #
22
+ # @todo Add test for live URLs
23
+ class ManifestValidator
24
+ include DS::Manifest::Constants
25
+
26
+ attr_reader :manifest
27
+ attr_reader :source_dir
28
+ attr_reader :errors
29
+
30
+ URI_REGEXP = URI::DEFAULT_PARSER.make_regexp %w{http https}
31
+ QID_REGEXP = %r{\AQ\d+\z}
32
+
33
+ ##
34
+ # @param [DS::Manifest] manifest DS::Manifest instance
35
+ # @return [DS::Manifest::ManifestValidator]
36
+ def initialize manifest
37
+ @manifest = manifest
38
+ @errors = []
39
+ @id_validators = {}
40
+ end
41
+
42
+ ##
43
+ # @return [boolean] true if the manifest is valid
44
+ def valid?
45
+ return false unless validate_columns
46
+ return false unless validate_required_values
47
+ return false unless validate_ids_unique
48
+ return false unless validate_data_types
49
+ return false unless validate_files_exist
50
+ return false unless validate_records_present
51
+ true
52
+ end
53
+
54
+ ##
55
+ # @return [boolean] true if all required columns are present
56
+ def validate_columns
57
+ found_columns = manifest.headers
58
+ diff = MANIFEST_COLUMNS - found_columns
59
+ return true if diff.blank?
60
+ add_error "Manifest missing required columns: #{diff.join ', '}" if diff.present?
61
+ false
62
+ end
63
+
64
+ ##
65
+ # @return [boolean] true if all required values are present
66
+ def validate_required_values
67
+ is_valid = true
68
+ manifest.each_with_index do |row, ndx|
69
+ REQUIRED_VALUES.each do |col|
70
+ if row[col].blank?
71
+ add_error "Required value missing in row: #{ndx+1}, col.: #{col}"
72
+ is_valid = false
73
+ end
74
+ end
75
+ end
76
+ is_valid
77
+ end
78
+
79
+ ##
80
+ # @return [boolean] true if all data types are valid
81
+ def validate_data_types
82
+ is_valid = true
83
+ manifest.each_with_index do |entry, row_num|
84
+ is_valid = false unless validate_urls entry, row_num
85
+ is_valid = false unless validate_qids entry, row_num
86
+ is_valid = false unless validate_dates entry, row_num
87
+ end
88
+ is_valid
89
+ end
90
+
91
+ ##
92
+ # @return [boolean] true if all listed input files are present
93
+ def validate_files_exist
94
+ is_valid = true
95
+ manifest.each_with_index do |entry, row_num|
96
+ file_path = File.join manifest.source_dir, entry.filename
97
+ unless File.exist? file_path
98
+ is_valid = false
99
+ add_error "Source file not found row: #{row_num+1}; source directory: #{source_dir}; file: #{entry.filename}"
100
+ end
101
+ end
102
+ is_valid
103
+ end
104
+
105
+ # Validates the uniqueness of all IDs in the manifest.
106
+ #
107
+ # This method collects the count of all IDs in the manifest and selects those with a count greater than 1.
108
+ # It then iterates over the multiples and adds an error for each duplicate ID found.
109
+ #
110
+ # Returns:
111
+ # - `true` if no duplicate IDs are found.
112
+ # - `false` if duplicate IDs are found.
113
+ def validate_ids_unique
114
+ # collect the count of all ids and select those with a count > 1
115
+ multiples = manifest.inject({}) { |h, id|
116
+ h[id] ||= 0; h[id] += 1; h
117
+ }.filter_map { |id, count|
118
+ [id, count] if count > 1
119
+ }
120
+
121
+ return true if multiples.blank?
122
+
123
+ multiples.each do |id, count|
124
+ add_error "Duplicate ID found in manifest: ID '#{id}' found in #{count} rows"
125
+ end
126
+ false
127
+ end
128
+
129
+ ##
130
+ # @return [boolean] true if all +holding_institution_institutional_id+
131
+ # values match source file
132
+ def validate_records_present
133
+ is_valid = true
134
+ manifest.each_with_index do |entry, row_num|
135
+ file_path = File.join manifest.source_dir, entry.filename
136
+
137
+ inst_id = entry.institutional_id
138
+ id_validator = get_id_validator entry.source_type
139
+ found = id_validator.valid? file_path, inst_id, entry.institutional_id_location_in_source
140
+
141
+ unless found
142
+ is_valid = false
143
+ id_validator.errors.each { |error| add_error error }
144
+ end
145
+ end
146
+ is_valid
147
+ end
148
+
149
+ # Handles the error when the count of records found for a given `inst_id` and `location_in_source` is
150
+ # 0 or more than 1.
151
+ #
152
+ # @param count [Integer] the number of records found for the given `inst_id` and `location_in_source`
153
+ # @param inst_id [String] the identifier of the record
154
+ # @param location_in_source [String] the location in the source where the record is found
155
+ # @return [nil]
156
+ def handle_count_error count, inst_id, location_in_source
157
+ return if count == 1
158
+
159
+ if count > 1
160
+ add_error "ERROR: Multiple records (#{count}) found for id: #{inst_id} (location: #{location_in_source})"
161
+ elsif count == 0
162
+ add_error "ERROR: No records found for id: #{inst_id} (location: #{location_in_source})"
163
+ end
164
+ nil
165
+ end
166
+
167
+ ####################################
168
+ # Type validations
169
+ ####################################
170
+ def validate_source_type entry, row_num
171
+ is_valid = true
172
+
173
+ unless source_types.include? entry.source_type
174
+ add_error "Invalid source type in row: #{row_num+1}; expected one of #{VALID_SOURCE_TYPES.join ', '}; got: '#{entry.source_type}'"
175
+ is_valid = false
176
+ end
177
+ is_valid
178
+ end
179
+
180
+ def validate_urls entry, row_num
181
+ is_valid = true
182
+ URI_COLUMNS.each do |col|
183
+ if entry[col].present? && entry[col].to_s !~ URI_REGEXP
184
+ add_error "Invalid URL in row: #{row_num+1}; col.: #{col}: '#{entry[col]}'"
185
+ is_valid = false
186
+ end
187
+ end
188
+ is_valid
189
+ end
190
+
191
+ def validate_qids entry, row_num
192
+ is_valid = true
193
+ QID_COLUMNS.each do |col|
194
+ unless entry[col].to_s =~ QID_REGEXP
195
+ is_valid = false
196
+ add_error "Invalid QID in row: #{row_num+1}; col.: #{col}: '#{entry[col]}'"
197
+ end
198
+ end
199
+ is_valid
200
+ end
201
+
202
+ def validate_dates entry, row_num
203
+ is_valid = true
204
+ DATE_TIME_COLUMNS.each do |col|
205
+ next if entry[col].blank?
206
+ begin
207
+ Date.parse entry[col]
208
+ rescue Date::Error
209
+ is_valid = false
210
+ add_error "Invalid date in row: #{row_num+1}, col.: #{col}: '#{entry[col]}'"
211
+ end
212
+ end
213
+ is_valid
214
+ end
215
+
216
+ # Adds an error message to the list of errors.
217
+ #
218
+ # @param message [String] the error message to add
219
+ # @return [void]
220
+ def add_error message
221
+ @errors << message
222
+ end
223
+
224
+ # Checks if there are any errors in the errors collection.
225
+ #
226
+ # @return [Boolean] true if there are errors, false otherwise
227
+ def has_errors?
228
+ errors.any?
229
+ end
230
+
231
+ # Retrieves the appropriate ID validator for the given source type.
232
+ #
233
+ # @param source_type [Symbol] the type of the source
234
+ # @return [DS::Manifest::BaseIdValidator] the ID validator for the source type
235
+ # @raise [NotImplementedError] if the source type is not implemented
236
+ def get_id_validator source_type
237
+ case source_type
238
+ when DS::Constants::MARC_XML
239
+ @id_validators[source_type] ||= SimpleXmlIdValidator.new(DS::Source::MarcXML.new)
240
+ when DS::Constants::DS_METS
241
+ @id_validators[source_type] ||= SimpleXmlIdValidator.new(DS::Source::DSMetsXML.new)
242
+ when DS::Constants::TEI_XML
243
+ @id_validators[source_type] ||= SimpleXmlIdValidator.new(DS::Source::TeiXML.new)
244
+ when DS::Constants::DS_CSV
245
+ @id_validators[source_type] ||= DsCsvIdValidator.new(DS::Source::DSCSV.new)
246
+ else
247
+ raise NotImplementedError, "validate_ids not implemented for: #{source_type}"
248
+ end
249
+ end
250
+
251
+ def source_types
252
+ VALID_SOURCE_TYPES
253
+ end
254
+ end
255
+ end
256
+ end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DS
4
+ module Manifest
5
+ class SimpleXmlIdValidator < BaseIdValidator
6
+
7
+ attr_accessor :namespaces
8
+
9
+ def initialize source, namespaces = {}
10
+ @namespaces = namespaces.present? ? namespaces : DS::Constants::XML_NAMESPACES
11
+ super source
12
+ end
13
+
14
+ # Locates a record in the XML document based on the given source path, ID, and ID location.
15
+ #
16
+ # +id_location+ should be a template XPath expression that
17
+ # returns one or more records, for example:
18
+ #
19
+ # "//record[controlfield[@tag='001'] = 'ID_PLACEHOLDER']"
20
+ #
21
+ # The string 'ID_PLACEHOLDER' must be in the template. It will
22
+ # be replaced with the ID of the record to locate.
23
+ #
24
+ # @param source_path [String] the path to the XML source file
25
+ # @param id [String] the ID of the record to locate
26
+ # @param id_location [String] the XPath expression to locate the record
27
+ # @return [Nokogiri::XML::NodeSet] the located record(s)
28
+ def locate_record source_path, id, id_location
29
+ locator = DS::Extractor::XmlRecordLocator.new namespaces: namespaces
30
+ xml = source.load_source source_path
31
+ locator.locate_record xml, id, id_location
32
+ end
33
+
34
+ def try_locate_record xml, xpath, namespaces: nil
35
+ xml.xpath xpath, namespaces
36
+ rescue Nokogiri::XML::XPath::SyntaxError => e
37
+ raise unless e.message =~ /undefined namespace prefix/i
38
+ []
39
+ end
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+
4
+ require_relative 'manifest/constants'
5
+ require_relative 'manifest/entry'
6
+ require_relative 'manifest/manifest'
7
+ require_relative 'manifest/base_id_validator'
8
+ require_relative 'manifest/simple_xml_id_validator'
9
+ require_relative 'manifest/ds_csv_id_validator'
10
+ require_relative 'manifest/manifest_validator'
11
+
12
+ module DS
13
+ ##
14
+ # {DS::Manifest} comprises classes and a module (DS::Manifest::Constants)
15
+ # for encapsulating and validating a DS delivery manifest CSV.
16
+ #
17
+ # The manifest CSV provides all information needed to ingest a set of
18
+ # source records. This information is detailed in
19
+ # {DS::Manifest::Manifest} and {DS::Manifest::Entry}.
20
+ #
21
+ # The {DS::Manifest::ManifestValidator} validates
22
+ # that the Manifest is completed and well-formed, and that all records
23
+ # can be found in the specified source directory.
24
+ #
25
+ # The valid {DS::Manifest::Manifest} is used by {DS::Converter::Converter}
26
+ # to orchestrate mapping of source record data for the creation of
27
+ # the DS import CSV.
28
+ module Manifest
29
+ end
30
+ end
@@ -0,0 +1,221 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DS
4
+ module Mapper
5
+ # DS::Mapper::BaseMapper abstract Mapper class. Implementing classes
6
+ # map DS sources records to CSV rows.
7
+ #
8
+ # Implementing classes must implement:
9
+ #
10
+ # - `#extract_record`
11
+ # - `#map_record`
12
+ #
13
+ class BaseMapper
14
+ attr_reader :timestamp
15
+ attr_reader :source_dir
16
+ attr_reader :source
17
+ attr_reader :recon_builder
18
+
19
+ PLACES_COLUMN_MAP = {
20
+ production_place_as_recorded: :place_as_recorded,
21
+ production_place_ds_qid: :ds_qid
22
+ }
23
+
24
+ TITLES_COLUMN_MAP = {
25
+ title_as_recorded: :title_as_recorded,
26
+ title_as_recorded_agr: :title_as_recorded_agr,
27
+ uniform_title_as_recorded: :uniform_title_as_recorded,
28
+ uniform_title_agr: :uniform_title_as_recorded_agr,
29
+ standard_title_ds_qid: :ds_qid
30
+ }
31
+
32
+ GENRES_COLUMN_MAP = {
33
+ genre_as_recorded: :genre_as_recorded,
34
+ genre_ds_qid: :ds_qid
35
+ }
36
+
37
+ SUBJECTS_COLUMN_MAP = {
38
+ subject_as_recorded: :subject_as_recorded,
39
+ subject_ds_qid: :ds_qid,
40
+ }
41
+
42
+ AUTHORS_COLUMN_MAP = {
43
+ author_as_recorded: :name_as_recorded,
44
+ author_as_recorded_agr: :name_agr,
45
+ author_ds_qid: :ds_qid
46
+
47
+ }
48
+
49
+ ARTISTS_COLUMN_MAP = {
50
+ artist_as_recorded: :name_as_recorded,
51
+ artist_as_recorded_agr: :name_agr,
52
+ artist_ds_qid: :ds_qid
53
+ }
54
+
55
+ SCRIBES_COLUMN_MAP = {
56
+ scribe_as_recorded: :name_as_recorded,
57
+ scribe_as_recorded_agr: :name_agr,
58
+ scribe_ds_qid: :ds_qid
59
+ }
60
+
61
+ ASSOCIATED_AGENT_COLUMN_MAP = {
62
+ associated_agent_as_recorded: :name_as_recorded,
63
+ associated_agent_as_recorded_agr: :name_agr,
64
+ associated_agent_ds_qid: :ds_qid
65
+ }
66
+
67
+ LANGUAGE_COLUMN_MAP = {
68
+ language_as_recorded: :language_as_recorded,
69
+ language_ds_qid: :ds_qid
70
+ }
71
+
72
+ FORMER_OWNER_COLUMN_MAP = {
73
+ former_owner_as_recorded: :name_as_recorded,
74
+ former_owner_as_recorded_agr: :name_agr,
75
+ former_owner_ds_qid: :ds_qid
76
+ }
77
+
78
+ MATERIAL_COLUMN_MAP = {
79
+ material_as_recorded: :material_as_recorded,
80
+ material_ds_qid: :ds_qid
81
+ }
82
+
83
+ # Maps recon type to column map
84
+ RECON_TYPE_COLUMN_MAP = {
85
+ Recon::Type::Places => PLACES_COLUMN_MAP,
86
+ Recon::Type::Titles => TITLES_COLUMN_MAP,
87
+ Recon::Type::Genres => GENRES_COLUMN_MAP,
88
+ Recon::Type::AllSubjects => SUBJECTS_COLUMN_MAP,
89
+ Recon::Type::Authors => AUTHORS_COLUMN_MAP,
90
+ Recon::Type::Artists => ARTISTS_COLUMN_MAP,
91
+ Recon::Type::Scribes => SCRIBES_COLUMN_MAP,
92
+ Recon::Type::AssociatedAgents => ASSOCIATED_AGENT_COLUMN_MAP,
93
+ Recon::Type::Languages => LANGUAGE_COLUMN_MAP,
94
+ Recon::Type::FormerOwners => FORMER_OWNER_COLUMN_MAP,
95
+ Recon::Type::Materials => MATERIAL_COLUMN_MAP,
96
+ }.freeze
97
+
98
+ # Initializes a new instance of the class.
99
+ #
100
+ # @param source_dir [String] the directory where the source files are located
101
+ # @param timestamp [Time] the timestamp of the source files
102
+ # @param source [DS::Source::BaseSource] the source object
103
+ # @return [void]
104
+ def initialize source_dir:, timestamp:, source:
105
+ @recon_builder = Recon::ReconBuilder.new source_type: source.source_type, files: [], out_dir: []
106
+ @source = source
107
+ @source_dir = source_dir
108
+ @timestamp = timestamp
109
+ end
110
+
111
+ def to_s # :nodoc:
112
+ "#{self.class.name}: source_dir: #{source_dir}, timestamp: #{timestamp}, source: #{source}"
113
+ end
114
+
115
+ # Extracts a record from the source for the given manifest entry.
116
+ #
117
+ # @param [DS::Manifest::Entry] entry the entry representing one row in a manifest
118
+ # @return [Object] the extracted record; e.g., a Nokogiri::XML::Node or CSV::Row
119
+ # @raise [NotImplementedError] if the method is not implemented in a subclass
120
+ def extract_record entry
121
+ raise NotImplementedError
122
+ end
123
+
124
+ # Maps a source record for the given manifest entry.
125
+ #
126
+ # @param [DS::Manifest::Entry] entry the entry representing one row in a manifest
127
+ # @return [Hash<Symbol, String>] the mapped record
128
+ # @raise [NotImplementedError] if the method is not implemented in a subclass
129
+ def map_record entry
130
+ raise NotImplementedError
131
+ end
132
+
133
+ # Builds term strings based on the given recons and column mapping.
134
+ #
135
+ #
136
+ # @example
137
+ # recons = [
138
+ # { as_recorded: 'Brown, Jamie', authorized_label: 'Jamie Brown' },
139
+ # { as_recorded: 'Hendrix, Morgan', authorized_label: 'Morgan Hendrix' }
140
+ # ]
141
+ # column_map = { author_as_recorded: :as_recorded, author_label: :authorized_label }
142
+ # build_term_strings(recons, column_map)
143
+ # # => { author_as_recorded: 'Brown, Jamie|Hendrix, Morgan', author_label: 'Jamie Brown|Morgan Hendrix' }
144
+ #
145
+ # @param [Array<Hash>] recons the recons to build term strings from
146
+ # @param [Hash] column_map a mapping of import CSV columns to recon keys
147
+ # @return [Hash] a hash with import CSV columns as keys and corresponding term strings as values
148
+ def build_term_strings recons, column_map
149
+ column_map.inject({}) do |hash, (import_csv_col, recon_key)|
150
+ hash[import_csv_col] = build_term_string recons, recon_key
151
+ hash
152
+ end
153
+ end
154
+
155
+ # Creates an import CSV hash for the given record for all recon
156
+ # term types, using the given extractor. The extractor is one
157
+ # of
158
+ #
159
+ # DS::Extractor::MarcXml
160
+ # DS::Extractor::TeiXml
161
+ # DS::Extractor::DsCsvExtractor
162
+ # DS::Extractor::DsMetsXml
163
+ #
164
+ # The following recon term types are mapped for all
165
+ # records/extractors:
166
+ #
167
+ # Recon::Type::Places
168
+ # Recon::Type::Titles
169
+ # Recon::Type::Genres
170
+ # Recon::Type::Subjects
171
+ # Recon::Type::Authors
172
+ # Recon::Type::Artists
173
+ # Recon::Type::Scribes
174
+ # Recon::Type::AssociatedAgents
175
+ # Recon::Type::Languages
176
+ # Recon::Type::FormerOwners
177
+ # Recon::Type::Materials
178
+ #
179
+ # Column mappings are defined in DS::Mapper::RECON_TYPE_COLUMN_MAP
180
+ #
181
+ # @param [DS::Extractor::MarcXml, DS::Extractor::TeiXml, DS::Extractor::DsCsvExtractor, DS::Extractor::DsMetsXml] extractor the extractor object
182
+ # @param [Nokogiri::XML::Node, CSV::Row] record the record to extract terms from
183
+ # @return [Hash<Symbol, String>] a hash of terms mapped to import CSV columns
184
+ def build_term_maps extractor, record
185
+ RECON_TYPE_COLUMN_MAP.inject({}) { |hash, (recon_type, column_map)|
186
+ terms = recon_type.method_name.flat_map { |method| extractor.send(method, record) }
187
+ hash.update map_terms terms, recon_type, column_map
188
+ }
189
+ end
190
+
191
+ # Builds a term string by concatenating the values of the given recon hashes
192
+ # corresponding to the specified recon key, separated by '|'.
193
+ #
194
+ # @example
195
+ # recons = [
196
+ # { as_recorded: 'Brown, Jamie', authorized_label: 'Jamie Brown' },
197
+ # { as_recorded: 'Hendrix, Morgan', authorized_label: 'Morgan Hendrix' }
198
+ # ]
199
+ # build_term_string(recons, :as_recorded) # => 'Brown, Jamie|Hendrix, Morgan'
200
+ #
201
+ # @param recons [Array<Hash>] The array of recons hashes
202
+ # @param recon_key [String, Symbol] The key used to access the values in each recon hash
203
+ # @return [String] The concatenated term string.
204
+ def build_term_string recons, recon_key
205
+ recons.map { |recon| recon[recon_key.to_sym] }.join('|')
206
+ end
207
+
208
+ # Maps the given terms using the given recon type and column mapping.
209
+ #
210
+ # @param terms [Array<DS::Extractor::BaseTerm>] an array of terms to map
211
+ # @param recon_type [Recon::Type::ReconType] a recon type configuration
212
+ # @param column_map [Hash] a mapping of import CSV columns to recon keys
213
+ # @return [Hash<Symbol,String>] an hash of mapped terms; e.g., { :author_as_recorded => 'Brown, Jamie|Hendrix, Morgan', :author_label => 'Jamie Brown|Morgan Hendrix', ... } for an array of mapped terms
214
+ def map_terms terms, recon_type, column_map
215
+ recons = recon_builder.build_all_recons terms, recon_type
216
+ term_strings = build_term_strings recons, column_map
217
+ term_strings
218
+ end
219
+ end
220
+ end
221
+ end
@@ -0,0 +1,77 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DS
4
+ module Mapper
5
+
6
+ class DSCSVMapper < DS::Mapper::BaseMapper
7
+
8
+
9
+ def initialize(source_dir:, timestamp:)
10
+ super(
11
+ source_dir: source_dir,
12
+ timestamp: timestamp,
13
+ source: DS::Source::DSCSV.new,
14
+ )
15
+ end
16
+
17
+ def map_record entry
18
+ record = extract_record entry
19
+ source_type = 'ds-csv'
20
+ source_file = entry.filename
21
+ ds_id = entry.ds_id
22
+ date_added = ''
23
+ date_last_updated = ''
24
+ dated = entry.dated?
25
+ cataloging_convention = DS::Extractor::DsCsvExtractor.extract_cataloging_convention(record)
26
+ holding_institution_ds_qid = entry.institution_ds_qid
27
+ holding_institution_as_recorded = entry.institution_wikidata_label
28
+ holding_institution_id_number = entry.institutional_id
29
+ holding_institution_shelfmark = entry.call_number
30
+ link_to_holding_institution_record = entry.link_to_institutional_record
31
+ iiif_manifest = entry.iiif_manifest_url
32
+ production_date_as_recorded = DS::Extractor::DsCsvExtractor.extract_production_date_as_recorded(record).join '|'
33
+ production_date = DS::Extractor::DsCsvExtractor.extract_date_range(record, range_sep: '^').join '|'
34
+ century = DS.transform_dates_to_centuries production_date
35
+ century_aat = DS.transform_centuries_to_aat century
36
+ physical_description = DS::Extractor::DsCsvExtractor.extract_physical_description(record).join '|'
37
+ note = DS::Extractor::DsCsvExtractor.extract_notes(record).join '|'
38
+ data_processed_at = timestamp
39
+ data_source_modified = entry.record_last_updated
40
+ acknowledgments = DS::Extractor::DsCsvExtractor.extract_acknowledgments(record).join '|'
41
+
42
+ {
43
+ ds_id: ds_id,
44
+ date_added: date_added,
45
+ date_last_updated: date_last_updated,
46
+ dated: dated,
47
+ source_type: source_type,
48
+ cataloging_convention: cataloging_convention,
49
+ holding_institution_ds_qid: holding_institution_ds_qid,
50
+ holding_institution_as_recorded: holding_institution_as_recorded,
51
+ holding_institution_id_number: holding_institution_id_number,
52
+ holding_institution_shelfmark: holding_institution_shelfmark,
53
+ link_to_holding_institution_record: link_to_holding_institution_record,
54
+ iiif_manifest: iiif_manifest,
55
+ production_date: production_date,
56
+ century: century,
57
+ century_aat: century_aat,
58
+ production_date_as_recorded: production_date_as_recorded,
59
+ physical_description: physical_description,
60
+ note: note,
61
+ data_processed_at: data_processed_at,
62
+ data_source_modified: data_source_modified,
63
+ source_file: source_file,
64
+ acknowledgments: acknowledgments
65
+ }.update build_term_maps DS::Extractor::DsCsvExtractor, record
66
+ end
67
+
68
+ def extract_record entry
69
+ locator = DS::Extractor::CsvRecordLocator.new
70
+ csv = source.load_source File.join(source_dir, entry.filename)
71
+ locator.locate_record(csv, entry.institutional_id, entry.institutional_id_location_in_source).first
72
+ end
73
+
74
+
75
+ end
76
+ end
77
+ end