ds-convert 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +294 -0
- data/Rakefile +12 -0
- data/config/settings.yml +150 -0
- data/exe/ds-convert +149 -0
- data/exe/ds-recon +275 -0
- data/exe/ds-validate-csv +40 -0
- data/exe/marc-mrc-to-xml.rb +80 -0
- data/lib/ds/cli.rb +102 -0
- data/lib/ds/constants.rb +166 -0
- data/lib/ds/converter/converter.rb +124 -0
- data/lib/ds/converter/writer.rb +50 -0
- data/lib/ds/converter.rb +7 -0
- data/lib/ds/csv_util.rb +43 -0
- data/lib/ds/data/berkeley-arks.txt +4000 -0
- data/lib/ds/data/getty-aat-centuries.csv +71 -0
- data/lib/ds/data/iiif_manifests.csv +122 -0
- data/lib/ds/data/legacy-iiif-manifests.csv +77 -0
- data/lib/ds/ds_error.rb +1 -0
- data/lib/ds/extractor/base_record_locator.rb +24 -0
- data/lib/ds/extractor/base_term.rb +79 -0
- data/lib/ds/extractor/csv_record_locator.rb +13 -0
- data/lib/ds/extractor/ds_csv_extractor.rb +695 -0
- data/lib/ds/extractor/ds_mets_xml_extractor.rb +1114 -0
- data/lib/ds/extractor/genre.rb +45 -0
- data/lib/ds/extractor/language.rb +31 -0
- data/lib/ds/extractor/marc_xml_extractor.rb +1172 -0
- data/lib/ds/extractor/material.rb +12 -0
- data/lib/ds/extractor/name.rb +50 -0
- data/lib/ds/extractor/place.rb +11 -0
- data/lib/ds/extractor/subject.rb +58 -0
- data/lib/ds/extractor/tei_xml_extractor.rb +687 -0
- data/lib/ds/extractor/title.rb +52 -0
- data/lib/ds/extractor/xml_record_locator.rb +38 -0
- data/lib/ds/extractor.rb +24 -0
- data/lib/ds/institutions.rb +55 -0
- data/lib/ds/manifest/base_id_validator.rb +76 -0
- data/lib/ds/manifest/constants.rb +67 -0
- data/lib/ds/manifest/ds_csv_id_validator.rb +15 -0
- data/lib/ds/manifest/entry.rb +133 -0
- data/lib/ds/manifest/manifest.rb +74 -0
- data/lib/ds/manifest/manifest_validator.rb +256 -0
- data/lib/ds/manifest/simple_xml_id_validator.rb +42 -0
- data/lib/ds/manifest.rb +30 -0
- data/lib/ds/mapper/base_mapper.rb +221 -0
- data/lib/ds/mapper/ds_csv_mapper.rb +77 -0
- data/lib/ds/mapper/ds_mets_mapper.rb +85 -0
- data/lib/ds/mapper/marc_mapper.rb +87 -0
- data/lib/ds/mapper/tei_xml_mapper.rb +79 -0
- data/lib/ds/mapper.rb +13 -0
- data/lib/ds/recon/constants.rb +56 -0
- data/lib/ds/recon/ds_csv_enumerator.rb +16 -0
- data/lib/ds/recon/ds_mets_xml_enumerator.rb +14 -0
- data/lib/ds/recon/marc_xml_enumerator.rb +15 -0
- data/lib/ds/recon/recon_builder.rb +183 -0
- data/lib/ds/recon/recon_data.rb +37 -0
- data/lib/ds/recon/recon_manager.rb +92 -0
- data/lib/ds/recon/source_enumerator.rb +21 -0
- data/lib/ds/recon/tei_xml_enumerator.rb +14 -0
- data/lib/ds/recon/type/all_subjects.rb +18 -0
- data/lib/ds/recon/type/genres.rb +50 -0
- data/lib/ds/recon/type/languages.rb +38 -0
- data/lib/ds/recon/type/materials.rb +40 -0
- data/lib/ds/recon/type/named_subjects.rb +20 -0
- data/lib/ds/recon/type/names.rb +65 -0
- data/lib/ds/recon/type/places.rb +40 -0
- data/lib/ds/recon/type/recon_type.rb +136 -0
- data/lib/ds/recon/type/splits.rb +34 -0
- data/lib/ds/recon/type/subjects.rb +65 -0
- data/lib/ds/recon/type/titles.rb +38 -0
- data/lib/ds/recon/url_lookup.rb +52 -0
- data/lib/ds/recon.rb +292 -0
- data/lib/ds/source/base_source.rb +32 -0
- data/lib/ds/source/ds_csv.rb +18 -0
- data/lib/ds/source/ds_mets_xml.rb +20 -0
- data/lib/ds/source/marc_xml.rb +22 -0
- data/lib/ds/source/source_cache.rb +69 -0
- data/lib/ds/source/tei_xml.rb +22 -0
- data/lib/ds/source.rb +20 -0
- data/lib/ds/util/cache.rb +111 -0
- data/lib/ds/util/csv_validator.rb +209 -0
- data/lib/ds/util/csv_writer.rb +42 -0
- data/lib/ds/util/strings.rb +194 -0
- data/lib/ds/util.rb +37 -0
- data/lib/ds/version.rb +5 -0
- data/lib/ds.rb +237 -0
- metadata +246 -0
@@ -0,0 +1,256 @@
|
|
1
|
+
# frozen_string_literal: true

require 'csv'
require 'uri'
require 'date'
require_relative './constants'

module DS
  module Manifest
    ##
    # Validate a DS input manifest.
    #
    # Validation does the following:
    #
    # - Confirms all required columns are present
    # - Confirms all required values are present
    # - Confirms all column values are the correct type
    # - Confirms all listed input files are present
    # - Confirms all listed input files match the record
    #   identifier provided in the manifest
    #
    # @todo Add test for live URLs
    class ManifestValidator
      include DS::Manifest::Constants

      attr_reader :manifest
      attr_reader :errors

      URI_REGEXP = URI::DEFAULT_PARSER.make_regexp %w{http https}
      QID_REGEXP = %r{\AQ\d+\z}

      ##
      # @param [DS::Manifest] manifest DS::Manifest instance
      # @return [DS::ManifestValidator]
      def initialize manifest
        @manifest      = manifest
        @errors        = []
        @id_validators = {}
      end

      # The directory containing the source files listed in the manifest.
      #
      # NOTE(review): the original declared +attr_reader :source_dir+ but
      # never assigned +@source_dir+, so error messages interpolated +nil+
      # for the source directory; delegating to the manifest (which
      # #validate_files_exist already uses to build file paths) fixes that.
      #
      # @return [String] the manifest's source directory
      def source_dir
        manifest.source_dir
      end

      ##
      # Run all validations, stopping at the first failing stage.
      #
      # @return [boolean] true if the manifest is valid
      def valid?
        return false unless validate_columns
        return false unless validate_required_values
        return false unless validate_ids_unique
        return false unless validate_data_types
        return false unless validate_files_exist
        return false unless validate_records_present
        true
      end

      ##
      # @return [boolean] true if all required columns are present
      def validate_columns
        missing = MANIFEST_COLUMNS - manifest.headers
        return true if missing.blank?

        add_error "Manifest missing required columns: #{missing.join ', '}"
        false
      end

      ##
      # @return [boolean] true if all required values are present
      def validate_required_values
        is_valid = true
        manifest.each_with_index do |row, ndx|
          REQUIRED_VALUES.each do |col|
            next unless row[col].blank?

            add_error "Required value missing in row: #{ndx+1}, col.: #{col}"
            is_valid = false
          end
        end
        is_valid
      end

      ##
      # Check URL, QID, and date columns for every manifest entry.
      #
      # @return [boolean] true if all data types are valid
      def validate_data_types
        is_valid = true
        manifest.each_with_index do |entry, row_num|
          is_valid = false unless validate_urls entry, row_num
          is_valid = false unless validate_qids entry, row_num
          is_valid = false unless validate_dates entry, row_num
        end
        is_valid
      end

      ##
      # @return [boolean] true if all listed input files are present
      def validate_files_exist
        is_valid = true
        manifest.each_with_index do |entry, row_num|
          file_path = File.join manifest.source_dir, entry.filename
          unless File.exist? file_path
            is_valid = false
            add_error "Source file not found row: #{row_num+1}; source directory: #{source_dir}; file: #{entry.filename}"
          end
        end
        is_valid
      end

      # Validates the uniqueness of all IDs in the manifest.
      #
      # Tallies every ID yielded by the manifest and reports an error for
      # each ID that occurs more than once.
      #
      # Returns:
      # - `true` if no duplicate IDs are found.
      # - `false` if duplicate IDs are found.
      def validate_ids_unique
        # count each id; Hash.new(0) gives a zero default for new keys
        counts    = manifest.each_with_object(Hash.new(0)) { |id, h| h[id] += 1 }
        multiples = counts.select { |_id, count| count > 1 }

        return true if multiples.blank?

        multiples.each do |id, count|
          add_error "Duplicate ID found in manifest: ID '#{id}' found in #{count} rows"
        end
        false
      end

      ##
      # @return [boolean] true if all +holding_institution_institutional_id+
      #   values match source file
      def validate_records_present
        is_valid = true
        manifest.each_with_index do |entry, row_num|
          file_path = File.join manifest.source_dir, entry.filename

          inst_id      = entry.institutional_id
          id_validator = get_id_validator entry.source_type
          found        = id_validator.valid? file_path, inst_id, entry.institutional_id_location_in_source

          unless found
            is_valid = false
            id_validator.errors.each { |error| add_error error }
          end
        end
        is_valid
      end

      # Handles the error when the count of records found for a given `inst_id` and `location_in_source` is
      # 0 or more than 1.
      #
      # NOTE(review): not called within this class; presumably retained for
      # use by subclasses or collaborators -- confirm before removing.
      #
      # @param count [Integer] the number of records found for the given `inst_id` and `location_in_source`
      # @param inst_id [String] the identifier of the record
      # @param location_in_source [String] the location in the source where the record is found
      # @return [nil]
      def handle_count_error count, inst_id, location_in_source
        return if count == 1

        if count > 1
          add_error "ERROR: Multiple records (#{count}) found for id: #{inst_id} (location: #{location_in_source})"
        elsif count == 0
          add_error "ERROR: No records found for id: #{inst_id} (location: #{location_in_source})"
        end
        nil
      end

      ####################################
      # Type validations
      ####################################

      # Confirm the entry's source type is one of the known types.
      #
      # @param entry [DS::Manifest::Entry] one manifest row
      # @param row_num [Integer] zero-based row index (reported 1-based)
      # @return [boolean] true if the source type is valid
      def validate_source_type entry, row_num
        is_valid = true

        unless source_types.include? entry.source_type
          add_error "Invalid source type in row: #{row_num+1}; expected one of #{VALID_SOURCE_TYPES.join ', '}; got: '#{entry.source_type}'"
          is_valid = false
        end
        is_valid
      end

      # Confirm every URI column that has a value matches an http(s) URL.
      # Blank values are allowed.
      #
      # @param entry [DS::Manifest::Entry] one manifest row
      # @param row_num [Integer] zero-based row index (reported 1-based)
      # @return [boolean] true if all URL values are valid
      def validate_urls entry, row_num
        is_valid = true
        URI_COLUMNS.each do |col|
          if entry[col].present? && entry[col].to_s !~ URI_REGEXP
            add_error "Invalid URL in row: #{row_num+1}; col.: #{col}: '#{entry[col]}'"
            is_valid = false
          end
        end
        is_valid
      end

      # Confirm every QID column matches +Q<digits>+.
      #
      # NOTE(review): unlike URLs and dates, blank QID values are rejected
      # here (a blank fails QID_REGEXP); this looks intentional -- QIDs are
      # treated as required -- but confirm.
      #
      # @param entry [DS::Manifest::Entry] one manifest row
      # @param row_num [Integer] zero-based row index (reported 1-based)
      # @return [boolean] true if all QID values are valid
      def validate_qids entry, row_num
        is_valid = true
        QID_COLUMNS.each do |col|
          unless entry[col].to_s =~ QID_REGEXP
            is_valid = false
            add_error "Invalid QID in row: #{row_num+1}; col.: #{col}: '#{entry[col]}'"
          end
        end
        is_valid
      end

      # Confirm every date/time column that has a value parses as a date.
      # Blank values are allowed.
      #
      # @param entry [DS::Manifest::Entry] one manifest row
      # @param row_num [Integer] zero-based row index (reported 1-based)
      # @return [boolean] true if all date values are valid
      def validate_dates entry, row_num
        is_valid = true
        DATE_TIME_COLUMNS.each do |col|
          next if entry[col].blank?
          begin
            Date.parse entry[col]
          rescue Date::Error
            is_valid = false
            add_error "Invalid date in row: #{row_num+1}, col.: #{col}: '#{entry[col]}'"
          end
        end
        is_valid
      end

      # Adds an error message to the list of errors.
      #
      # @param message [String] the error message to add
      # @return [void]
      def add_error message
        @errors << message
      end

      # Checks if there are any errors in the errors collection.
      #
      # @return [Boolean] true if there are errors, false otherwise
      def has_errors?
        errors.any?
      end

      # Retrieves the appropriate ID validator for the given source type.
      # Validators are memoized per source type in +@id_validators+.
      #
      # @param source_type [Symbol] the type of the source
      # @return [DS::Manifest::BaseIdValidator] the ID validator for the source type
      # @raise [NotImplementedError] if the source type is not implemented
      def get_id_validator source_type
        case source_type
        when DS::Constants::MARC_XML
          @id_validators[source_type] ||= SimpleXmlIdValidator.new(DS::Source::MarcXML.new)
        when DS::Constants::DS_METS
          @id_validators[source_type] ||= SimpleXmlIdValidator.new(DS::Source::DSMetsXML.new)
        when DS::Constants::TEI_XML
          @id_validators[source_type] ||= SimpleXmlIdValidator.new(DS::Source::TeiXML.new)
        when DS::Constants::DS_CSV
          @id_validators[source_type] ||= DsCsvIdValidator.new(DS::Source::DSCSV.new)
        else
          raise NotImplementedError, "validate_ids not implemented for: #{source_type}"
        end
      end

      # The set of recognized source types.
      #
      # @return [Array] the valid source type identifiers
      def source_types
        VALID_SOURCE_TYPES
      end
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# frozen_string_literal: true

module DS
  module Manifest
    # Validates record IDs in XML sources (MARC XML, DS METS, TEI) by
    # locating each record in its source document with an XPath template.
    class SimpleXmlIdValidator < BaseIdValidator

      attr_accessor :namespaces

      # @param source [DS::Source::BaseSource] the XML source loader
      # @param namespaces [Hash] prefix-to-URI namespace mappings; falls
      #   back to DS::Constants::XML_NAMESPACES when empty
      def initialize source, namespaces = {}
        @namespaces = namespaces.present? ? namespaces : DS::Constants::XML_NAMESPACES
        super source
      end

      # Locates a record in the XML document based on the given source path, ID, and ID location.
      #
      # +id_location+ should be a template XPath expression that
      # returns one or more records, for example:
      #
      #   "//record[controlfield[@tag='001'] = 'ID_PLACEHOLDER']"
      #
      # The string 'ID_PLACEHOLDER' must be in the template. It will
      # be replaced with the ID of the record to locate.
      #
      # @param source_path [String] the path to the XML source file
      # @param id [String] the ID of the record to locate
      # @param id_location [String] the XPath expression to locate the record
      # @return [Nokogiri::XML::NodeSet] the located record(s)
      def locate_record source_path, id, id_location
        locator = DS::Extractor::XmlRecordLocator.new namespaces: namespaces
        xml = source.load_source source_path
        locator.locate_record xml, id, id_location
      end

      # Evaluates +xpath+ against +xml+, returning an empty array when the
      # expression references an undefined namespace prefix; any other
      # XPath syntax error is re-raised.
      #
      # @param xml [Nokogiri::XML::Document] the parsed XML document
      # @param xpath [String] the XPath expression to evaluate
      # @param namespaces [Hash, nil] prefix-to-URI namespace mappings
      # @return [Nokogiri::XML::NodeSet, Array] matching nodes, or [] when
      #   the prefix is undefined
      def try_locate_record xml, xpath, namespaces: nil
        xml.xpath xpath, namespaces
      rescue Nokogiri::XML::XPath::SyntaxError => e
        raise unless e.message =~ /undefined namespace prefix/i
        []
      end
    end
  end
end
data/lib/ds/manifest.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# frozen_string_literal: true


require_relative 'manifest/constants'
require_relative 'manifest/entry'
require_relative 'manifest/manifest'
require_relative 'manifest/base_id_validator'
require_relative 'manifest/simple_xml_id_validator'
require_relative 'manifest/ds_csv_id_validator'
require_relative 'manifest/manifest_validator'

module DS
  ##
  # {DS::Manifest} comprises classes and a module (DS::Manifest::Constants)
  # for encapsulating and validating a DS delivery manifest CSV.
  #
  # The manifest CSV provides all information needed to ingest a set of
  # source records. This information is detailed in
  # {DS::Manifest::Manifest} and {DS::Manifest::Entry}.
  #
  # The {DS::Manifest::ManifestValidator} validates
  # that the Manifest is complete and well-formed, and that all records
  # can be found in the specified source directory.
  #
  # The valid {DS::Manifest::Manifest} is used by {DS::Converter::Converter}
  # to orchestrate mapping of source record data for the creation of
  # the DS import CSV.
  module Manifest
  end
end
|
@@ -0,0 +1,221 @@
|
|
1
|
+
# frozen_string_literal: true

module DS
  module Mapper
    # DS::Mapper::BaseMapper abstract Mapper class. Implementing classes
    # map DS sources records to CSV rows.
    #
    # Implementing classes must implement:
    #
    # - `#extract_record`
    # - `#map_record`
    #
    class BaseMapper
      attr_reader :timestamp
      attr_reader :source_dir
      attr_reader :source
      attr_reader :recon_builder

      # Each *_COLUMN_MAP maps import CSV column names to the keys of the
      # recon hashes produced for that term type. Frozen for consistency
      # with RECON_TYPE_COLUMN_MAP.
      PLACES_COLUMN_MAP = {
        production_place_as_recorded: :place_as_recorded,
        production_place_ds_qid: :ds_qid
      }.freeze

      TITLES_COLUMN_MAP = {
        title_as_recorded: :title_as_recorded,
        title_as_recorded_agr: :title_as_recorded_agr,
        uniform_title_as_recorded: :uniform_title_as_recorded,
        uniform_title_agr: :uniform_title_as_recorded_agr,
        standard_title_ds_qid: :ds_qid
      }.freeze

      GENRES_COLUMN_MAP = {
        genre_as_recorded: :genre_as_recorded,
        genre_ds_qid: :ds_qid
      }.freeze

      SUBJECTS_COLUMN_MAP = {
        subject_as_recorded: :subject_as_recorded,
        subject_ds_qid: :ds_qid,
      }.freeze

      AUTHORS_COLUMN_MAP = {
        author_as_recorded: :name_as_recorded,
        author_as_recorded_agr: :name_agr,
        author_ds_qid: :ds_qid
      }.freeze

      ARTISTS_COLUMN_MAP = {
        artist_as_recorded: :name_as_recorded,
        artist_as_recorded_agr: :name_agr,
        artist_ds_qid: :ds_qid
      }.freeze

      SCRIBES_COLUMN_MAP = {
        scribe_as_recorded: :name_as_recorded,
        scribe_as_recorded_agr: :name_agr,
        scribe_ds_qid: :ds_qid
      }.freeze

      ASSOCIATED_AGENT_COLUMN_MAP = {
        associated_agent_as_recorded: :name_as_recorded,
        associated_agent_as_recorded_agr: :name_agr,
        associated_agent_ds_qid: :ds_qid
      }.freeze

      LANGUAGE_COLUMN_MAP = {
        language_as_recorded: :language_as_recorded,
        language_ds_qid: :ds_qid
      }.freeze

      FORMER_OWNER_COLUMN_MAP = {
        former_owner_as_recorded: :name_as_recorded,
        former_owner_as_recorded_agr: :name_agr,
        former_owner_ds_qid: :ds_qid
      }.freeze

      MATERIAL_COLUMN_MAP = {
        material_as_recorded: :material_as_recorded,
        material_ds_qid: :ds_qid
      }.freeze

      # Maps recon type to column map
      RECON_TYPE_COLUMN_MAP = {
        Recon::Type::Places => PLACES_COLUMN_MAP,
        Recon::Type::Titles => TITLES_COLUMN_MAP,
        Recon::Type::Genres => GENRES_COLUMN_MAP,
        Recon::Type::AllSubjects => SUBJECTS_COLUMN_MAP,
        Recon::Type::Authors => AUTHORS_COLUMN_MAP,
        Recon::Type::Artists => ARTISTS_COLUMN_MAP,
        Recon::Type::Scribes => SCRIBES_COLUMN_MAP,
        Recon::Type::AssociatedAgents => ASSOCIATED_AGENT_COLUMN_MAP,
        Recon::Type::Languages => LANGUAGE_COLUMN_MAP,
        Recon::Type::FormerOwners => FORMER_OWNER_COLUMN_MAP,
        Recon::Type::Materials => MATERIAL_COLUMN_MAP,
      }.freeze

      # Initializes a new instance of the class.
      #
      # @param source_dir [String] the directory where the source files are located
      # @param timestamp [Time] the timestamp of the source files
      # @param source [DS::Source::BaseSource] the source object
      # @return [void]
      def initialize source_dir:, timestamp:, source:
        @recon_builder = Recon::ReconBuilder.new source_type: source.source_type, files: [], out_dir: []
        @source = source
        @source_dir = source_dir
        @timestamp = timestamp
      end

      def to_s # :nodoc:
        "#{self.class.name}: source_dir: #{source_dir}, timestamp: #{timestamp}, source: #{source}"
      end

      # Extracts a record from the source for the given manifest entry.
      #
      # @param [DS::Manifest::Entry] entry the entry representing one row in a manifest
      # @return [Object] the extracted record; e.g., a Nokogiri::XML::Node or CSV::Row
      # @raise [NotImplementedError] if the method is not implemented in a subclass
      def extract_record entry
        raise NotImplementedError
      end

      # Maps a source record for the given manifest entry.
      #
      # @param [DS::Manifest::Entry] entry the entry representing one row in a manifest
      # @return [Hash<Symbol, String>] the mapped record
      # @raise [NotImplementedError] if the method is not implemented in a subclass
      def map_record entry
        raise NotImplementedError
      end

      # Builds term strings based on the given recons and column mapping.
      #
      # @example
      #   recons = [
      #     { as_recorded: 'Brown, Jamie', authorized_label: 'Jamie Brown' },
      #     { as_recorded: 'Hendrix, Morgan', authorized_label: 'Morgan Hendrix' }
      #   ]
      #   column_map = { author_as_recorded: :as_recorded, author_label: :authorized_label }
      #   build_term_strings(recons, column_map)
      #   # => { author_as_recorded: 'Brown, Jamie|Hendrix, Morgan', author_label: 'Jamie Brown|Morgan Hendrix' }
      #
      # @param [Array<Hash>] recons the recons to build term strings from
      # @param [Hash] column_map a mapping of import CSV columns to recon keys
      # @return [Hash] a hash with import CSV columns as keys and corresponding term strings as values
      def build_term_strings recons, column_map
        column_map.each_with_object({}) do |(import_csv_col, recon_key), hash|
          hash[import_csv_col] = build_term_string recons, recon_key
        end
      end

      # Creates an import CSV hash for the given record for all recon
      # term types, using the given extractor. The extractor is one
      # of
      #
      #     DS::Extractor::MarcXml
      #     DS::Extractor::TeiXml
      #     DS::Extractor::DsCsvExtractor
      #     DS::Extractor::DsMetsXml
      #
      # The following recon term types are mapped for all
      # records/extractors:
      #
      #     Recon::Type::Places
      #     Recon::Type::Titles
      #     Recon::Type::Genres
      #     Recon::Type::Subjects
      #     Recon::Type::Authors
      #     Recon::Type::Artists
      #     Recon::Type::Scribes
      #     Recon::Type::AssociatedAgents
      #     Recon::Type::Languages
      #     Recon::Type::FormerOwners
      #     Recon::Type::Materials
      #
      # Column mappings are defined in DS::Mapper::RECON_TYPE_COLUMN_MAP
      #
      # @param [DS::Extractor::MarcXml, DS::Extractor::TeiXml, DS::Extractor::DsCsvExtractor, DS::Extractor::DsMetsXml] extractor the extractor object
      # @param [Nokogiri::XML::Node, CSV::Row] record the record to extract terms from
      # @return [Hash<Symbol, String>] a hash of terms mapped to import CSV columns
      def build_term_maps extractor, record
        RECON_TYPE_COLUMN_MAP.inject({}) { |hash, (recon_type, column_map)|
          terms = recon_type.method_name.flat_map { |method| extractor.send(method, record) }
          hash.update map_terms terms, recon_type, column_map
        }
      end

      # Builds a term string by concatenating the values of the given recons
      # corresponding to the specified recon key, separated by '|'.
      #
      # @example
      #   recons = [
      #     { as_recorded: 'Brown, Jamie', authorized_label: 'Jamie Brown' },
      #     { as_recorded: 'Hendrix, Morgan', authorized_label: 'Morgan Hendrix' }
      #   ]
      #   build_term_string(recons, :as_recorded) # => 'Brown, Jamie|Hendrix, Morgan'
      #
      # @param recons [Array<Hash>] The array of recons hashes
      # @param recon_key [String, Symbol] The key used to access the values in each recon hash
      # @return [String] The concatenated term string.
      def build_term_string recons, recon_key
        recons.map { |recon| recon[recon_key.to_sym] }.join('|')
      end

      # Maps the given terms using the given recon type and column mapping.
      #
      # @param terms [Array<DS::Extractor::BaseTerm>] an array of terms to map
      # @param recon_type [Recon::Type::ReconType] a recon type configuration
      # @param column_map [Hash] a mapping of import CSV columns to recon keys
      # @return [Hash<Symbol,String>] a hash of mapped terms; e.g., { :author_as_recorded => 'Brown, Jamie|Hendrix, Morgan', :author_label => 'Jamie Brown|Morgan Hendrix', ... }
      def map_terms terms, recon_type, column_map
        recons = recon_builder.build_all_recons terms, recon_type
        build_term_strings recons, column_map
      end
    end
  end
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
# frozen_string_literal: true

module DS
  module Mapper

    # Maps records from a DS CSV source file to import CSV rows.
    class DSCSVMapper < DS::Mapper::BaseMapper

      # Create a mapper backed by a DS::Source::DSCSV source.
      #
      # @param source_dir [String] directory containing the source CSV files
      # @param timestamp [Time] timestamp recorded as +data_processed_at+
      def initialize(source_dir:, timestamp:)
        super(
          source_dir: source_dir,
          timestamp: timestamp,
          source: DS::Source::DSCSV.new,
        )
      end

      # Map the source record for +entry+ to an import CSV row hash,
      # merging in the recon term columns from +build_term_maps+.
      #
      # @param entry [DS::Manifest::Entry] the entry representing one row in a manifest
      # @return [Hash<Symbol, String>] the mapped record
      def map_record entry
        record    = extract_record entry
        extractor = DS::Extractor::DsCsvExtractor

        # The century columns are derived from the pipe-joined date range.
        production_date = extractor.extract_date_range(record, range_sep: '^').join '|'
        century         = DS.transform_dates_to_centuries production_date

        row = {
          ds_id: entry.ds_id,
          date_added: '',
          date_last_updated: '',
          dated: entry.dated?,
          source_type: 'ds-csv',
          cataloging_convention: extractor.extract_cataloging_convention(record),
          holding_institution_ds_qid: entry.institution_ds_qid,
          holding_institution_as_recorded: entry.institution_wikidata_label,
          holding_institution_id_number: entry.institutional_id,
          holding_institution_shelfmark: entry.call_number,
          link_to_holding_institution_record: entry.link_to_institutional_record,
          iiif_manifest: entry.iiif_manifest_url,
          production_date: production_date,
          century: century,
          century_aat: DS.transform_centuries_to_aat(century),
          production_date_as_recorded: extractor.extract_production_date_as_recorded(record).join('|'),
          physical_description: extractor.extract_physical_description(record).join('|'),
          note: extractor.extract_notes(record).join('|'),
          data_processed_at: timestamp,
          data_source_modified: entry.record_last_updated,
          source_file: entry.filename,
          acknowledgments: extractor.extract_acknowledgments(record).join('|')
        }

        row.update build_term_maps extractor, record
      end

      # Load the CSV named by +entry+ and return the first row matching
      # the entry's institutional ID.
      #
      # @param entry [DS::Manifest::Entry] the entry representing one row in a manifest
      # @return [CSV::Row] the located source record
      def extract_record entry
        locator = DS::Extractor::CsvRecordLocator.new
        csv = source.load_source File.join(source_dir, entry.filename)
        locator.locate_record(csv, entry.institutional_id, entry.institutional_id_location_in_source).first
      end
    end
  end
end
|