ds-convert 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +294 -0
- data/Rakefile +12 -0
- data/config/settings.yml +150 -0
- data/exe/ds-convert +149 -0
- data/exe/ds-recon +275 -0
- data/exe/ds-validate-csv +40 -0
- data/exe/marc-mrc-to-xml.rb +80 -0
- data/lib/ds/cli.rb +102 -0
- data/lib/ds/constants.rb +166 -0
- data/lib/ds/converter/converter.rb +124 -0
- data/lib/ds/converter/writer.rb +50 -0
- data/lib/ds/converter.rb +7 -0
- data/lib/ds/csv_util.rb +43 -0
- data/lib/ds/data/berkeley-arks.txt +4000 -0
- data/lib/ds/data/getty-aat-centuries.csv +71 -0
- data/lib/ds/data/iiif_manifests.csv +122 -0
- data/lib/ds/data/legacy-iiif-manifests.csv +77 -0
- data/lib/ds/ds_error.rb +1 -0
- data/lib/ds/extractor/base_record_locator.rb +24 -0
- data/lib/ds/extractor/base_term.rb +79 -0
- data/lib/ds/extractor/csv_record_locator.rb +13 -0
- data/lib/ds/extractor/ds_csv_extractor.rb +695 -0
- data/lib/ds/extractor/ds_mets_xml_extractor.rb +1114 -0
- data/lib/ds/extractor/genre.rb +45 -0
- data/lib/ds/extractor/language.rb +31 -0
- data/lib/ds/extractor/marc_xml_extractor.rb +1172 -0
- data/lib/ds/extractor/material.rb +12 -0
- data/lib/ds/extractor/name.rb +50 -0
- data/lib/ds/extractor/place.rb +11 -0
- data/lib/ds/extractor/subject.rb +58 -0
- data/lib/ds/extractor/tei_xml_extractor.rb +687 -0
- data/lib/ds/extractor/title.rb +52 -0
- data/lib/ds/extractor/xml_record_locator.rb +38 -0
- data/lib/ds/extractor.rb +24 -0
- data/lib/ds/institutions.rb +55 -0
- data/lib/ds/manifest/base_id_validator.rb +76 -0
- data/lib/ds/manifest/constants.rb +67 -0
- data/lib/ds/manifest/ds_csv_id_validator.rb +15 -0
- data/lib/ds/manifest/entry.rb +133 -0
- data/lib/ds/manifest/manifest.rb +74 -0
- data/lib/ds/manifest/manifest_validator.rb +256 -0
- data/lib/ds/manifest/simple_xml_id_validator.rb +42 -0
- data/lib/ds/manifest.rb +30 -0
- data/lib/ds/mapper/base_mapper.rb +221 -0
- data/lib/ds/mapper/ds_csv_mapper.rb +77 -0
- data/lib/ds/mapper/ds_mets_mapper.rb +85 -0
- data/lib/ds/mapper/marc_mapper.rb +87 -0
- data/lib/ds/mapper/tei_xml_mapper.rb +79 -0
- data/lib/ds/mapper.rb +13 -0
- data/lib/ds/recon/constants.rb +56 -0
- data/lib/ds/recon/ds_csv_enumerator.rb +16 -0
- data/lib/ds/recon/ds_mets_xml_enumerator.rb +14 -0
- data/lib/ds/recon/marc_xml_enumerator.rb +15 -0
- data/lib/ds/recon/recon_builder.rb +183 -0
- data/lib/ds/recon/recon_data.rb +37 -0
- data/lib/ds/recon/recon_manager.rb +92 -0
- data/lib/ds/recon/source_enumerator.rb +21 -0
- data/lib/ds/recon/tei_xml_enumerator.rb +14 -0
- data/lib/ds/recon/type/all_subjects.rb +18 -0
- data/lib/ds/recon/type/genres.rb +50 -0
- data/lib/ds/recon/type/languages.rb +38 -0
- data/lib/ds/recon/type/materials.rb +40 -0
- data/lib/ds/recon/type/named_subjects.rb +20 -0
- data/lib/ds/recon/type/names.rb +65 -0
- data/lib/ds/recon/type/places.rb +40 -0
- data/lib/ds/recon/type/recon_type.rb +136 -0
- data/lib/ds/recon/type/splits.rb +34 -0
- data/lib/ds/recon/type/subjects.rb +65 -0
- data/lib/ds/recon/type/titles.rb +38 -0
- data/lib/ds/recon/url_lookup.rb +52 -0
- data/lib/ds/recon.rb +292 -0
- data/lib/ds/source/base_source.rb +32 -0
- data/lib/ds/source/ds_csv.rb +18 -0
- data/lib/ds/source/ds_mets_xml.rb +20 -0
- data/lib/ds/source/marc_xml.rb +22 -0
- data/lib/ds/source/source_cache.rb +69 -0
- data/lib/ds/source/tei_xml.rb +22 -0
- data/lib/ds/source.rb +20 -0
- data/lib/ds/util/cache.rb +111 -0
- data/lib/ds/util/csv_validator.rb +209 -0
- data/lib/ds/util/csv_writer.rb +42 -0
- data/lib/ds/util/strings.rb +194 -0
- data/lib/ds/util.rb +37 -0
- data/lib/ds/version.rb +5 -0
- data/lib/ds.rb +237 -0
- metadata +246 -0
data/lib/ds/recon.rb
ADDED
@@ -0,0 +1,292 @@
|
|
1
|
+
require_relative 'util'
|
2
|
+
require_relative 'recon/source_enumerator'
|
3
|
+
require_relative 'recon/ds_csv_enumerator'
|
4
|
+
require_relative 'recon/marc_xml_enumerator'
|
5
|
+
require_relative 'recon/tei_xml_enumerator'
|
6
|
+
require_relative 'recon/ds_mets_xml_enumerator'
|
7
|
+
require_relative 'recon/url_lookup'
|
8
|
+
require_relative 'recon/type/recon_type'
|
9
|
+
require_relative 'recon/type/genres'
|
10
|
+
require_relative 'recon/type/languages'
|
11
|
+
require_relative 'recon/type/materials'
|
12
|
+
require_relative 'recon/type/names'
|
13
|
+
require_relative 'recon/type/places'
|
14
|
+
require_relative 'recon/type/subjects'
|
15
|
+
require_relative 'recon/type/splits'
|
16
|
+
require_relative 'recon/type/named_subjects'
|
17
|
+
require_relative 'recon/type/all_subjects'
|
18
|
+
require_relative 'recon/type/titles'
|
19
|
+
require_relative 'recon/recon_builder'
|
20
|
+
require_relative 'recon/recon_manager'
|
21
|
+
require_relative 'constants'
|
22
|
+
require 'logger'
|
23
|
+
require 'csv'
|
24
|
+
require 'ostruct'
|
25
|
+
|
26
|
+
# The DS Recon module contains classes and methods for working with DS
|
27
|
+
# recon data dictionaries.
|
28
|
+
#
|
29
|
+
# The classes in this module manage and support all the following:
|
30
|
+
#
|
31
|
+
# - The loading of recon data dictionary CSV files for recon lookups
|
32
|
+
# - The generation of recon CSV files from import sources
|
33
|
+
# - The addition of recon data to import CSVs
|
34
|
+
#
|
35
|
+
# The key modules and classes in the Recon module are:
|
36
|
+
#
|
37
|
+
# - {Recon} -- validation and loading of recon data dictionary CSVs; data dictionary lookups; retrieval and updates of the DS data git repository, which includes the data dictionary CSVs
|
38
|
+
# - {Recon::Type} -- recon type configurations used for lookups, extractions, and column mappings
|
39
|
+
# - {Recon::ReconManager} -- the main interface for the Recon module; used to build and write recon CSVs
|
40
|
+
# - {Recon::ReconBuilder} -- used by the Recon::ReconManager to build recon values hashes by extracting DS::Extractor::BaseTerm instances from source records and performing lookups
|
41
|
+
# - {Recon::SourceEnumerator} instances -- used by Recon::ReconBuilder to iterate over source records
|
42
|
+
#
|
43
|
+
# @example
|
44
|
+
# require 'ds'
|
45
|
+
# # write the places.csv file for a set of MARC XML files
|
46
|
+
# files = Dir['source/files/*.xml']
|
47
|
+
# recon_manager = Recon::ReconManager.new(
|
48
|
+
# source_type: 'marc-xml',
|
49
|
+
# out_dir: 'path/to/dir',
|
50
|
+
# files: files
|
51
|
+
# )
|
52
|
+
# recon_type = Recon.find_recon_type :places
|
53
|
+
# recon_manager.write_csv recon_type # => 'path/to/dir/places.csv'
|
54
|
+
#
|
55
|
+
#
|
56
|
+
module Recon
|
57
|
+
|
58
|
+
ERROR_UNBALANCED_SUBFIELDS = 'Row has unmatched subfields'
|
59
|
+
ERROR_BLANK_SUBFIELDS = 'Row has blank subfields'
|
60
|
+
ERROR_MISSING_REQUIRED_COLUMNS = "CSV is missing required column(s)"
|
61
|
+
ERROR_CSV_FILE_NOT_FOUND = 'Recon CSV file cannot be found'
|
62
|
+
|
63
|
+
|
64
|
+
# Names of the recon sets, as symbols.
#
# The list is frozen so that the shared constant cannot be modified in place.
RECON_SETS = %i[
  genres
  languages
  materials
  named-subjects
  names
  places
  subjects
  titles
].freeze
|
74
|
+
|
75
|
+
|
76
|
+
# Maps each recon set name (as a Symbol) to the Recon::Type class used for it.
# The hash is frozen.
RECON_TYPES_MAP = {
|
77
|
+
:genres => Recon::Type::Genres,
|
78
|
+
:languages => Recon::Type::Languages,
|
79
|
+
:materials => Recon::Type::Materials,
|
80
|
+
# NOTE(review): 'recon/type/all_subjects' is required by this file, but
# :'all-subjects' is mapped to Recon::Type::Subjects here -- confirm whether
# Recon::Type::AllSubjects was intended.
:'all-subjects' => Recon::Type::Subjects,
|
81
|
+
:'named-subjects' => Recon::Type::NamedSubjects,
|
82
|
+
:names => Recon::Type::Names,
|
83
|
+
:places => Recon::Type::Places,
|
84
|
+
:subjects => Recon::Type::Subjects,
|
85
|
+
:titles => Recon::Type::Titles,
|
86
|
+
:splits => Recon::Type::Splits
|
87
|
+
}.freeze
|
88
|
+
|
89
|
+
RECON_TYPES = RECON_TYPES_MAP.values.freeze
|
90
|
+
|
91
|
+
RECON_VALIDATION_SETS = RECON_TYPES.map(&:set_name).freeze
|
92
|
+
|
93
|
+
|
94
|
+
# Look up a single column value in the recon data dictionary for +set_name+.
#
# The lookup key is built from +key_values+ with {Recon.build_key}. When the
# dictionary has no entry for that key, one more attempt is made using the
# alternate key produced by {Recon.build_alt_key}.
#
# @param set_name [String] the name of the set to look up
# @param key_values [Array<String>] the lookup key values
# @param column [Symbol] the column value to retrieve
# @return [Object, nil] the value found in the specified column, or nil if not found
def self.lookup_single set_name, key_values:, column:
  terms      = find_set set_name
  lookup_key = build_key key_values

  # no exact match: fall back to the key built from "cleaned" strings
  lookup_key = build_alt_key lookup_key unless terms.include? lookup_key

  terms.dig lookup_key, column
end
|
110
|
+
|
111
|
+
# Return the recon data dictionary for the given set name.
#
# The dictionary is loaded with {Recon.load_set} the first time it is
# requested and kept in +@@reconciliations+ for later calls.
#
# @param set_name [String] the name of the set
# @return [Hash<String, Struct>] the set of terms, with lookup keys as keys
#   and term structs as values
def self.find_set set_name
  @@reconciliations ||= {}

  already_loaded = @@reconciliations[set_name]
  return already_loaded if already_loaded

  @@reconciliations[set_name] = load_set set_name
end
|
119
|
+
|
120
|
+
# Path to the local DS data git repository, built by joining the
# +local_dir+ and +git_local_name+ values of +Settings.recon+.
#
# @return [String] the path to the DS data git repository
def self.git_repo
  parent_dir = Settings.recon.local_dir
  repo_name  = Settings.recon.git_local_name
  File.join parent_dir, repo_name
end
|
126
|
+
|
127
|
+
# Finds the entry in +Settings.recon.sets+ whose +name+ equals the given name.
#
# @param name [String] the name of the set
# @return [Object] the configuration entry for the set name
# @raise [DSError] if no entry with that name is configured
def self.find_set_config name
  matching = Settings.recon.sets.find { |candidate| candidate.name == name }
  return matching if matching

  raise DSError, "Unknown set name: #{name.inspect}"
end
|
137
|
+
|
138
|
+
# Finds the recon type registered in RECON_TYPES_MAP for the given set name.
#
# @param set_name [String, Symbol] the name of the set; converted with +to_sym+
# @return [Object] the RECON_TYPES_MAP entry for the set name
# @raise [RuntimeError] if the set name is not a key of RECON_TYPES_MAP
def self.find_recon_type set_name
  RECON_TYPES_MAP.fetch(set_name.to_sym) do
    raise "Unknown recon set_name: #{set_name.inspect}"
  end
end
|
147
|
+
|
148
|
+
# Returns the paths of the CSV files configured for the given set name.
#
# Each configured +repo_path+ is joined onto {Recon.git_repo}.
#
# @param set_name [String] the name of the set
# @return [Array<String>] an array of paths to the CSV files
# @raise [DSError] if the set name is not configured (see {Recon.find_set_config})
def self.csv_files set_name
  configured = find_set_config(set_name)['repo_path']

  # repo_path may be one path or a list of paths; work with a flat array
  [configured].flatten.map do |relative_path|
    File.join Recon.git_repo, relative_path
  end
end
|
157
|
+
|
158
|
+
# Load and return the reconciliation data dictionary for the given set name.
#
# Every CSV file configured for the set is validated and then read into one
# hash. After all files are read, alternate ("cleaned") keys are added.
#
# The hash keys are the concatenated key values for the reconciliation type
# (e.g., {Recon::Type::Names.get_key_values})
#
# @param set_name [String] the name of the set
# @return [Hash<String, Struct>] the reconciliation data
# @raise [DSError] if the set name is not configured or a CSV file fails validation
# @raise [RuntimeError] if no recon type is mapped to the set name
def self.load_set set_name
  # Called for its check only: find_set_config raises for an unknown set
  # name, so no separate nil test is needed afterwards.
  find_set_config set_name
  recon_type = find_recon_type set_name

  data = {}
  csv_files(set_name).each do |csv_file|
    validate! set_name, csv_file
    read_csv csv_file: csv_file, recon_type: recon_type, data: data
  end

  add_alt_keys data
  data
end
|
185
|
+
|
186
|
+
# Read one CSV file into the +data+ hash and return the hash.
#
# Rows for which +recon_type.lookup_values+ is blank are skipped. Every other
# row is stored as an OpenStruct under the key built from
# +recon_type.get_key_values+; a later row with the same key replaces the
# earlier one.
#
# @param csv_file [String] the path to the CSV file
# @param recon_type [Recon::Type::ReconType] the reconciliation type
# @param data [Hash<String, Struct>] the reconciliation data; modified in place
# @return [Hash<String, Struct>] the updated reconciliation data
def self.read_csv csv_file:, recon_type:, data:
  CSV.foreach(csv_file, headers: true) do |csv_row|
    attributes = csv_row.to_h.symbolize_keys
    next if recon_type.lookup_values(attributes).blank?

    entry = OpenStruct.new(attributes.to_h)
    data[build_key(recon_type.get_key_values(attributes))] = entry
  end

  data
end
|
203
|
+
|
204
|
+
# Validate an input or output recon CSV file for the given set name.
#
# Each row is passed to {Recon.validate_row}, which checks required column
# headers and values and balanced columns. Rows are numbered from 1.
#
# The return value has three forms:
#
# - +nil+ when +set_name+ is not in RECON_VALIDATION_SETS (nothing is checked)
# - a String message when +csv_file+ does not exist
# - otherwise an Array holding the {Recon.validate_row} result of every row
#   whose result is present
#
# @param set_name [String] the name of the set
# @param csv_file [String] the path to the CSV file
# @return [Array, String, nil] the errors found; see above
# @raise [DSError] if {Recon.validate_row} raises for a row
def self.validate set_name, csv_file
  return unless RECON_VALIDATION_SETS.include? set_name

  unless File.exist? csv_file
    return "#{ERROR_CSV_FILE_NOT_FOUND}: '#{csv_file}'"
  end

  recon_type = Recon.find_recon_type set_name
  rows = CSV.readlines(csv_file, headers: true).map(&:to_h)

  rows.each_with_index.filter_map do |row, index|
    row.symbolize_keys!
    row_errors = validate_row recon_type, row, index + 1
    row_errors if row_errors.present?
  end
end
|
227
|
+
|
228
|
+
# Run {Recon.validate} on the input or output recon CSV file and raise if it
# reports anything.
#
# @param set_name [String] the name of the set
# @param csv_file [String] the path to the CSV file
# @return [void]
# @raise [DSError] if {Recon.validate} returns a present value
def self.validate! set_name, csv_file
  problems = validate set_name, csv_file

  if problems.present?
    raise DSError, "Error validating #{set_name} recon CSV #{csv_file}:\n#{problems}"
  end
end
|
241
|
+
|
242
|
+
# Validate one row of a CSV file.
#
# Required columns are checked first; any error there is raised immediately.
# Otherwise the result of the balanced-columns check is returned.
#
# @param recon_type [Recon::Type::ReconType] the reconciliation type
# @param row [Hash] the row of data
# @param row_num [Integer] the row number used in error messages
# @return [Object] the result of DS::Util::CsvValidator.validate_balanced_columns
# @raise [DSError] if the required-columns check reports errors
def self.validate_row recon_type, row, row_num
  missing = DS::Util::CsvValidator.validate_required_columns(
    row, required_columns: recon_type.recon_csv_headers, row_num: row_num
  )
  raise DSError, missing.join("\n") if missing.present?

  DS::Util::CsvValidator.validate_balanced_columns(
    row, balanced_columns: recon_type.balanced_columns, row_num: row_num
  )
end
|
256
|
+
|
257
|
+
# Add alternate keys to the recon data hash.
#
# For every key present when the method is called, the alternate key from
# {Recon.build_alt_key} is computed. If +data+ has no entry for it yet, the
# alternate key is added pointing at the same value as the original key.
#
# @param data [Hash] the reconciliation data; modified in place
# @return [Array] the keys that were in +data+ when the method was called
def self.add_alt_keys data
  data.keys.each do |original_key|
    cleaned_key = build_alt_key original_key
    data[cleaned_key] = data[original_key] unless data.include? cleaned_key
  end
end
|
270
|
+
|
271
|
+
# Build the alternate form of a lookup key.
#
# The key is split on '$$', DS::Util.clean_string (with an empty terminator)
# is applied to each part, and the parts are joined with '$$' again.
#
# @param key [String] the key to build the alt key from
# @return [String] the built alt key
def self.build_alt_key key
  cleaned_parts = key.split('$$').map do |part|
    DS::Util.clean_string part, terminator: ''
  end
  cleaned_parts.join '$$'
end
|
282
|
+
|
283
|
+
# Build a lookup key from +values+.
#
# Values that are not present are dropped, the rest are joined with '$$' and
# lowercased, and the result is passed through DS::Util.unicode_normalize.
#
# @param values [Array<String>] the values to be included in the key
# @return [String] the built key
def self.build_key values
  present_values = values.select(&:present?)
  joined = present_values.join('$$').downcase
  DS::Util.unicode_normalize joined
end
|
292
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DS
|
4
|
+
module Source
|
5
|
+
##
|
6
|
+
# Encapsulates methods for caching and opening source files.
|
7
|
+
#
|
8
|
+
# This class includes the DS::Source::SourceCache module, but does
|
9
|
+
# not implement the +open_source+ method. Concrete subclasses of
|
10
|
+
# {DS::Source::BaseSource} must implement +open_source+.
|
11
|
+
#
|
12
|
+
class BaseSource
|
13
|
+
include DS::Source::SourceCache
|
14
|
+
|
15
|
+
# The value of the +TYPE+ constant resolved through the receiver's class.
#
# @return [Object] the +TYPE+ constant of the concrete class
def source_type
  concrete_class = self.class
  concrete_class::TYPE
end
|
18
|
+
|
19
|
+
# Load the source at the given path by delegating to +find_or_open_source+.
#
# @param source_path [String] The path to the source file.
# @return [Object] whatever +find_or_open_source+ returns for the path
def load_source source_path
  find_or_open_source(source_path)
end
|
26
|
+
|
27
|
+
# A short description of this source: the class name followed by the
# source type.
#
# @return [String] "<class name>: source_type <source type>"
def to_s
  # Ruby objects have no +class_name+ method; the name comes from the class.
  "#{self.class.name}: source_type #{source_type}"
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DS
|
4
|
+
module Source
|
5
|
+
# Source class for DS CSV files.
class DSCSV < BaseSource

  TYPE = DS::Constants::DS_CSV

  # Open the CSV file at +source_file_path+ for reading, with headers enabled.
  #
  # NOTE(review): CSV.open is called without a block, so the returned CSV
  # object is left open; nothing in this class closes it.
  #
  # @param source_file_path [String] The path to the CSV file.
  # @return [CSV] A CSV object for the opened file.
  def open_source source_file_path
    csv_options = { headers: true }
    CSV.open source_file_path, 'r', **csv_options
  end
end
|
17
|
+
end
|
18
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DS
|
4
|
+
module Source
|
5
|
+
# Source class for DS METS XML files.
class DSMetsXML < BaseSource

  TYPE = DS::Constants::DS_METS

  # Parse the METS XML file at the given path with Nokogiri.
  #
  # Namespaces are *not* removed from the document.
  #
  # @param source_file_path [String] the path to the source file
  # @return [Nokogiri::XML::Document] the parsed contents of the source file
  def open_source source_file_path
    File.open(source_file_path) do |file|
      Nokogiri::XML(file)
    end
  end
end
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DS
|
4
|
+
module Source
|
5
|
+
# Source class for MARC XML files.
class MarcXML < BaseSource

  TYPE = DS::Constants::MARC_XML

  # Parse the MARC XML file at the given path with Nokogiri.
  #
  # NB: Namespaces are stripped from the document before it is returned.
  #
  # @param source_file_path [String] the path to the source file
  # @return [Nokogiri::XML::Document] the parsed document, without namespaces
  def open_source source_file_path
    File.open(source_file_path) { |file| Nokogiri::XML(file) }
        .tap(&:remove_namespaces!)
  end
end
|
21
|
+
end
|
22
|
+
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DS
|
4
|
+
module Source
|
5
|
+
|
6
|
+
##
|
7
|
+
# This module provides methods for caching and opening source files.
|
8
|
+
# It is used by the DS::Mapper::BaseMapper class.
|
9
|
+
#
|
10
|
+
# It makes available a +#find_or_open_source+ method that can be used
|
11
|
+
# by the including class to open or retrieve a parsed source file
|
12
|
+
# from the cache.
|
13
|
+
#
|
14
|
+
# Including classes must implement the +open_source+ method.
|
15
|
+
#
|
16
|
+
# The file +path+ is used as the cache key.
|
17
|
+
#
|
18
|
+
# The initial cache size is the value of DS::Util::Cache::DEFAULT_MAX_SIZE.
|
19
|
+
#
|
20
|
+
# Cache max size can be set and retrieved using the +max_cache_size+ and +max_cache_size=+ methods.
|
21
|
+
module SourceCache

  # Return the source for +source_path+, opening and caching it on first use.
  #
  # If the cache already holds the path, the cached item is returned.
  # Otherwise +open_source+ is called and its result is added to the cache.
  #
  # @param source_path [String] the path to the source file
  # @return [Object] the contents of the source file
  def find_or_open_source source_path
    if cache.include? source_path
      cache.get_item source_path
    else
      open_source(source_path).tap { |opened| cache.add source_path, opened }
    end
  end

  # Placeholder for the method that actually opens a source file.
  #
  # @param source_path [String] the path to the source file
  # @return [Object] the contents of the source file
  # @raise [NotImplementedError] unless implemented by including class
  def open_source source_path
    raise NotImplementedError
  end

  # The cache used for opened sources, created on first access and kept in
  # the +@cache+ instance variable.
  #
  # @return [DS::Util::Cache] the cache object
  def cache
    @cache ||= DS::Util::Cache.new
  end

  # Set the cache's maximum size.
  #
  # @param size [Integer] the maximum number of items to store in the cache
  # @return [void]
  def max_cache_size= size
    cache.max_size = size
  end

  # The cache's maximum size.
  #
  # @return [Integer] the maximum number of items to store in the cache
  def max_cache_size
    cache.max_size
  end
end
|
68
|
+
end
|
69
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DS
|
4
|
+
module Source
|
5
|
+
# Source class for TEI XML files.
class TeiXML < BaseSource

  TYPE = DS::Constants::TEI_XML

  # Parse the TEI XML file at the given path with Nokogiri.
  #
  # NB: Namespaces are stripped from the document before it is returned.
  #
  # @param source_file_path [String] the path to the source file
  # @return [Nokogiri::XML::Document] the parsed document, without namespaces
  def open_source source_file_path
    File.open(source_file_path) { |file| Nokogiri::XML(file) }
        .tap(&:remove_namespaces!)
  end
end
|
21
|
+
end
|
22
|
+
end
|
data/lib/ds/source.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'source/source_cache'
|
4
|
+
require_relative 'source/base_source'
|
5
|
+
require_relative 'source/marc_xml'
|
6
|
+
require_relative 'source/tei_xml'
|
7
|
+
require_relative 'source/ds_mets_xml'
|
8
|
+
require_relative 'source/ds_csv'
|
9
|
+
|
10
|
+
module DS
|
11
|
+
# DS Source module classes encapsulate the loading of source files.
|
12
|
+
# They are used by DS::Mapper classes and DS::Manifest id validator
|
13
|
+
# classes.
|
14
|
+
#
|
15
|
+
# A primary function of the DS::Source classes is to manage
|
16
|
+
# caching of source files, which may be expensive to load and parse; e.g.,
|
17
|
+
# MARC XML or CSV files with a large number of records.
|
18
|
+
module Source
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,111 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DS
|
4
|
+
module Util
|
5
|
+
# A small key/value cache with an optional maximum size.
#
# Items are stored in a Hash, which keeps insertion order. When the cache
# holds more than +max_size+ items, the oldest entries are removed first.
# Adding a key that is already present moves it to the newest position.
class Cache
  DEFAULT_MAX_SIZE = 10
  UNLIMITED_SIZE = Float::INFINITY

  attr_accessor :max_size
  attr_reader :items

  # Creates an empty cache.
  #
  # @param max_size [Integer, Float] (DEFAULT_MAX_SIZE) the maximum number of
  #   items to keep; use UNLIMITED_SIZE for no limit
  # @return [void]
  def initialize max_size: DEFAULT_MAX_SIZE
    @max_size = max_size
    @items = {}
  end

  # Returns the item for +key+, adding +item+ first when needed.
  #
  # In an unlimited cache an existing entry is left untouched and returned.
  # In every other case +item+ is written with {#add}, so a size-limited
  # cache stores +item+ under +key+ as its newest entry.
  #
  # @param key [Object] the key used to identify the item in the cache
  # @param item [Object] the item to be added to the cache
  # @return [Object] the item stored under +key+ after the call
  def get_or_add key, item
    # Parentheses matter: without them Ruby parses the condition as
    # +include?(key && unlimited?)+.
    add(key, item) unless include?(key) && unlimited?
    get_item key
  end

  # Stores +item+ under +key+ as the newest entry, replacing any existing
  # entry for the key, then evicts old entries if the cache is over size.
  #
  # @param key [Object] the key used to identify the item in the cache
  # @param item [Object] the item to be added to the cache
  # @return [Object] the item that was added to the cache
  def add key, item
    delete_item key # so that re-insertion moves the key to the end
    items[key] = item
    cleanup
    item
  end

  # Checks if the given key is present in the cache.
  #
  # Uses the same Hash key lookup as {#get_item}, so the two always agree.
  #
  # @param key [Object] the key to check for in the cache
  # @return [Boolean] true if the key is present in the cache, false otherwise
  def include? key
    items.key? key
  end

  # Checks if the cache is unlimited.
  #
  # @return [Boolean] true if +max_size+ equals UNLIMITED_SIZE
  def unlimited?
    max_size == UNLIMITED_SIZE
  end

  # Retrieves an item from the cache using the specified key.
  #
  # @param key [Object] The key used to identify the item in the cache.
  # @return [Object] The item for the key, or nil if the key is not present.
  def get_item key
    items[key]
  end

  # Alias-style accessor for {#get_item}.
  #
  # @param key [Object] The key used to identify the item in the cache.
  # @return [Object] The item for the key, or nil if the key is not present.
  def [](key)
    get_item key
  end

  # Returns an array of all the keys in the cache, oldest first.
  #
  # @return [Array<Object>] An array of keys.
  def keys
    items.keys
  end

  # Returns the number of items in the cache.
  #
  # @return [Integer] The number of items in the cache.
  def size
    items.size
  end

  # Deletes an item from the cache using the specified key.
  #
  # @param key [Object] The key used to identify the item in the cache.
  # @return [Object, nil] the deleted item, or nil if the key was not present
  def delete_item key
    items.delete key
  end

  # Removes the oldest items until the size is no larger than +max_size+.
  #
  # The loop also stops once the cache is empty, so a negative +max_size+
  # cannot cause an endless loop.
  #
  # @return [void]
  def cleanup
    delete_item items.keys.first while size > max_size && !items.empty?
  end
end
|
110
|
+
end
|
111
|
+
end
|