ds-convert 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +294 -0
  3. data/Rakefile +12 -0
  4. data/config/settings.yml +150 -0
  5. data/exe/ds-convert +149 -0
  6. data/exe/ds-recon +275 -0
  7. data/exe/ds-validate-csv +40 -0
  8. data/exe/marc-mrc-to-xml.rb +80 -0
  9. data/lib/ds/cli.rb +102 -0
  10. data/lib/ds/constants.rb +166 -0
  11. data/lib/ds/converter/converter.rb +124 -0
  12. data/lib/ds/converter/writer.rb +50 -0
  13. data/lib/ds/converter.rb +7 -0
  14. data/lib/ds/csv_util.rb +43 -0
  15. data/lib/ds/data/berkeley-arks.txt +4000 -0
  16. data/lib/ds/data/getty-aat-centuries.csv +71 -0
  17. data/lib/ds/data/iiif_manifests.csv +122 -0
  18. data/lib/ds/data/legacy-iiif-manifests.csv +77 -0
  19. data/lib/ds/ds_error.rb +1 -0
  20. data/lib/ds/extractor/base_record_locator.rb +24 -0
  21. data/lib/ds/extractor/base_term.rb +79 -0
  22. data/lib/ds/extractor/csv_record_locator.rb +13 -0
  23. data/lib/ds/extractor/ds_csv_extractor.rb +695 -0
  24. data/lib/ds/extractor/ds_mets_xml_extractor.rb +1114 -0
  25. data/lib/ds/extractor/genre.rb +45 -0
  26. data/lib/ds/extractor/language.rb +31 -0
  27. data/lib/ds/extractor/marc_xml_extractor.rb +1172 -0
  28. data/lib/ds/extractor/material.rb +12 -0
  29. data/lib/ds/extractor/name.rb +50 -0
  30. data/lib/ds/extractor/place.rb +11 -0
  31. data/lib/ds/extractor/subject.rb +58 -0
  32. data/lib/ds/extractor/tei_xml_extractor.rb +687 -0
  33. data/lib/ds/extractor/title.rb +52 -0
  34. data/lib/ds/extractor/xml_record_locator.rb +38 -0
  35. data/lib/ds/extractor.rb +24 -0
  36. data/lib/ds/institutions.rb +55 -0
  37. data/lib/ds/manifest/base_id_validator.rb +76 -0
  38. data/lib/ds/manifest/constants.rb +67 -0
  39. data/lib/ds/manifest/ds_csv_id_validator.rb +15 -0
  40. data/lib/ds/manifest/entry.rb +133 -0
  41. data/lib/ds/manifest/manifest.rb +74 -0
  42. data/lib/ds/manifest/manifest_validator.rb +256 -0
  43. data/lib/ds/manifest/simple_xml_id_validator.rb +42 -0
  44. data/lib/ds/manifest.rb +30 -0
  45. data/lib/ds/mapper/base_mapper.rb +221 -0
  46. data/lib/ds/mapper/ds_csv_mapper.rb +77 -0
  47. data/lib/ds/mapper/ds_mets_mapper.rb +85 -0
  48. data/lib/ds/mapper/marc_mapper.rb +87 -0
  49. data/lib/ds/mapper/tei_xml_mapper.rb +79 -0
  50. data/lib/ds/mapper.rb +13 -0
  51. data/lib/ds/recon/constants.rb +56 -0
  52. data/lib/ds/recon/ds_csv_enumerator.rb +16 -0
  53. data/lib/ds/recon/ds_mets_xml_enumerator.rb +14 -0
  54. data/lib/ds/recon/marc_xml_enumerator.rb +15 -0
  55. data/lib/ds/recon/recon_builder.rb +183 -0
  56. data/lib/ds/recon/recon_data.rb +37 -0
  57. data/lib/ds/recon/recon_manager.rb +92 -0
  58. data/lib/ds/recon/source_enumerator.rb +21 -0
  59. data/lib/ds/recon/tei_xml_enumerator.rb +14 -0
  60. data/lib/ds/recon/type/all_subjects.rb +18 -0
  61. data/lib/ds/recon/type/genres.rb +50 -0
  62. data/lib/ds/recon/type/languages.rb +38 -0
  63. data/lib/ds/recon/type/materials.rb +40 -0
  64. data/lib/ds/recon/type/named_subjects.rb +20 -0
  65. data/lib/ds/recon/type/names.rb +65 -0
  66. data/lib/ds/recon/type/places.rb +40 -0
  67. data/lib/ds/recon/type/recon_type.rb +136 -0
  68. data/lib/ds/recon/type/splits.rb +34 -0
  69. data/lib/ds/recon/type/subjects.rb +65 -0
  70. data/lib/ds/recon/type/titles.rb +38 -0
  71. data/lib/ds/recon/url_lookup.rb +52 -0
  72. data/lib/ds/recon.rb +292 -0
  73. data/lib/ds/source/base_source.rb +32 -0
  74. data/lib/ds/source/ds_csv.rb +18 -0
  75. data/lib/ds/source/ds_mets_xml.rb +20 -0
  76. data/lib/ds/source/marc_xml.rb +22 -0
  77. data/lib/ds/source/source_cache.rb +69 -0
  78. data/lib/ds/source/tei_xml.rb +22 -0
  79. data/lib/ds/source.rb +20 -0
  80. data/lib/ds/util/cache.rb +111 -0
  81. data/lib/ds/util/csv_validator.rb +209 -0
  82. data/lib/ds/util/csv_writer.rb +42 -0
  83. data/lib/ds/util/strings.rb +194 -0
  84. data/lib/ds/util.rb +37 -0
  85. data/lib/ds/version.rb +5 -0
  86. data/lib/ds.rb +237 -0
  87. metadata +246 -0
data/lib/ds/recon.rb ADDED
@@ -0,0 +1,292 @@
1
+ require_relative 'util'
2
+ require_relative 'recon/source_enumerator'
3
+ require_relative 'recon/ds_csv_enumerator'
4
+ require_relative 'recon/marc_xml_enumerator'
5
+ require_relative 'recon/tei_xml_enumerator'
6
+ require_relative 'recon/ds_mets_xml_enumerator'
7
+ require_relative 'recon/url_lookup'
8
+ require_relative 'recon/type/recon_type'
9
+ require_relative 'recon/type/genres'
10
+ require_relative 'recon/type/languages'
11
+ require_relative 'recon/type/materials'
12
+ require_relative 'recon/type/names'
13
+ require_relative 'recon/type/places'
14
+ require_relative 'recon/type/subjects'
15
+ require_relative 'recon/type/splits'
16
+ require_relative 'recon/type/named_subjects'
17
+ require_relative 'recon/type/all_subjects'
18
+ require_relative 'recon/type/titles'
19
+ require_relative 'recon/recon_builder'
20
+ require_relative 'recon/recon_manager'
21
+ require_relative 'constants'
22
+ require 'logger'
23
+ require 'csv'
24
+ require 'ostruct'
25
+
26
+ # The DS Recon module contains classes and methods for working with DS
27
+ # recon data dictionaries.
28
+ #
29
+ # The classes in this module manage and support all the following:
30
+ #
31
+ # - The loading of recon data dictionary CSV files for recon lookups
32
+ # - The generation of recon CSV files from import sources
33
+ # - The addition of recon data to import CSVs
34
+ #
35
+ # The key modules and classes in the Recon module are:
36
+ #
37
+ # - {Recon} -- validation and loading of recon data dictionary CSVs; data dictionary lookups; retrieval and updates of the DS data git repository, which includes the data dictionary CSVs
38
+ # - {Recon::Type} -- recon type configurations used for lookups, extractions, and column mappings
39
+ # - {Recon::ReconManager} -- the main interface for the Recon module; used to build and write recon CSVs
40
+ # - {Recon::ReconBuilder} -- used by the Recon::ReconManager to build recon values hashes by extracting DS::Extractor::BaseTerm instances from source records and performing lookups
41
+ # - {Recon::SourceEnumerator} instances -- used by Recon::ReconBuilder to iterate over source records
42
+ #
43
+ # @example
44
+ # require 'ds'
45
+ # # write the places.csv file for a set of MARC XML files
46
+ # files = Dir['source/files/*.xml']
47
+ # recon_manager = Recon::ReconManager.new(
48
+ # source_type: 'marc-xml',
49
+ # out_dir: 'path/to/dir',
50
+ # files: files
51
+ # )
52
+ # recon_type = Recon.find_recon_type :places
53
+ # recon_manager.write_csv recon_type # => 'path/to/dir/places.csv'
54
+ #
55
+ #
56
+ module Recon
57
+
58
# Messages used when reporting problems with recon CSV files.
ERROR_UNBALANCED_SUBFIELDS     = 'Row has unmatched subfields'.freeze
ERROR_BLANK_SUBFIELDS          = 'Row has blank subfields'.freeze
ERROR_MISSING_REQUIRED_COLUMNS = 'CSV is missing required column(s)'.freeze
ERROR_CSV_FILE_NOT_FOUND       = 'Recon CSV file cannot be found'.freeze

# Names of the recon sets, as symbols.
RECON_SETS = %i[
  genres
  languages
  materials
  named-subjects
  names
  places
  subjects
  titles
].freeze

# Maps a recon set name (as a symbol) to its recon type class.
#
# NOTE(review): :'all-subjects' maps to Recon::Type::Subjects even though
# 'recon/type/all_subjects' is required by this file -- confirm that this
# is intentional. Because of it, Recon::Type::Subjects appears twice in
# RECON_TYPES and its set name appears twice in RECON_VALIDATION_SETS.
RECON_TYPES_MAP = {
  :genres           => Recon::Type::Genres,
  :languages        => Recon::Type::Languages,
  :materials        => Recon::Type::Materials,
  :'all-subjects'   => Recon::Type::Subjects,
  :'named-subjects' => Recon::Type::NamedSubjects,
  :names            => Recon::Type::Names,
  :places           => Recon::Type::Places,
  :subjects         => Recon::Type::Subjects,
  :titles           => Recon::Type::Titles,
  :splits           => Recon::Type::Splits
}.freeze

# The recon type classes, in RECON_TYPES_MAP order.
RECON_TYPES = RECON_TYPES_MAP.values.freeze

# The +set_name+ of every recon type; {Recon.validate} only validates
# CSVs whose set name is in this list.
RECON_VALIDATION_SETS = RECON_TYPES.map(&:set_name).freeze
92
+
93
+
94
# Look up a single column value in the recon data dictionary for
# +set_name+.
#
# The lookup key is built from +key_values+ with {Recon.build_key}. When
# the dictionary has no entry for that key, the lookup is retried with
# the "cleaned" key returned by {Recon.build_alt_key}.
#
# @param [String] set_name the name of the set to look up
# @param [Array<String>] key_values the lookup key values
# @param [Symbol] column the column value to retrieve
# @return [Object, nil] the value found in the specified column, or nil if not found
def self.lookup_single set_name, key_values:, column:
  terms     = find_set set_name
  exact_key = build_key key_values
  # fall back to the key built from "cleaned" strings when there is no exact match
  lookup_key = terms.include?(exact_key) ? exact_key : build_alt_key(exact_key)
  terms.dig lookup_key, column
end
110
+
111
# Return the recon data dictionary for the given set name, loading it
# with {Recon.load_set} on first use and memoizing it by +set_name+.
#
# @param set_name [String] the name of the set
# @return [Hash<String, OpenStruct>] the set of terms; lookup keys mapped to row structs
def self.find_set set_name
  @@reconciliations ||= {}
  cached = @@reconciliations[set_name]
  return cached if cached

  @@reconciliations[set_name] = load_set set_name
end
119
+
120
# The path to the DS data git repository: +Settings.recon.local_dir+
# joined with +Settings.recon.git_local_name+.
#
# @return [String] the path to the DS data git repository
def self.git_repo
  recon_settings = Settings.recon
  File.join recon_settings.local_dir, recon_settings.git_local_name
end
126
+
127
# Find the entry in +Settings.recon.sets+ whose +name+ equals the given
# set name.
#
# @param name [String] the name of the set
# @return [Object] the first matching entry of +Settings.recon.sets+
# @raise [DSError] if no configured set has that name
def self.find_set_config name
  Settings.recon.sets.each do |set_config|
    return set_config if set_config.name == name
  end

  raise DSError, "Unknown set name: #{name.inspect}"
end
137
+
138
# Find the recon type registered in RECON_TYPES_MAP for the given set
# name.
#
# @param set_name [String, Symbol] the name of the set; converted with +to_sym+
# @return [Recon::Type::ReconType] the recon type for the set name
# @raise [RuntimeError] if the set name is not a key of RECON_TYPES_MAP
def self.find_recon_type set_name
  RECON_TYPES_MAP.fetch(set_name.to_sym) do
    raise "Unknown recon set_name: #{set_name.inspect}"
  end
end
147
+
148
# Return the paths to the CSV files for the given set name. Each
# configured +repo_path+ is joined onto {Recon.git_repo}.
#
# @param set_name [String] the name of the set
# @return [Array<String>] an array of paths to the CSV files
# @raise [DSError] if the set name is not configured
def self.csv_files set_name
  configured = find_set_config(set_name)['repo_path']
  # repo_path may be a single path or a list of paths
  [configured].flatten.map do |relative_path|
    File.join Recon.git_repo, relative_path
  end
end
157
+
158
# Load and return the reconciliation data dictionary for the given set
# name.
#
# Every CSV file returned by {Recon.csv_files} is validated with
# {Recon.validate!} and then read with {Recon.read_csv} into a single
# hash. Finally {Recon.add_alt_keys} adds the "cleaned" alternate keys.
#
# The hash keys are the concatenated key values for the reconciliation
# type (e.g., {Recon::Type::Names.get_key_values}).
#
# @param set_name [String] the name of the set
# @return [Hash<String, OpenStruct>] the reconciliation data
# @raise [DSError] if the set name is not configured or a CSV file fails validation
# @raise [RuntimeError] if there is no recon type for the set name
def self.load_set set_name
  # Called only for its check: raises DSError for an unknown set name, so
  # no nil test on the result is needed. Kept ahead of find_recon_type so
  # that the DSError is still raised first.
  find_set_config set_name
  recon_type = find_recon_type set_name

  data = {}
  csv_files(set_name).each do |csv_file|
    validate! set_name, csv_file
    read_csv csv_file: csv_file, recon_type: recon_type, data: data
  end

  add_alt_keys data
  data
end
185
+
186
# Read one CSV file, add an entry to +data+ for each usable row and
# return +data+.
#
# Rows for which +recon_type.lookup_values+ is blank are skipped. Every
# other row is stored as an OpenStruct under the key built by
# {Recon.build_key} from +recon_type.get_key_values+; a later row with
# the same key replaces an earlier one.
#
# @param csv_file [String] the path to the CSV file
# @param recon_type [Recon::Type::ReconType] the reconciliation type
# @param data [Hash<String, OpenStruct>] the reconciliation data; modified in place
# @return [Hash<String, OpenStruct>] the updated reconciliation data
def self.read_csv csv_file:, recon_type:, data:
  CSV.foreach csv_file, headers: true do |csv_row|
    attributes = csv_row.to_h.symbolize_keys
    unless recon_type.lookup_values(attributes).blank?
      entry = OpenStruct.new attributes
      data[build_key(recon_type.get_key_values(attributes))] = entry
    end
  end
  data
end
203
+
204
# Validate an input or output recon CSV file for the given set name.
#
# Each row of the CSV file is passed to {Recon.validate_row} together
# with its 1-based row number. Required headers and balanced columns are
# defined by the recon type; see
# {Recon::Type::ReconType.recon_csv_headers} and
# {Recon::Type::ReconType.balanced_headers}.
#
# Note that the return value has three shapes:
#
# - +nil+ when +set_name+ is not in RECON_VALIDATION_SETS (no validation is done)
# - a String when +csv_file+ does not exist
# - otherwise an Array with the non-blank results of {Recon.validate_row}
#
# @param set_name [String] the name of the set
# @param csv_file [String] the path to the CSV file
# @return [Array, String, nil] the errors found; see above
# @raise [DSError] if a row is missing required columns (raised by {Recon.validate_row})
def self.validate set_name, csv_file
  return unless RECON_VALIDATION_SETS.include? set_name
  unless File.exist? csv_file
    return "#{ERROR_CSV_FILE_NOT_FOUND}: '#{csv_file}'"
  end

  recon_type = Recon.find_recon_type set_name
  rows = CSV.readlines(csv_file, headers: true).map(&:to_h)
  rows.each.with_index(1).filter_map do |row, row_num|
    row.symbolize_keys!
    row_error = validate_row recon_type, row, row_num
    row_error if row_error.present?
  end
end
227
+
228
# Run {Recon.validate} on the input or output recon CSV file and raise
# an exception if it reports anything.
#
# @param set_name [String] the name of the set
# @param csv_file [String] the path to the CSV file
# @return [void]
# @raise [DSError] if {Recon.validate} returns a non-blank result
def self.validate! set_name, csv_file
  problems = validate set_name, csv_file
  if problems.present?
    raise DSError, "Error validating #{set_name} recon CSV #{csv_file}:\n#{problems}"
  end
end
241
+
242
# Validate one row of a CSV file for required columns and for balanced
# columns.
#
# Missing required columns are treated as fatal and raise immediately;
# otherwise the result of the balanced-columns check is returned.
#
# @param recon_type [Recon::Type::ReconType] the reconciliation type
# @param row [Hash] the row of data
# @param row_num [Integer] the row number used in error messages
# @return [Object] the result of DS::Util::CsvValidator.validate_balanced_columns
#   (presumably error messages -- confirm against CsvValidator)
# @raise [DSError] if the required-columns check reports any errors
def self.validate_row recon_type, row, row_num
  missing = DS::Util::CsvValidator.validate_required_columns(
    row, required_columns: recon_type.recon_csv_headers, row_num: row_num
  )
  raise DSError, missing.join("\n") if missing.present?

  DS::Util::CsvValidator.validate_balanced_columns(
    row, balanced_columns: recon_type.balanced_columns, row_num: row_num
  )
end
256
+
257
# Add an alternate key to +data+ for every key it already has.
#
# For each existing key, the alternate key is built with
# {Recon.build_alt_key}. If +data+ has no entry for the alternate key,
# one is added that points at the same value as the original key.
# Existing entries are never overwritten.
#
# @param data [Hash<String, Object>] the reconciliation data; modified in place
# @return [Array<String>] the keys +data+ had before any alternate keys were added
def self.add_alt_keys data
  # iterate over a snapshot of the keys because entries are added in the loop
  data.keys.each do |original_key|
    cleaned_key = build_alt_key original_key
    data[cleaned_key] = data[original_key] unless data.include? cleaned_key
  end
end
270
+
271
# Build an alternate key from +key+: split it on '$$', pass each part
# through DS::Util.clean_string (with an empty terminator) and join the
# cleaned parts with '$$' again.
#
# @param key [String] the key to build the alt key from
# @return [String] the built alt key
def self.build_alt_key key
  cleaned_parts = key.split('$$').map do |part|
    DS::Util.clean_string part, terminator: ''
  end
  cleaned_parts.join '$$'
end
282
+
283
# Build a lookup key from +values+: the values that are +present?+ are
# joined with '$$', the result is lowercased and then passed through
# DS::Util.unicode_normalize.
#
# @param values [Array<String>] the values to be included in the key
# @return [String] the built key
def self.build_key values
  joined = values.select(&:present?).join('$$')
  DS::Util.unicode_normalize joined.downcase
end
292
+ end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DS
4
+ module Source
5
##
# Encapsulates methods for caching and opening source files.
#
# This class includes the DS::Source::SourceCache module, but does
# not implement the +open_source+ method. Concrete subclasses of
# {DS::Source::BaseSource} must implement +open_source+ and define a
# +TYPE+ constant, which {#source_type} reads.
#
class BaseSource
  include DS::Source::SourceCache

  # The source type of the concrete class.
  #
  # @return [Object] the value of the concrete class's +TYPE+ constant
  def source_type
    self.class::TYPE
  end

  # Loads the specified source path, using the cache provided by
  # DS::Source::SourceCache.
  #
  # @param source_path [String] The path to the source file.
  # @return [Object] The parsed source file; e.g, Nokogiri::XML::Document or CSV::Table
  def load_source source_path
    find_or_open_source source_path
  end

  # A short description of this source: its class name and source type.
  #
  # @return [String]
  def to_s
    # Object has no #class_name method; the class name is self.class.name
    "#{self.class.name}: source_type #{source_type}"
  end
end
31
+ end
32
+ end
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DS
4
+ module Source
5
# Source for DS CSV files.
class DSCSV < BaseSource
  TYPE = DS::Constants::DS_CSV

  # Opens the CSV file at +source_file_path+ for reading, with headers
  # enabled, and returns the CSV object.
  #
  # NOTE(review): the CSV object is returned open and is not closed
  # here -- confirm that callers close or rewind it as needed.
  #
  # @param source_file_path [String] The path to the CSV file.
  # @return [CSV] A CSV object representing the opened CSV file.
  def open_source(source_file_path)
    CSV.open source_file_path, 'r', headers: true
  end
end
17
+ end
18
+ end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DS
4
+ module Source
5
# Source for DS METS XML files.
class DSMetsXML < BaseSource
  TYPE = DS::Constants::DS_METS

  # Parses the METS XML file at the given path with Nokogiri.
  #
  # Namespaces are *not* removed from the document.
  #
  # @param source_file_path [String] the path to the source file
  # @return [Nokogiri::XML::Document] the contents of the source file as a Nokogiri::XML object
  def open_source(source_file_path)
    File.open(source_file_path) do |io|
      Nokogiri::XML(io)
    end
  end
end
19
+ end
20
+ end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DS
4
+ module Source
5
# Source for MARC XML files.
class MarcXML < BaseSource
  TYPE = DS::Constants::MARC_XML

  # Parses the MARC XML source file at the given path with Nokogiri.
  #
  # NB: Namespaces are stripped from the document.
  #
  # @param source_file_path [String] the path to the source file
  # @return [Nokogiri::XML::Document] the parsed document, without namespaces
  def open_source(source_file_path)
    File.open(source_file_path) do |io|
      Nokogiri::XML(io).tap(&:remove_namespaces!)
    end
  end
end
21
+ end
22
+ end
@@ -0,0 +1,69 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DS
4
+ module Source
5
+
6
##
# Mixin that gives the including object a cache of opened source files.
#
# +#find_or_open_source+ returns the cached source for a path or, when
# the path is not cached, calls +#open_source+, stores the result in the
# cache and returns it. The file path is the cache key.
#
# Including classes must implement +#open_source+; the version defined
# here raises NotImplementedError.
#
# The cache is a DS::Util::Cache created with its default arguments, so
# its initial maximum size is DS::Util::Cache::DEFAULT_MAX_SIZE. The
# maximum size can be read and changed with +#max_cache_size+ and
# +#max_cache_size=+.
module SourceCache

  # Returns the source for the given path, opening and caching it when
  # it is not in the cache yet.
  #
  # @param source_path [String] the path to the source file
  # @return [Object] the contents of the source file
  def find_or_open_source source_path
    if cache.include? source_path
      cache.get_item source_path
    else
      open_source(source_path).tap do |opened|
        cache.add source_path, opened
      end
    end
  end

  # Opens a source file at the given path. Must be overridden.
  #
  # @param source_path [String] the path to the source file
  # @return [Object] the contents of the source file
  # @raise [NotImplementedError] unless implemented by including class
  def open_source source_path
    raise NotImplementedError
  end

  # Returns the cache object, creating it on first use.
  #
  # @return [DS::Util::Cache] the cache object
  def cache
    @cache = DS::Util::Cache.new unless @cache
    @cache
  end

  # Sets the maximum cache size.
  #
  # @param size [Integer] the maximum number of items to store in the cache
  # @return [void]
  def max_cache_size= size
    cache.max_size = size
  end

  # Returns the maximum cache size.
  #
  # @return [Integer] the maximum number of items to store in the cache
  def max_cache_size
    cache.max_size
  end
end
68
+ end
69
+ end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DS
4
+ module Source
5
# Source for TEI XML files.
class TeiXML < BaseSource
  TYPE = DS::Constants::TEI_XML

  # Parses the TEI XML file at the given path with Nokogiri.
  #
  # NB: Namespaces are stripped from the document.
  #
  # @param source_file_path [String] the path to the source file
  # @return [Nokogiri::XML::Document] the parsed document, without namespaces
  def open_source(source_file_path)
    File.open(source_file_path) do |io|
      Nokogiri::XML(io).tap(&:remove_namespaces!)
    end
  end
end
21
+ end
22
+ end
data/lib/ds/source.rb ADDED
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'source/source_cache'
4
+ require_relative 'source/base_source'
5
+ require_relative 'source/marc_xml'
6
+ require_relative 'source/tei_xml'
7
+ require_relative 'source/ds_mets_xml'
8
+ require_relative 'source/ds_csv'
9
+
10
+ module DS
11
+ # The DS::Source classes encapsulate the loading of source files.
12
+ # They are used by DS::Mapper classes and DS::Manifest id validator
13
+ # classes.
14
+ #
15
+ # A primary function of the DS::Source classes is to manage the
16
+ # caching of source files, which may be expensive to load and parse; e.g.,
17
+ # MARC XML or CSV files with a large number of records.
18
+ module Source
19
+ end
20
+ end
@@ -0,0 +1,111 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DS
4
+ module Util
5
# A small key/value cache with an optional maximum size.
#
# Items are kept in a Hash, which preserves insertion order. When the
# number of items exceeds +max_size+, the items that were added (or
# re-added) least recently are removed first.
class Cache
  DEFAULT_MAX_SIZE = 10
  UNLIMITED_SIZE = Float::INFINITY

  attr_accessor :max_size
  attr_reader :items

  # Initializes a new, empty cache with the specified maximum size.
  #
  # @param max_size [Integer, Float] (DEFAULT_MAX_SIZE) the maximum size
  #   of the cache; use UNLIMITED_SIZE for no limit
  # @return [void]
  def initialize max_size: DEFAULT_MAX_SIZE
    @max_size = max_size
    @items = {}
  end

  # Returns the cached item for +key+, adding +item+ first when needed.
  #
  # When the cache is unlimited and already has +key+, the cached item
  # is kept and returned. In every other case +item+ is stored with
  # {#add}, which replaces any cached item for +key+ and makes it the
  # most recently added entry.
  #
  # @param key [Object] the key used to identify the item in the cache
  # @param item [Object] the item to be added to the cache
  # @return [Object, nil] the item cached under +key+ after the call; nil
  #   if the item was removed again by {#cleanup}
  def get_or_add key, item
    # parentheses are required: `include? key && unlimited?` would be
    # parsed as `include?(key && unlimited?)`
    add(key, item) unless include?(key) && unlimited?
    get_item key
  end

  # Stores +item+ under +key+, replacing any item already cached for
  # +key+, then trims the cache to its maximum size.
  #
  # @param key [Object] the key used to identify the item in the cache
  # @param item [Object] the item to be added to the cache
  # @return [Object] the item that was passed in
  def add key, item
    # delete first so that a re-added key moves to the end of the hash
    delete_item key
    items[key] = item
    cleanup
    item
  end

  # Checks if the given key is present in the cache.
  #
  # @param key [Object] the key to check for in the cache
  # @return [Boolean] true if the key is present in the cache, false otherwise
  def include? key
    items.key? key
  end

  # Checks if the cache is unlimited.
  #
  # @return [Boolean] true if the cache is unlimited, false otherwise
  def unlimited?
    max_size == UNLIMITED_SIZE
  end

  # Retrieves an item from the cache using the specified key.
  #
  # @param key [Object] The key used to identify the item in the cache.
  # @return [Object] The item associated with the specified key, or nil if the key is not present in the cache.
  def get_item key
    items[key]
  end

  # Alias-style accessor for {#get_item}.
  #
  # @param key [Object] The key used to identify the item in the cache.
  # @return [Object] The item associated with the specified key, or nil if the key is not present in the cache.
  def [](key)
    get_item key
  end

  # Returns an array of all the keys in the cache, oldest entry first.
  #
  # @return [Array<Object>] An array of keys.
  def keys
    items.keys
  end

  # Returns the number of items in the cache.
  #
  # @return [Integer] The number of items in the cache.
  def size
    items.size
  end

  # Deletes an item from the cache using the specified key.
  #
  # @param key [Object] The key used to identify the item in the cache.
  # @return [Object, nil] the deleted item, or nil if the key was not present
  def delete_item key
    items.delete key
  end

  # Removes the oldest entries until the size is less than or equal to
  # the maximum size.
  #
  # @return [void]
  def cleanup
    # stop on an empty cache as well, so that a negative max_size cannot
    # cause an infinite loop
    until items.empty? || size <= max_size
      delete_item items.each_key.first
    end
  end
end
110
+ end
111
+ end