ds-convert 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +294 -0
  3. data/Rakefile +12 -0
  4. data/config/settings.yml +150 -0
  5. data/exe/ds-convert +149 -0
  6. data/exe/ds-recon +275 -0
  7. data/exe/ds-validate-csv +40 -0
  8. data/exe/marc-mrc-to-xml.rb +80 -0
  9. data/lib/ds/cli.rb +102 -0
  10. data/lib/ds/constants.rb +166 -0
  11. data/lib/ds/converter/converter.rb +124 -0
  12. data/lib/ds/converter/writer.rb +50 -0
  13. data/lib/ds/converter.rb +7 -0
  14. data/lib/ds/csv_util.rb +43 -0
  15. data/lib/ds/data/berkeley-arks.txt +4000 -0
  16. data/lib/ds/data/getty-aat-centuries.csv +71 -0
  17. data/lib/ds/data/iiif_manifests.csv +122 -0
  18. data/lib/ds/data/legacy-iiif-manifests.csv +77 -0
  19. data/lib/ds/ds_error.rb +1 -0
  20. data/lib/ds/extractor/base_record_locator.rb +24 -0
  21. data/lib/ds/extractor/base_term.rb +79 -0
  22. data/lib/ds/extractor/csv_record_locator.rb +13 -0
  23. data/lib/ds/extractor/ds_csv_extractor.rb +695 -0
  24. data/lib/ds/extractor/ds_mets_xml_extractor.rb +1114 -0
  25. data/lib/ds/extractor/genre.rb +45 -0
  26. data/lib/ds/extractor/language.rb +31 -0
  27. data/lib/ds/extractor/marc_xml_extractor.rb +1172 -0
  28. data/lib/ds/extractor/material.rb +12 -0
  29. data/lib/ds/extractor/name.rb +50 -0
  30. data/lib/ds/extractor/place.rb +11 -0
  31. data/lib/ds/extractor/subject.rb +58 -0
  32. data/lib/ds/extractor/tei_xml_extractor.rb +687 -0
  33. data/lib/ds/extractor/title.rb +52 -0
  34. data/lib/ds/extractor/xml_record_locator.rb +38 -0
  35. data/lib/ds/extractor.rb +24 -0
  36. data/lib/ds/institutions.rb +55 -0
  37. data/lib/ds/manifest/base_id_validator.rb +76 -0
  38. data/lib/ds/manifest/constants.rb +67 -0
  39. data/lib/ds/manifest/ds_csv_id_validator.rb +15 -0
  40. data/lib/ds/manifest/entry.rb +133 -0
  41. data/lib/ds/manifest/manifest.rb +74 -0
  42. data/lib/ds/manifest/manifest_validator.rb +256 -0
  43. data/lib/ds/manifest/simple_xml_id_validator.rb +42 -0
  44. data/lib/ds/manifest.rb +30 -0
  45. data/lib/ds/mapper/base_mapper.rb +221 -0
  46. data/lib/ds/mapper/ds_csv_mapper.rb +77 -0
  47. data/lib/ds/mapper/ds_mets_mapper.rb +85 -0
  48. data/lib/ds/mapper/marc_mapper.rb +87 -0
  49. data/lib/ds/mapper/tei_xml_mapper.rb +79 -0
  50. data/lib/ds/mapper.rb +13 -0
  51. data/lib/ds/recon/constants.rb +56 -0
  52. data/lib/ds/recon/ds_csv_enumerator.rb +16 -0
  53. data/lib/ds/recon/ds_mets_xml_enumerator.rb +14 -0
  54. data/lib/ds/recon/marc_xml_enumerator.rb +15 -0
  55. data/lib/ds/recon/recon_builder.rb +183 -0
  56. data/lib/ds/recon/recon_data.rb +37 -0
  57. data/lib/ds/recon/recon_manager.rb +92 -0
  58. data/lib/ds/recon/source_enumerator.rb +21 -0
  59. data/lib/ds/recon/tei_xml_enumerator.rb +14 -0
  60. data/lib/ds/recon/type/all_subjects.rb +18 -0
  61. data/lib/ds/recon/type/genres.rb +50 -0
  62. data/lib/ds/recon/type/languages.rb +38 -0
  63. data/lib/ds/recon/type/materials.rb +40 -0
  64. data/lib/ds/recon/type/named_subjects.rb +20 -0
  65. data/lib/ds/recon/type/names.rb +65 -0
  66. data/lib/ds/recon/type/places.rb +40 -0
  67. data/lib/ds/recon/type/recon_type.rb +136 -0
  68. data/lib/ds/recon/type/splits.rb +34 -0
  69. data/lib/ds/recon/type/subjects.rb +65 -0
  70. data/lib/ds/recon/type/titles.rb +38 -0
  71. data/lib/ds/recon/url_lookup.rb +52 -0
  72. data/lib/ds/recon.rb +292 -0
  73. data/lib/ds/source/base_source.rb +32 -0
  74. data/lib/ds/source/ds_csv.rb +18 -0
  75. data/lib/ds/source/ds_mets_xml.rb +20 -0
  76. data/lib/ds/source/marc_xml.rb +22 -0
  77. data/lib/ds/source/source_cache.rb +69 -0
  78. data/lib/ds/source/tei_xml.rb +22 -0
  79. data/lib/ds/source.rb +20 -0
  80. data/lib/ds/util/cache.rb +111 -0
  81. data/lib/ds/util/csv_validator.rb +209 -0
  82. data/lib/ds/util/csv_writer.rb +42 -0
  83. data/lib/ds/util/strings.rb +194 -0
  84. data/lib/ds/util.rb +37 -0
  85. data/lib/ds/version.rb +5 -0
  86. data/lib/ds.rb +237 -0
  87. metadata +246 -0
@@ -0,0 +1,124 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DS
4
+ module Converter
5
+ ##
6
+ # The DS Converter is responsible for generating the import
7
+ # spreadsheet for a set of data. Its work is driven by a Manifest
8
+ # CSV represented by a DS::Manifest::Manifest instance. Each row of
9
+ # the CSV is represented by a DS::Manifest::Entry instance.
10
+ #
11
+ # The DS Converter does the following:
12
+ #
13
+ # 1. Reads each entry from the Manifest CSV
14
+ # 2. Selects a Mapper type based on the source data type
15
+ # 3. Assembles the data needed for mapping
16
+ # 4. Maps each record to the data hash, assembling all the
17
+ # data hashes needed for the import CSV
18
+ # 5. Returns the assembled hashes to the caller
19
class Converter
  include Enumerable

  attr_reader :manifest
  attr_reader :timestamp
  attr_reader :source_dir
  attr_reader :mapper_cache

  ##
  # @param [DS::Manifest::Manifest] manifest the Manifest instance that
  #   drives the conversion; it must respond to +source_dir+,
  #   +csv_path+ and +each+
  def initialize manifest
    @manifest     = manifest
    @timestamp    = Time.now
    @source_dir   = manifest.source_dir
    @mapper_cache = DS::Util::Cache.new
    @errors       = []
  end

  ##
  # Map every manifest entry to an import CSV hash. Each hash is
  # whitespace-validated as it is produced; validation problems are
  # collected (see #errors) rather than raised.
  #
  # @yieldparam [Hash<String,String>] hash the import CSV hash of data
  #   for each record
  # @return [Array<Hash<String,String>>] the array of all import CSV
  #   hashes for the provided manifest
  # @raise [NotImplementedError] if an entry has an unknown source type
  def convert
    each_with_index.map do |entry, index|
      hash = find_or_create_mapper(entry, timestamp).map_record entry
      # Row numbers reported in validation errors are 1-based.
      validate_row index + 1, hash
      yield hash if block_given?
      hash
    end
  end

  ##
  # Run the whitespace validation on one row and remember any errors.
  #
  # @param [Integer] row_num the row number
  # @param [Hash] row the row data
  # @return [void]
  def validate_row row_num, row
    @errors.concat DS::Util::CsvValidator.validate_whitespace(
      row,
      row_num: row_num,
      nested_columns: DS::Constants::NESTED_COLUMNS
    )
  end

  # Checks if there are any errors in the CSV.
  #
  # @return [Boolean] Returns true if there are no errors, false otherwise.
  def csv_valid?
    errors.empty?
  end

  # Returns a duplicate of the array of errors, so callers cannot
  # modify the converter's own list.
  #
  # @return [Array<String>] a duplicate of the array of errors
  def errors
    @errors.dup
  end

  ##
  # Iterate over the manifest entries. Without a block an Enumerator
  # is returned, as is conventional for Enumerable classes.
  #
  # @yieldparam [DS::Manifest::Entry] entry the manifest line item
  #   for each record
  # @return [Enumerator] when no block is given
  def each
    return enum_for(:each) unless block_given?

    manifest.each do |entry|
      yield entry
    end
  end

  ##
  # Return the cached mapper for the entry's source type, creating and
  # caching it on first use.
  #
  # @param [DS::Manifest::Entry] entry the manifest line item
  # @param [Time] tstamp the timestamp handed to a newly created mapper
  # @return [Object] the mapper for the entry's source type
  def find_or_create_mapper entry, tstamp
    key = mapper_key entry
    return mapper_cache.get_item key if mapper_cache.include? key

    mapper = create_mapper entry, tstamp
    mapper_cache.add key, mapper
    mapper
  end

  ##
  # Build a new mapper for the entry's source type.
  #
  # @param [DS::Manifest::Entry] entry the manifest line item
  # @param [Time] tstamp the timestamp handed to the mapper
  # @raise [NotImplementedError] if the source type has no mapper
  def create_mapper entry, tstamp
    case entry.source_type
    when DS::Constants::MARC_XML
      DS::Mapper::MarcMapper.new source_dir: source_dir, timestamp: tstamp
    when DS::Constants::TEI_XML
      DS::Mapper::TeiXmlMapper.new source_dir: source_dir, timestamp: tstamp
    when DS::Constants::DS_METS
      DS::Mapper::DSMetsMapper.new source_dir: source_dir, timestamp: tstamp
    when DS::Constants::DS_CSV
      DS::Mapper::DSCSVMapper.new source_dir: source_dir, timestamp: tstamp
    else
      raise NotImplementedError,
            "Mapper not implemented for source type: '#{entry.source_type}'"
    end
  end

  ##
  # @param [DS::Manifest::Entry] entry the manifest line item
  # @return [String] the entry's filename joined onto the source directory
  def source_file_path entry
    File.join source_dir, entry.filename
  end

  ##
  # The cache key for an entry's mapper: one mapper is kept per source
  # type and manifest path.
  #
  # @param [DS::Manifest::Entry] entry the manifest line item
  # @return [Hash<Symbol,Object>] the cache key
  def mapper_key entry
    { source_type: entry.source_type, manifest_path: manifest.csv_path }
  end
end # class Converter
123
+ end # module Converter
124
+ end # module DS
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DS
4
+ module Converter
5
##
# Writes the rows produced by a converter to a CSV, validating each
# row with the supplied validator as it is written.
class Writer

  attr_reader :count
  attr_reader :output
  attr_reader :validator
  attr_reader :converter

  ##
  # @param [String, IO] output_io a file path, or an IO-like object that
  #   responds to +write+, that receives the CSV
  # @param [#row_valid?] row_validator object called with each row
  # @param [#convert] converter optional source of rows; it may instead
  #   be passed to #write
  def initialize output_io, row_validator, converter = nil
    @output    = output_io
    # Starts at 1 for the header row; data rows are numbered from 2.
    @count     = 1
    @validator = row_validator
    @converter = converter
    @errors    = []
  end

  ##
  # Write the header row followed by every row yielded by the
  # converter. The output is opened once for the whole run, so rows
  # accumulate instead of overwriting one another.
  #
  # @param [#convert] source the object whose +convert+ yields the rows;
  #   defaults to the converter given to the constructor
  # @return [Object] whatever +source.convert+ returns
  # @raise [ArgumentError] if no converter is available
  def write source = converter
    raise ArgumentError, 'a converter is required to write rows' if source.nil?

    open_csv do |csv|
      csv << DS::HEADINGS
      source.convert do |row|
        # Assign the instance variable: a bare `count += 1` would create
        # a nil local that shadows the reader.
        @count += 1
        validate_row count, row
        csv << row
      end
    end
  end

  ##
  # @return [Boolean] true if no row errors have been recorded
  def valid?
    @errors.empty?
  end

  ##
  # Ask the validator about a row and record the error it reports.
  #
  # NOTE(review): assumes +validator.row_valid?+ returns an error
  # (or collection of errors) for a bad row and nil, false or an empty
  # collection for a good one -- confirm against the validator used.
  #
  # @param [Integer] row_num the row number
  # @param [Hash] row the row data
  # @return [void]
  def validate_row row_num, row
    error = validator.row_valid? row
    return unless error
    return if error.respond_to?(:empty?) && error.empty?

    add_error row_num, error
  end

  ##
  # @param [Integer] row_num the row number
  # @param [Object] error the error reported for the row
  # @return [Array] the internal list of +[row_num, error]+ pairs
  def add_error row_num, error
    @errors << [row_num, error]
  end

  ##
  # @return [Array<Array>] a duplicate of the recorded
  #   +[row_num, error]+ pairs
  def errors
    @errors.dup
  end

  private

  ##
  # Yield a CSV writer for +output+: wraps IO-like objects directly,
  # otherwise treats +output+ as a path and opens it for writing.
  def open_csv
    if output.respond_to?(:write)
      yield CSV.new(output, headers: true)
    else
      CSV.open(output, 'w', headers: true) { |csv| yield csv }
    end
  end
end
49
+ end
50
+ end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+ require_relative 'converter/converter'
3
+
4
+ module DS
5
+ module Converter # namespace only; nothing is defined here, converter/converter is loaded by the require_relative above
6
+ end
7
+ end
@@ -0,0 +1,43 @@
1
+ require 'csv'
2
+
3
+ module DS
4
module CSVUtil
  module ClassMethods
    # TODO: These methods don't belong in CSVUtil; find them a new home
    # TODO: Remove CSVUtil when the above TODO is complete

    # Columns with two levels of subfields, separated by '|' and ';'
    NESTED_COLUMNS = %w{ subject subject_label genre genre_label production_place production_place_label language language_label }

    # split on pipes that are not escaped with '\'
    PIPE_SPLIT_REGEXP = %r{(?<!\\)\|}
    # split on pipes and semicolons that are not escaped with '\'
    PIPE_SEMICOLON_REGEXP = %r{(?<!\\)[;|]}

    ##
    # Validate every row, reporting problems as warnings on STDERR.
    # All rows are always checked (no early exit), so every warning is
    # printed even after the first failure.
    #
    # @param [Array<Hash>] rows the CSV rows
    # @return [Boolean] true when no row produced a validation error
    def validate(rows)
      outcomes = rows.each_with_index.map do |row, position|
        row_valid?(row, position)
      end
      outcomes.all?
    end

    ##
    # Run the whitespace validation on a single row, printing each
    # reported error to STDERR as a warning.
    #
    # @param [Hash] row the CSV row
    # @param [Integer] index the row's index, passed on as the row number
    # @return [Boolean] true when the validator reported no errors
    def row_valid?(row, index)
      warned = false
      problems = DS::Util::CsvValidator.validate_whitespace(
        row,
        row_num: index,
        nested_columns: NESTED_COLUMNS
      )
      problems.each do |problem|
        warned = true
        STDERR.puts "WARNING: #{problem}"
      end
      !warned
    end
  end

  extend ClassMethods
end
43
+ end