ds-convert 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +294 -0
- data/Rakefile +12 -0
- data/config/settings.yml +150 -0
- data/exe/ds-convert +149 -0
- data/exe/ds-recon +275 -0
- data/exe/ds-validate-csv +40 -0
- data/exe/marc-mrc-to-xml.rb +80 -0
- data/lib/ds/cli.rb +102 -0
- data/lib/ds/constants.rb +166 -0
- data/lib/ds/converter/converter.rb +124 -0
- data/lib/ds/converter/writer.rb +50 -0
- data/lib/ds/converter.rb +7 -0
- data/lib/ds/csv_util.rb +43 -0
- data/lib/ds/data/berkeley-arks.txt +4000 -0
- data/lib/ds/data/getty-aat-centuries.csv +71 -0
- data/lib/ds/data/iiif_manifests.csv +122 -0
- data/lib/ds/data/legacy-iiif-manifests.csv +77 -0
- data/lib/ds/ds_error.rb +1 -0
- data/lib/ds/extractor/base_record_locator.rb +24 -0
- data/lib/ds/extractor/base_term.rb +79 -0
- data/lib/ds/extractor/csv_record_locator.rb +13 -0
- data/lib/ds/extractor/ds_csv_extractor.rb +695 -0
- data/lib/ds/extractor/ds_mets_xml_extractor.rb +1114 -0
- data/lib/ds/extractor/genre.rb +45 -0
- data/lib/ds/extractor/language.rb +31 -0
- data/lib/ds/extractor/marc_xml_extractor.rb +1172 -0
- data/lib/ds/extractor/material.rb +12 -0
- data/lib/ds/extractor/name.rb +50 -0
- data/lib/ds/extractor/place.rb +11 -0
- data/lib/ds/extractor/subject.rb +58 -0
- data/lib/ds/extractor/tei_xml_extractor.rb +687 -0
- data/lib/ds/extractor/title.rb +52 -0
- data/lib/ds/extractor/xml_record_locator.rb +38 -0
- data/lib/ds/extractor.rb +24 -0
- data/lib/ds/institutions.rb +55 -0
- data/lib/ds/manifest/base_id_validator.rb +76 -0
- data/lib/ds/manifest/constants.rb +67 -0
- data/lib/ds/manifest/ds_csv_id_validator.rb +15 -0
- data/lib/ds/manifest/entry.rb +133 -0
- data/lib/ds/manifest/manifest.rb +74 -0
- data/lib/ds/manifest/manifest_validator.rb +256 -0
- data/lib/ds/manifest/simple_xml_id_validator.rb +42 -0
- data/lib/ds/manifest.rb +30 -0
- data/lib/ds/mapper/base_mapper.rb +221 -0
- data/lib/ds/mapper/ds_csv_mapper.rb +77 -0
- data/lib/ds/mapper/ds_mets_mapper.rb +85 -0
- data/lib/ds/mapper/marc_mapper.rb +87 -0
- data/lib/ds/mapper/tei_xml_mapper.rb +79 -0
- data/lib/ds/mapper.rb +13 -0
- data/lib/ds/recon/constants.rb +56 -0
- data/lib/ds/recon/ds_csv_enumerator.rb +16 -0
- data/lib/ds/recon/ds_mets_xml_enumerator.rb +14 -0
- data/lib/ds/recon/marc_xml_enumerator.rb +15 -0
- data/lib/ds/recon/recon_builder.rb +183 -0
- data/lib/ds/recon/recon_data.rb +37 -0
- data/lib/ds/recon/recon_manager.rb +92 -0
- data/lib/ds/recon/source_enumerator.rb +21 -0
- data/lib/ds/recon/tei_xml_enumerator.rb +14 -0
- data/lib/ds/recon/type/all_subjects.rb +18 -0
- data/lib/ds/recon/type/genres.rb +50 -0
- data/lib/ds/recon/type/languages.rb +38 -0
- data/lib/ds/recon/type/materials.rb +40 -0
- data/lib/ds/recon/type/named_subjects.rb +20 -0
- data/lib/ds/recon/type/names.rb +65 -0
- data/lib/ds/recon/type/places.rb +40 -0
- data/lib/ds/recon/type/recon_type.rb +136 -0
- data/lib/ds/recon/type/splits.rb +34 -0
- data/lib/ds/recon/type/subjects.rb +65 -0
- data/lib/ds/recon/type/titles.rb +38 -0
- data/lib/ds/recon/url_lookup.rb +52 -0
- data/lib/ds/recon.rb +292 -0
- data/lib/ds/source/base_source.rb +32 -0
- data/lib/ds/source/ds_csv.rb +18 -0
- data/lib/ds/source/ds_mets_xml.rb +20 -0
- data/lib/ds/source/marc_xml.rb +22 -0
- data/lib/ds/source/source_cache.rb +69 -0
- data/lib/ds/source/tei_xml.rb +22 -0
- data/lib/ds/source.rb +20 -0
- data/lib/ds/util/cache.rb +111 -0
- data/lib/ds/util/csv_validator.rb +209 -0
- data/lib/ds/util/csv_writer.rb +42 -0
- data/lib/ds/util/strings.rb +194 -0
- data/lib/ds/util.rb +37 -0
- data/lib/ds/version.rb +5 -0
- data/lib/ds.rb +237 -0
- metadata +246 -0
@@ -0,0 +1,124 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DS
  module Converter
    ##
    # The DS Converter is responsible for generating the import
    # spreadsheet for a set of data. Its work is driven by a Manifest
    # CSV represented by a DS::Manifest::Manifest instance. Each row of
    # the CSV is represented by a DS::Manifest::Entry instance.
    #
    # The DS Converter does the following:
    #
    # 1. Reads each entry from the Manifest CSV
    # 2. Selects a Mapper type based on the source data type
    # 3. Assembles the data needed for mapping
    # 4. Maps each record to the data hash, assembling all the
    #    data hashes needed for the import CSV
    # 5. Returns the assembled hashes to the caller
    #
    # The class is Enumerable over the manifest's entries (see #each).
    class Converter
      include Enumerable

      # the manifest passed to the constructor
      attr_reader :manifest
      # the Time at which this converter was created; handed to every
      # mapper it builds
      attr_reader :timestamp
      # the manifest's +source_dir+, captured at construction
      attr_reader :source_dir
      # DS::Util::Cache holding the mappers created so far
      attr_reader :mapper_cache

      ##
      # @param [DS::Manifest::Manifest] manifest the Manifest instance
      def initialize manifest
        @manifest     = manifest
        @timestamp    = Time.now
        @source_dir   = manifest.source_dir
        @mapper_cache = DS::Util::Cache.new
        @errors       = []
      end

      ##
      # Map every manifest entry to its import CSV hash. Each hash is
      # whitespace-validated as it is produced (row numbers start at
      # 1); validation problems are collected and available from
      # #errors afterwards.
      #
      # @yieldparam [Hash<String,String>] the import CSV hash of data
      #     for each record
      # @return [Array<Hash<String,String>>] the array of all import CSV
      #     hashes for the provided manifest
      def convert &block
        each_with_index.map do |entry, index|
          hash = find_or_create_mapper(entry, timestamp).map_record entry
          validate_row index + 1, hash
          yield hash if block_given?
          hash
        end
      end

      ##
      # Run the whitespace validation on a single row and append any
      # errors it reports to this converter's error list.
      #
      # @param [Integer] row_num the row number
      # @param [Hash] row the row data
      # @return [void]
      def validate_row row_num, row
        @errors.concat DS::Util::CsvValidator.validate_whitespace(
          row,
          row_num: row_num,
          nested_columns: DS::Constants::NESTED_COLUMNS
        )
      end

      # Checks if there are any errors in the CSV.
      #
      # @return [Boolean] Returns true if there are no errors, false otherwise.
      def csv_valid?
        errors.empty?
      end

      # Returns a duplicate of the array of errors, so callers cannot
      # modify the converter's own list.
      #
      # @return [Array<String>] a duplicate of the array of errors
      def errors
        @errors.dup
      end

      ##
      # Iterate over the manifest's entries. Without a block an
      # Enumerator is returned, as is conventional for Enumerable
      # classes.
      #
      # @yieldparam [DS::Manifest::Entry] entry the manifest line item
      #     for each record
      # @return [Enumerator] when no block is given
      def each &block
        return enum_for(:each) unless block_given?

        manifest.each do |entry|
          yield entry
        end
      end

      ##
      # Return the cached mapper for +entry+, creating and caching one
      # on first use. Mappers are keyed by #mapper_key.
      #
      # @param [DS::Manifest::Entry] entry the manifest line item
      # @param [Time] tstamp the timestamp given to a newly created mapper
      # @return [Object] the mapper for the entry's source type
      def find_or_create_mapper entry, tstamp
        key = mapper_key entry
        return mapper_cache.get_item key if mapper_cache.include? key

        mapper = create_mapper entry, tstamp
        mapper_cache.add key, mapper
        mapper
      end

      ##
      # Build a new mapper matching the entry's source type.
      #
      # @param [DS::Manifest::Entry] entry the manifest line item
      # @param [Time] tstamp the timestamp given to the mapper
      # @raise [NotImplementedError] if there is no mapper for the
      #     entry's source type
      def create_mapper entry, tstamp
        case entry.source_type
        when DS::Constants::MARC_XML
          DS::Mapper::MarcMapper.new source_dir: source_dir, timestamp: tstamp
        when DS::Constants::TEI_XML
          DS::Mapper::TeiXmlMapper.new source_dir: source_dir, timestamp: tstamp
        when DS::Constants::DS_METS
          DS::Mapper::DSMetsMapper.new source_dir: source_dir, timestamp: tstamp
        when DS::Constants::DS_CSV
          DS::Mapper::DSCSVMapper.new source_dir: source_dir, timestamp: tstamp
        else
          raise NotImplementedError,
                "Mapper not implemented for source type: '#{entry.source_type}'"
        end
      end

      ##
      # @param [DS::Manifest::Entry] entry the manifest line item
      # @return [String] the entry's filename joined onto +source_dir+
      def source_file_path entry
        File.join source_dir, entry.filename
      end

      ##
      # @param [DS::Manifest::Entry] entry the manifest line item
      # @return [Hash] the cache key: the entry's source type plus the
      #     manifest's CSV path
      def mapper_key entry
        { source_type: entry.source_type, manifest_path: manifest.csv_path }
      end
    end # class Converter
  end # module Converter
end # module DS
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DS
  module Converter
    ##
    # Writes the rows produced by a converter to a CSV file, validating
    # each row as it is written and collecting any errors reported by
    # the row validator.
    class Writer

      # row number of the last row written; starts at 1 (the heading row)
      attr_reader :count
      # the destination handed to CSV.open
      attr_reader :output
      # the row validator; must respond to +row_valid?(row)+
      attr_reader :validator
      # the object whose +convert+ method yields the rows to write
      attr_reader :converter

      ##
      # @param output_io the destination passed to CSV.open in #write, so
      #     it must be something CSV.open accepts (e.g., a file path)
      # @param row_validator an object responding to +row_valid?(row)+
      # @param converter an object responding to +convert+ that yields
      #     one row per record; required before #write is called.
      #     NOTE(review): the original code called an undefined
      #     +converter+; passing it here is an assumption -- confirm the
      #     intended wiring.
      def initialize output_io, row_validator, converter = nil
        @output    = output_io
        @count     = 1
        @validator = row_validator
        @converter = converter
        @valid     = true
        @errors    = []
      end

      ##
      # Write the heading row followed by every row yielded by the
      # converter. The output is opened once (mode "w", so existing
      # content is replaced) and the row count and error list are reset
      # so they always describe the file just written.
      #
      # @raise [ArgumentError] if no converter was supplied
      # @return the value returned by the converter's +convert+
      def write
        raise ArgumentError, 'a converter is required to write rows' if converter.nil?

        @count  = 1
        @errors = []
        CSV.open output, "w", headers: true do |csv|
          converter.convert do |row|
            # headings go out just before the first data row
            csv << DS::HEADINGS if count == 1
            @count += 1
            validate_row count, row
            csv << row
          end
        end
      end

      ##
      # @return [Boolean] true if no row errors have been recorded
      def valid?
        errors.empty?
      end

      ##
      # Ask the validator about +row+ and record the outcome if it
      # reports a problem.
      #
      # NOTE(review): the validator's return value is treated as an
      # error description: a truthy value is recorded, nil/false means
      # the row is fine. Confirm against the validator in use.
      #
      # @param [Integer] row_num the row number
      # @param row the row data
      # @return [void]
      def validate_row row_num, row
        error = validator.row_valid? row
        return unless error

        add_error row_num, error
      end

      ##
      # @param [Integer] row_num the row number
      # @param error the error reported for that row
      def add_error row_num, error
        @errors << [row_num, error]
      end

      ##
      # @return [Array<Array>] a duplicate of the recorded
      #     +[row_num, error]+ pairs
      def errors
        @errors.dup
      end
    end
  end
end
|
data/lib/ds/converter.rb
ADDED
data/lib/ds/csv_util.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
module DS
  module CSVUtil
    module ClassMethods
      # TODO: These methods don't belong in CSVUtil; find them a new home
      # TODO: Remove CSVUtil when the above TODO is complete
      # Columns with two levels of subfields, separated by '|' and ';'
      NESTED_COLUMNS = %w[
        subject subject_label genre genre_label production_place
        production_place_label language language_label
      ].freeze

      ##
      # Check all rows for validation errors, including:
      #
      # - trailing spaces in values
      #
      # Every row is checked even after a failure, so that a warning is
      # printed for each problem found. Rows are numbered by their
      # zero-based position in +rows+.
      #
      # @param [Array<Hash>] rows the CSV rows
      # @return [Boolean] true if no row produced an error
      def validate rows
        # map (not all? with a block) so validation never short-circuits
        rows.each_with_index.map { |row, index| row_valid? row, index }.all?
      end

      # split on pipes that are not escaped with '\'
      PIPE_SPLIT_REGEXP = %r{(?<!\\)\|}
      # split on pipes and semicolons that are not escaped with '\'
      PIPE_SEMICOLON_REGEXP = %r{(?<!\\)[;|]}

      ##
      # Run the whitespace validation on one row, printing a
      # "WARNING: ..." line to STDERR for each error reported.
      #
      # @param [Hash] row the CSV row
      # @param [Integer] index the row number passed to the validator
      # @return [Boolean] true if the validator reported no errors
      def row_valid? row, index
        errors = DS::Util::CsvValidator.validate_whitespace(
          row, row_num: index, nested_columns: NESTED_COLUMNS
        )
        errors.each { |error| STDERR.puts "WARNING: #{error}" }
        errors.empty?
      end
    end

    extend ClassMethods
  end
end
|