ds-convert 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +294 -0
- data/Rakefile +12 -0
- data/config/settings.yml +150 -0
- data/exe/ds-convert +149 -0
- data/exe/ds-recon +275 -0
- data/exe/ds-validate-csv +40 -0
- data/exe/marc-mrc-to-xml.rb +80 -0
- data/lib/ds/cli.rb +102 -0
- data/lib/ds/constants.rb +166 -0
- data/lib/ds/converter/converter.rb +124 -0
- data/lib/ds/converter/writer.rb +50 -0
- data/lib/ds/converter.rb +7 -0
- data/lib/ds/csv_util.rb +43 -0
- data/lib/ds/data/berkeley-arks.txt +4000 -0
- data/lib/ds/data/getty-aat-centuries.csv +71 -0
- data/lib/ds/data/iiif_manifests.csv +122 -0
- data/lib/ds/data/legacy-iiif-manifests.csv +77 -0
- data/lib/ds/ds_error.rb +1 -0
- data/lib/ds/extractor/base_record_locator.rb +24 -0
- data/lib/ds/extractor/base_term.rb +79 -0
- data/lib/ds/extractor/csv_record_locator.rb +13 -0
- data/lib/ds/extractor/ds_csv_extractor.rb +695 -0
- data/lib/ds/extractor/ds_mets_xml_extractor.rb +1114 -0
- data/lib/ds/extractor/genre.rb +45 -0
- data/lib/ds/extractor/language.rb +31 -0
- data/lib/ds/extractor/marc_xml_extractor.rb +1172 -0
- data/lib/ds/extractor/material.rb +12 -0
- data/lib/ds/extractor/name.rb +50 -0
- data/lib/ds/extractor/place.rb +11 -0
- data/lib/ds/extractor/subject.rb +58 -0
- data/lib/ds/extractor/tei_xml_extractor.rb +687 -0
- data/lib/ds/extractor/title.rb +52 -0
- data/lib/ds/extractor/xml_record_locator.rb +38 -0
- data/lib/ds/extractor.rb +24 -0
- data/lib/ds/institutions.rb +55 -0
- data/lib/ds/manifest/base_id_validator.rb +76 -0
- data/lib/ds/manifest/constants.rb +67 -0
- data/lib/ds/manifest/ds_csv_id_validator.rb +15 -0
- data/lib/ds/manifest/entry.rb +133 -0
- data/lib/ds/manifest/manifest.rb +74 -0
- data/lib/ds/manifest/manifest_validator.rb +256 -0
- data/lib/ds/manifest/simple_xml_id_validator.rb +42 -0
- data/lib/ds/manifest.rb +30 -0
- data/lib/ds/mapper/base_mapper.rb +221 -0
- data/lib/ds/mapper/ds_csv_mapper.rb +77 -0
- data/lib/ds/mapper/ds_mets_mapper.rb +85 -0
- data/lib/ds/mapper/marc_mapper.rb +87 -0
- data/lib/ds/mapper/tei_xml_mapper.rb +79 -0
- data/lib/ds/mapper.rb +13 -0
- data/lib/ds/recon/constants.rb +56 -0
- data/lib/ds/recon/ds_csv_enumerator.rb +16 -0
- data/lib/ds/recon/ds_mets_xml_enumerator.rb +14 -0
- data/lib/ds/recon/marc_xml_enumerator.rb +15 -0
- data/lib/ds/recon/recon_builder.rb +183 -0
- data/lib/ds/recon/recon_data.rb +37 -0
- data/lib/ds/recon/recon_manager.rb +92 -0
- data/lib/ds/recon/source_enumerator.rb +21 -0
- data/lib/ds/recon/tei_xml_enumerator.rb +14 -0
- data/lib/ds/recon/type/all_subjects.rb +18 -0
- data/lib/ds/recon/type/genres.rb +50 -0
- data/lib/ds/recon/type/languages.rb +38 -0
- data/lib/ds/recon/type/materials.rb +40 -0
- data/lib/ds/recon/type/named_subjects.rb +20 -0
- data/lib/ds/recon/type/names.rb +65 -0
- data/lib/ds/recon/type/places.rb +40 -0
- data/lib/ds/recon/type/recon_type.rb +136 -0
- data/lib/ds/recon/type/splits.rb +34 -0
- data/lib/ds/recon/type/subjects.rb +65 -0
- data/lib/ds/recon/type/titles.rb +38 -0
- data/lib/ds/recon/url_lookup.rb +52 -0
- data/lib/ds/recon.rb +292 -0
- data/lib/ds/source/base_source.rb +32 -0
- data/lib/ds/source/ds_csv.rb +18 -0
- data/lib/ds/source/ds_mets_xml.rb +20 -0
- data/lib/ds/source/marc_xml.rb +22 -0
- data/lib/ds/source/source_cache.rb +69 -0
- data/lib/ds/source/tei_xml.rb +22 -0
- data/lib/ds/source.rb +20 -0
- data/lib/ds/util/cache.rb +111 -0
- data/lib/ds/util/csv_validator.rb +209 -0
- data/lib/ds/util/csv_writer.rb +42 -0
- data/lib/ds/util/strings.rb +194 -0
- data/lib/ds/util.rb +37 -0
- data/lib/ds/version.rb +5 -0
- data/lib/ds.rb +237 -0
- metadata +246 -0
@@ -0,0 +1,85 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DS
|
4
|
+
module Mapper
|
5
|
+
# Maps manifest entries whose source is a DS METS XML file to hashes of
# DS import values.
class DSMetsMapper < DS::Mapper::BaseMapper
  # NOTE(review): neither reader is assigned anywhere in this class, so
  # both return +nil+ unless a superclass sets them -- confirm.
  attr_reader :iiif_lookup, :ia_url_lookup

  # @param [String] source_dir directory containing the source files
  # @param [Object] timestamp value reported as +data_processed_at+
  def initialize(source_dir:, timestamp:)
    super(source_dir: source_dir, timestamp: timestamp, source: DS::Source::DSMetsXML.new)
  end

  ##
  # Load the entry's source file and locate the entry's record in it.
  #
  # @param [DS::Manifest::Entry] entry entry instance for a manifest row
  # @return [Object] the value returned by the record locator
  # @raise [RuntimeError] if the locator returns nothing for the entry
  def extract_record(entry)
    locator = DS::Extractor::XmlRecordLocator.new
    xml = source.load_source(File.join(source_dir, entry.filename))
    record = locator.locate_record(
      xml, entry.institutional_id, entry.institutional_id_location_in_source
    )
    unless record.present?
      raise "Unable to locate record for #{entry.institutional_id} (errors: #{locator.errors.join(', ')})"
    end

    record
  end

  ##
  # @param [DS::Manifest::Entry] entry entry instance for a manifest row
  # @return [Hash] the mapped record
  def map_record(entry)
    record = extract_record(entry)
    extractor = DS::Extractor::DsMetsXmlExtractor

    # Values read from the record; multiple values are pipe-joined.
    convention = extractor.extract_cataloging_convention(record)
    dates_as_recorded = extractor.extract_production_date_as_recorded(record).join('|')
    date_ranges = extractor.extract_date_range(record, range_sep: '^').join('|')
    centuries = DS.transform_dates_to_centuries(date_ranges)
    centuries_aat = DS.transform_centuries_to_aat(centuries)
    scribe_dated = extractor.dated_by_scribe?(record)
    physdesc = extractor.extract_physical_description(record).join('|')
    notes = extractor.extract_notes(record).join('|')
    acks = extractor.extract_acknowledgments(record).join('|')

    # Key order is kept exactly as before.
    mapped = {
      ds_id: entry.ds_id,
      date_added: nil,
      date_last_updated: nil,
      dated: scribe_dated,
      cataloging_convention: convention,
      source_type: entry.source_type,
      holding_institution_ds_qid: entry.institution_ds_qid,
      holding_institution_as_recorded: entry.institution_wikidata_label,
      holding_institution_id_number: entry.institutional_id,
      holding_institution_shelfmark: entry.call_number,
      link_to_holding_institution_record: entry.link_to_institutional_record,
      iiif_manifest: entry.iiif_manifest_url,
      production_date_as_recorded: dates_as_recorded,
      production_date: date_ranges,
      century: centuries,
      century_aat: centuries_aat,
      physical_description: physdesc,
      note: notes,
      acknowledgments: acks,
      data_processed_at: timestamp,
      data_source_modified: entry.record_last_updated,
      source_file: entry.filename
    }
    mapped.update(build_term_maps(extractor, record))
  end
end
|
84
|
+
end
|
85
|
+
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DS
|
4
|
+
module Mapper
|
5
|
+
|
6
|
+
# Maps manifest entries whose source is a MARC XML file to hashes of DS
# import values.
class MarcMapper < DS::Mapper::BaseMapper
  # @param [String] source_dir directory containing the source files
  # @param [Object] timestamp value reported as +data_processed_at+
  def initialize(source_dir:, timestamp:)
    super(source_dir: source_dir, timestamp: timestamp, source: DS::Source::MarcXML.new)
  end

  ##
  # Load the entry's source file and return the first record matched by
  # the entry's XPath.
  #
  # @param [DS::Manifest::Entry] entry +entry+ representing one
  #     row in a manifest
  # @return [Object] the first match returned by the record locator
  # @raise [RuntimeError] if there is no match
  def extract_record(entry)
    locator = DS::Extractor::XmlRecordLocator.new(namespaces: DS::Constants::XML_NAMESPACES)
    xml = source.load_source(File.join(source_dir, entry.filename))

    # The manifest stores an XPath template; substitute this entry's id
    # for the ID_PLACEHOLDER token.
    xpath = entry.institutional_id_location_in_source.gsub('ID_PLACEHOLDER', entry.institutional_id)
    record = locator.locate_record(xml, entry.institutional_id, xpath).first
    unless record.present?
      raise "Unable to locate record for #{entry.institutional_id} (errors: #{locator.errors.join(', ')})"
    end

    record
  end

  ##
  # @param [DS::Manifest::Entry] entry entry instance for a manifest row
  # @return [Hash] the mapped record
  def map_record(entry)
    record = extract_record(entry)
    extractor = DS::Extractor::MarcXmlExtractor

    # Values read from the record; multiple values are pipe-joined.
    is_dated = entry.dated?
    convention = extractor.extract_cataloging_convention(record)
    dates_as_recorded = extractor.extract_production_date_as_recorded(record).join('|')
    date_ranges = extractor.extract_date_range(record, range_sep: '^').join('|')
    centuries = DS.transform_dates_to_centuries(date_ranges)
    centuries_aat = DS.transform_centuries_to_aat(centuries)
    physdesc = extractor.extract_physical_description(record).join('|')
    notes = extractor.extract_notes(record).join('|')

    # Key order is kept exactly as before.
    mapped = {
      ds_id: entry.ds_id,
      date_added: '',
      date_last_updated: '',
      dated: is_dated,
      source_type: 'marc-xml',
      cataloging_convention: convention,
      holding_institution_ds_qid: entry.institution_ds_qid,
      holding_institution_as_recorded: entry.institution_wikidata_label,
      holding_institution_id_number: entry.institutional_id,
      holding_institution_shelfmark: entry.call_number,
      link_to_holding_institution_record: entry.link_to_institutional_record,
      iiif_manifest: entry.iiif_manifest_url,
      production_date: date_ranges,
      century: centuries,
      century_aat: centuries_aat,
      production_date_as_recorded: dates_as_recorded,
      physical_description: physdesc,
      note: notes,
      data_processed_at: timestamp,
      data_source_modified: entry.record_last_updated,
      source_file: entry.filename,
      acknowledgments: ''
    }
    mapped.update(build_term_maps(extractor, record))
  end
end
|
86
|
+
end
|
87
|
+
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DS
|
4
|
+
module Mapper
|
5
|
+
class TeiXmlMapper < BaseMapper
|
6
|
+
|
7
|
+
# @param [String] source_dir directory containing the source files
# @param [Object] timestamp value reported as +data_processed_at+
def initialize(source_dir:, timestamp:)
  super(source_dir: source_dir, timestamp: timestamp, source: DS::Source::TeiXML.new)
end
|
14
|
+
|
15
|
+
##
# Load the entry's source file and locate the entry's record in it.
#
# @param [DS::Manifest::Entry] entry entry instance for a manifest row
# @return [Object] the value returned by the record locator
# @raise [RuntimeError] if the locator returns nothing for the entry;
#     the message lists the locator's errors
def extract_record(entry)
  locator = DS::Extractor::XmlRecordLocator.new
  source_file_path = File.join(source_dir, entry.filename)
  xml = source.load_source(source_file_path)
  record = locator.locate_record(
    xml, entry.institutional_id, entry.institutional_id_location_in_source
  )
  return record if record.present?

  # The local is +locator+; the previous +record_locator+ reference
  # raised NameError here instead of the intended message.
  raise "Unable to locate record for #{entry.institutional_id} (errors: #{locator.errors.join(', ')})"
end
|
24
|
+
|
25
|
+
##
# @param [DS::Manifest::Entry] entry entry instance for a manifest row
# @return [Hash] the mapped record
def map_record(entry)
  record = extract_record(entry)
  extractor = DS::Extractor::TeiXml

  # Values read from the record; multiple values are pipe-joined.
  convention = extractor.extract_cataloging_convention(record)
  is_dated = entry.dated?
  dates_as_recorded = extractor.extract_production_date_as_recorded(record, range_sep: '-').join('|')
  date_ranges = extractor.extract_date_range(record, range_sep: '^').join('|')
  centuries = DS.transform_dates_to_centuries(date_ranges)
  centuries_aat = DS.transform_centuries_to_aat(centuries)
  acks = extractor.extract_acknowledgments(record).join('|')
  physdesc = extractor.extract_physical_description(record).join('|')
  notes = extractor.extract_notes(record).join('|')

  # TODO: BiblioPhilly MSS have keywords (not subjects, genre); include them?

  # Key order is kept exactly as before.
  mapped = {
    ds_id: entry.ds_id,
    date_added: '',
    date_last_updated: '',
    dated: is_dated,
    cataloging_convention: convention,
    source_type: 'tei-xml',
    holding_institution_ds_qid: entry.institution_ds_qid,
    holding_institution_as_recorded: entry.institution_wikidata_label,
    holding_institution_id_number: entry.institutional_id,
    holding_institution_shelfmark: entry.call_number,
    link_to_holding_institution_record: entry.link_to_institutional_record,
    iiif_manifest: entry.iiif_manifest_url,
    production_date_as_recorded: dates_as_recorded,
    production_date: date_ranges,
    century: centuries,
    century_aat: centuries_aat,
    physical_description: physdesc,
    acknowledgments: acks,
    note: notes,
    data_processed_at: timestamp,
    data_source_modified: entry.record_last_updated,
    source_file: entry.filename
  }
  mapped.update(build_term_maps(extractor, record))
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
data/lib/ds/mapper.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require_relative 'mapper/base_mapper'
|
3
|
+
require_relative 'mapper/ds_csv_mapper'
|
4
|
+
require_relative 'mapper/ds_mets_mapper'
|
5
|
+
require_relative 'mapper/marc_mapper'
|
6
|
+
require_relative 'mapper/tei_xml_mapper'
|
7
|
+
|
8
|
+
module DS
|
9
|
+
# The DS mapper namespace contains classes and methods for mapping
|
10
|
+
# DS data records to DS import CSVs.
|
11
|
+
module Mapper
|
12
|
+
end
|
13
|
+
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Recon
  # Frozen lists of header names, one list per recon category.
  # NOTE(review): presumably these are the column headers of the recon
  # CSV files -- confirm against the code that reads them.
  module Constants
    GENRE_HEADERS = %w[
      genre_as_recorded vocab source_authority_uri authorized_label structured_value
    ].freeze

    LANGUAGES_HEADERS = %w[
      language_as_recorded language_code authorized_label structured_value
    ].freeze

    MATERIALS_HEADERS = %w[material_as_recorded authorized_label structured_value].freeze

    NAMES_HEADERS = %w[
      name_as_recorded role name_agr source_authority_uri
      instance_of authorized_label structured_value
    ].freeze

    PLACES_HEADERS = %w[place_as_recorded authorized_label structured_value].freeze

    SUBJECT_HEADERS = %w[
      subject_as_recorded subfield_codes vocab
      source_authority_uri authorized_label structured_value
    ].freeze

    TITLE_HEADERS = %w[
      title_as_recorded title_as_recorded_agr
      uniform_title_as_recorded uniform_title_as_recorded_agr
      authorized_label
    ].freeze
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Recon
|
4
|
+
# Enumerates the rows of the CSV files held in +files+.
class DsCsvEnumerator < SourceEnumerator
  # Read each file in +files+ in turn, treating its first line as the
  # header row, and pass every remaining row to the caller's block.
  #
  # @yield [row] each row of each CSV file
  def each(&block)
    files.each do |path|
      CSV.foreach(path, headers: true) { |row| yield row }
    end
  end
end
|
16
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Recon
|
4
|
+
# Enumerates the records of the DS METS XML files held in +files+.
class DsMetsXmlEnumerator < SourceEnumerator
  # Hand +files+ to +process_xml+ and re-yield each object it produces.
  # NOTE(review): presumably that is one parsed METS document per file --
  # confirm against SourceEnumerator#process_xml.
  #
  # @yield [record] each object yielded by +process_xml+
  def each(&block)
    process_xml(files) { |record| yield record }
  end
end
|
14
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Recon
|
4
|
+
# Enumerates the MARC records of the MARC XML files held in +files+.
class MarcXmlEnumerator < SourceEnumerator
  # Process +files+ with namespaces removed and yield every node matched
  # by the XPath <tt>//record</tt> in each object +process_xml+ produces.
  #
  # @yield [record] each +record+ node
  def each(&block)
    process_xml(files, remove_namespaces: true) do |document|
      document.xpath('//record').each { |record| yield record }
    end
  end
end
|
15
|
+
end
|
@@ -0,0 +1,183 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# require_relative 'recon_config'
|
4
|
+
require_relative 'type/recon_type'
|
5
|
+
|
6
|
+
module Recon
|
7
|
+
|
8
|
+
##
|
9
|
+
# A class to build recon CSV rows from DS data sources.
|
10
|
+
|
11
|
+
class ReconBuilder
|
12
|
+
attr_reader :source_type
|
13
|
+
attr_reader :files
|
14
|
+
attr_reader :out_dir
|
15
|
+
|
16
|
+
# A frozen hash mapping DS data source types to their corresponding
# enumerator classes:
#
#   DS::Constants::DS_CSV   => Recon::DsCsvEnumerator
#   DS::Constants::MARC_XML => Recon::MarcXmlEnumerator
#   DS::Constants::TEI_XML  => Recon::TeiXmlEnumerator
#   DS::Constants::DS_METS  => Recon::DsMetsXmlEnumerator
#
# The hash is a lookup table only, so it is frozen to guard against
# accidental mutation.
SOURCE_TYPE_ENUMERATORS = {
  DS::Constants::DS_CSV => Recon::DsCsvEnumerator,
  DS::Constants::MARC_XML => Recon::MarcXmlEnumerator,
  DS::Constants::TEI_XML => Recon::TeiXmlEnumerator,
  DS::Constants::DS_METS => Recon::DsMetsXmlEnumerator
}.freeze
|
35
|
+
|
36
|
+
# A frozen hash mapping DS data source types to their corresponding
# extractor classes:
#
#   DS::Constants::MARC_XML => DS::Extractor::MarcXmlExtractor
#   DS::Constants::DS_CSV   => DS::Extractor::DsCsvExtractor
#   DS::Constants::DS_METS  => DS::Extractor::DsMetsXmlExtractor
#   DS::Constants::TEI_XML  => DS::Extractor::TeiXml
#
# The hash is a lookup table only, so it is frozen to guard against
# accidental mutation.
SOURCE_TYPE_EXTRACTORS = {
  DS::Constants::MARC_XML => DS::Extractor::MarcXmlExtractor,
  DS::Constants::DS_CSV => DS::Extractor::DsCsvExtractor,
  DS::Constants::DS_METS => DS::Extractor::DsMetsXmlExtractor,
  DS::Constants::TEI_XML => DS::Extractor::TeiXml
}.freeze
|
55
|
+
|
56
|
+
# Store the source type, source files and output directory for later use.
#
# @param [Symbol] source_type a valid DS data source type; e.g., DS::Constants::MARC_XML
# @param [Array] files an array of source file paths; e.g., +marc1.xml+, +marc2.xml+, etc.
# @param [String] out_dir a path to an output directory
def initialize(source_type:, files:, out_dir:)
  @source_type = source_type
  @files       = files
  @out_dir     = out_dir
end
|
64
|
+
|
65
|
+
# Build the enumerator for +source_type+ on first use, then reuse it.
#
# @return [Recon::SourceEnumerator] a source enumerator; e.g., a Recon::MarcXmlEnumerator
# @raise [RuntimeError] if +source_type+ has no entry in SOURCE_TYPE_ENUMERATORS
def enumerator
  unless @enumerator.present?
    enumerator_class = SOURCE_TYPE_ENUMERATORS[source_type]
    raise "Unknown source type: #{source_type}" unless enumerator_class

    @enumerator = enumerator_class.new(files)
  end
  @enumerator
end
|
72
|
+
|
73
|
+
# Look up the extractor for +source_type+ on first use, then reuse it.
#
# @return [Class, nil] the SOURCE_TYPE_EXTRACTORS entry for +source_type+,
#     e.g., DS::Extractor::MarcXmlExtractor; +nil+ if there is no entry
def extractor
  return @extractor if @extractor

  @extractor = SOURCE_TYPE_EXTRACTORS[source_type]
end
|
77
|
+
|
78
|
+
##
# For each distinct term extracted for the given recon set (like
# +:places+), yield the corresponding recon CSV row. A term that has
# already been seen in an earlier record is skipped.
#
# Example:
#
#     # for a ReconBuilder for a set of TEI files
#     CSV headers: true do |csv|
#       csv << Recon::Type::Places.csv_headers
#       recon_builder.each_recon(:places) do |recon|
#         csv << recon
#       end
#     end
#
# @param [Symbol] set_name a recon set name, like :places
# @yield [Hash<Symbol, String>] a block that receives each recon row
def each_recon(set_name, &block)
  seen = Set.new
  recon_type = Recon.find_recon_type(set_name)

  enumerator.each do |record|
    # +method_name+ may be one name or a list of names.
    [recon_type.method_name].flatten.each do |method_name|
      extract = method_name.to_sym
      next unless extractor.respond_to?(extract)

      extractor.send(extract, record).each do |term|
        # Set#add? returns nil when the term is already in the set.
        next unless seen.add?(term)

        yield build_recon(recon_type: recon_type, item: term)
      end
    end
  end
end
|
109
|
+
|
110
|
+
# Find a recon type configuration by name
|
111
|
+
#
|
112
|
+
# @param [String] name the name of the recon type to find
|
113
|
+
# @return [Recon::Type::ReconType, nil] the recon type configuration if found, nil otherwise
|
114
|
+
def find_recon_type(name)
  # Set names are compared as strings, so a symbol or a string works.
  wanted = name.to_s
  Recon::RECON_TYPES.find { |recon_type| recon_type.set_name == wanted }
end
|
117
|
+
|
118
|
+
# Replace delimiters in a value based on a given delimiter map.
#
# Every mapping in the map is applied, in the hash's insertion order,
# each one to the result of the previous replacement. (Previously each
# replacement restarted from the original value, so only the last
# mapping took effect.)
#
# @param value [String, nil] the value to be processed
# @param delimiter_map [Hash, nil] a hash of old and new delimiters,
#   e.g., for <tt>{ "|" => ";" }</tt> all +|+s will be replaced with +;+s
# @return [String, Object] +value+ itself, untouched, when
#   +delimiter_map+ is nil or empty; otherwise +value.to_s+ with all
#   delimiters replaced
def fix_delimiters(value, delimiter_map = {})
  return value if delimiter_map.nil? || delimiter_map.empty?

  delimiter_map.reduce(value.to_s) do |text, (old, new)|
    text.gsub(old, new)
  end
end
|
132
|
+
|
133
|
+
# Builds an array of recon CSV rows for each term in the given terms array
|
134
|
+
# using the given recon type configuration.
|
135
|
+
#
|
136
|
+
# @param terms [Array<DS::Extractor::BaseTerm>] an array of terms to build recon rows for
|
137
|
+
# @param recon_type [Recon::Type::ReconType] a recon type configuration
|
138
|
+
# @return [Array<Hash<Symbol,String>>] an array of recon CSV rows
|
139
|
+
def build_all_recons(terms, recon_type)
  # One recon row per term, in the order the terms were given.
  terms.each_with_object([]) do |term, rows|
    rows << build_recon(item: term, recon_type: recon_type)
  end
end
|
142
|
+
|
143
|
+
# Build a single recon CSV row; e.g.,
|
144
|
+
#
|
145
|
+
# { :language_as_recorded => "Arabic",
|
146
|
+
# :language_code => "",
|
147
|
+
# :authorized_label => "Arabic",
|
148
|
+
# :structured_value => "Q13955" }
|
149
|
+
#
|
150
|
+
# @param [DS::Extractor::BaseTerm] item a term like a DS::Extractor::Place
|
151
|
+
# @param [Recon::Type::ReconType] recon_type a recon type configuration like Recon::Type::Places
|
152
|
+
# @return [Hash<Symbol,String>] a recon CSV row
|
153
|
+
def build_recon(item:, recon_type:)
  row = item.to_h
  keys = recon_type.get_key_values(row)

  # Fill each lookup column with the recon value found for the term's
  # key values, after delimiter replacement; the block returns +row+.
  recon_type.lookup_columns.each_with_object(row) do |column, acc|
    found = Recon.lookup_single(recon_type.set_name, key_values: keys, column: column)
    acc[column] = fix_delimiters(found, recon_type.delimiter_map)
  end
end
|
162
|
+
|
163
|
+
private
|
164
|
+
|
165
|
+
##
|
166
|
+
# Transform the recon hash
|
167
|
+
#
|
168
|
+
# Currently, we just replace the +as_recorded+ header with the type-specific
|
169
|
+
# +ReconType#as_recorded_column+, e.g., +language_as_recorded+
|
170
|
+
#
|
171
|
+
#
|
172
|
+
# @param recon_hash [Hash<Symbol,Object>] a hash of the CSV row
|
173
|
+
# @param recon_type [Recon::Type::ReconType] a ReconType like, ReconPlaces
|
174
|
+
# @return [Hash<Symbol,Object>]
|
175
|
+
def prep_row(recon_hash:, recon_type:)
  # Work on a copy so the caller's hash is left untouched.
  recon_hash.dup.tap do |row|
    recorded = row.delete(:as_recorded)
    row[recon_type.as_recorded_column] = recorded
  end
end
|
181
|
+
|
182
|
+
end
|
183
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'git'
|
2
|
+
require 'logger'
|
3
|
+
|
4
|
+
module Recon
|
5
|
+
# Keeps a local git clone of the recon data repository up to date, using
# the +Settings.recon+ configuration.
module ReconData
  class << self
    # Clone the configured repository into +local_dir+ if it is not there
    # yet, then fetch, check out and pull the configured branch (+main+
    # when none is configured). The result of each git call is printed.
    #
    # A Git::GitExecuteError raised by fetch, checkout or pull is logged
    # as a warning and not re-raised; the backtrace goes to STDERR when
    # the +DS_VERBOSE+ environment variable is set. Errors from the clone
    # or from opening the repository are not rescued here.
    def update!
      settings = Settings.recon
      local_name = settings.git_local_name
      remote_url = settings.git_repo
      branch_name = settings.git_branch || 'main'
      log = DS.logger

      Dir.chdir(local_dir) do
        unless File.exist?(local_name)
          puts Git.clone(remote_url, local_name, branch: branch_name, remote: 'origin', log: log)
        end

        repo = Git.open(local_name, log: log)
        begin
          puts repo.fetch('origin')
          puts repo.checkout(branch_name)
          puts repo.pull('origin', branch_name)
        rescue Git::GitExecuteError => e
          log.warn { "Error executing git command" }
          log.warn { e.message }
          STDERR.puts e.backtrace if ENV['DS_VERBOSE']
        end
      end
    end

    # @return [String] the configured +Settings.recon.local_dir+, the
    #   directory +update!+ changes into before running git
    def local_dir
      Settings.recon.local_dir
    end

    # @return [String] +Settings.recon.git_local_name+ joined onto +DS.root+
    def repo_dir
      File.join(DS.root, Settings.recon.git_local_name)
    end
  end
end
|
37
|
+
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Recon
|
4
|
+
# Writes the recon CSV files for a set of source files and collects the
# validation errors found while writing them.
class ReconManager
  attr_reader :out_dir, :source_type, :files

  # Initialize the ReconManager.
  #
  # @param source_type [Symbol] a valid DS data source type; e.g., DS::Constants::MARC_XML
  # @param out_dir [String] the output directory path
  # @param files [Array<String>] an array of source file paths; e.g., +marc1.xml+, +marc2.xml+, etc.
  # @return [void]
  def initialize(source_type:, out_dir:, files:)
    @source_type = source_type
    @out_dir = out_dir
    @files = files
    # Validation messages, keyed by recon set name.
    @errors = {}
  end

  # Write one CSV file for every type in Recon::RECON_TYPES.
  #
  # @return [Array<String>] the list of output files
  # @raise [DSError] as soon as any single file has validation errors
  def write_all_csvs
    Recon::RECON_TYPES.map { |recon_type| write_csv(recon_type) }
  end

  # Write a CSV file for a specific recon type.
  #
  # The file is written in full, with every row validated along the way,
  # before any validation errors are raised.
  #
  # @param recon_type [Recon::Type::ReconType] the type of reconciliation data
  # @return [String] the path to the output CSV file
  # @raise [DSError] if any validation errors are recorded for +recon_type+
  def write_csv(recon_type)
    outfile = File.join(out_dir, "#{recon_type.set_name}.csv")
    CSV.open(outfile, 'w+', headers: true) do |csv|
      row_num = 0
      csv << recon_type.recon_csv_headers
      recon_builder.each_recon(recon_type.set_name) do |recon|
        row_num += 1
        errors = Recon.validate_row(recon_type, recon, row_num: row_num)
        add_errors(recon_type, errors) unless errors.blank?
        csv << recon
      end
    end

    if has_errors?(recon_type)
      raise DSError, "Error writing #{outfile}:\n#{errors_for_type(recon_type).join("\n")}"
    end

    outfile
  end

  # Returns the Recon::ReconBuilder for this manager's output directory,
  # source type and files; it is created on first use and then reused.
  #
  # @return [Recon::ReconBuilder] the builder instance
  def recon_builder
    @recon_builder ||= Recon::ReconBuilder.new(
      out_dir: out_dir, source_type: source_type, files: files
    )
  end

  # Adds errors to the specified recon type.
  #
  # @param recon_type [ReconType] the recon type to add errors to
  # @param messages [Array<String>] the errors to add
  # @return [void]
  def add_errors(recon_type, messages)
    @errors[recon_type.set_name] ||= []
    @errors[recon_type.set_name] += messages
    nil
  end

  # Returns true if errors exist for the specified recon type.
  #
  # @param recon_type [ReconType] the recon type
  # @return [Boolean] true if errors exist
  def has_errors?(recon_type)
    !errors_for_type(recon_type).empty?
  end

  # Returns the list of errors for the specified recon type.
  #
  # @param recon_type [ReconType] the recon type
  # @return [Array<String>] the list of errors; empty when none have
  #   been recorded (previously +nil+ in that case)
  def errors_for_type(recon_type)
    @errors.fetch(recon_type.set_name, [])
  end
end
|
92
|
+
end
|