ds-convert 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +294 -0
  3. data/Rakefile +12 -0
  4. data/config/settings.yml +150 -0
  5. data/exe/ds-convert +149 -0
  6. data/exe/ds-recon +275 -0
  7. data/exe/ds-validate-csv +40 -0
  8. data/exe/marc-mrc-to-xml.rb +80 -0
  9. data/lib/ds/cli.rb +102 -0
  10. data/lib/ds/constants.rb +166 -0
  11. data/lib/ds/converter/converter.rb +124 -0
  12. data/lib/ds/converter/writer.rb +50 -0
  13. data/lib/ds/converter.rb +7 -0
  14. data/lib/ds/csv_util.rb +43 -0
  15. data/lib/ds/data/berkeley-arks.txt +4000 -0
  16. data/lib/ds/data/getty-aat-centuries.csv +71 -0
  17. data/lib/ds/data/iiif_manifests.csv +122 -0
  18. data/lib/ds/data/legacy-iiif-manifests.csv +77 -0
  19. data/lib/ds/ds_error.rb +1 -0
  20. data/lib/ds/extractor/base_record_locator.rb +24 -0
  21. data/lib/ds/extractor/base_term.rb +79 -0
  22. data/lib/ds/extractor/csv_record_locator.rb +13 -0
  23. data/lib/ds/extractor/ds_csv_extractor.rb +695 -0
  24. data/lib/ds/extractor/ds_mets_xml_extractor.rb +1114 -0
  25. data/lib/ds/extractor/genre.rb +45 -0
  26. data/lib/ds/extractor/language.rb +31 -0
  27. data/lib/ds/extractor/marc_xml_extractor.rb +1172 -0
  28. data/lib/ds/extractor/material.rb +12 -0
  29. data/lib/ds/extractor/name.rb +50 -0
  30. data/lib/ds/extractor/place.rb +11 -0
  31. data/lib/ds/extractor/subject.rb +58 -0
  32. data/lib/ds/extractor/tei_xml_extractor.rb +687 -0
  33. data/lib/ds/extractor/title.rb +52 -0
  34. data/lib/ds/extractor/xml_record_locator.rb +38 -0
  35. data/lib/ds/extractor.rb +24 -0
  36. data/lib/ds/institutions.rb +55 -0
  37. data/lib/ds/manifest/base_id_validator.rb +76 -0
  38. data/lib/ds/manifest/constants.rb +67 -0
  39. data/lib/ds/manifest/ds_csv_id_validator.rb +15 -0
  40. data/lib/ds/manifest/entry.rb +133 -0
  41. data/lib/ds/manifest/manifest.rb +74 -0
  42. data/lib/ds/manifest/manifest_validator.rb +256 -0
  43. data/lib/ds/manifest/simple_xml_id_validator.rb +42 -0
  44. data/lib/ds/manifest.rb +30 -0
  45. data/lib/ds/mapper/base_mapper.rb +221 -0
  46. data/lib/ds/mapper/ds_csv_mapper.rb +77 -0
  47. data/lib/ds/mapper/ds_mets_mapper.rb +85 -0
  48. data/lib/ds/mapper/marc_mapper.rb +87 -0
  49. data/lib/ds/mapper/tei_xml_mapper.rb +79 -0
  50. data/lib/ds/mapper.rb +13 -0
  51. data/lib/ds/recon/constants.rb +56 -0
  52. data/lib/ds/recon/ds_csv_enumerator.rb +16 -0
  53. data/lib/ds/recon/ds_mets_xml_enumerator.rb +14 -0
  54. data/lib/ds/recon/marc_xml_enumerator.rb +15 -0
  55. data/lib/ds/recon/recon_builder.rb +183 -0
  56. data/lib/ds/recon/recon_data.rb +37 -0
  57. data/lib/ds/recon/recon_manager.rb +92 -0
  58. data/lib/ds/recon/source_enumerator.rb +21 -0
  59. data/lib/ds/recon/tei_xml_enumerator.rb +14 -0
  60. data/lib/ds/recon/type/all_subjects.rb +18 -0
  61. data/lib/ds/recon/type/genres.rb +50 -0
  62. data/lib/ds/recon/type/languages.rb +38 -0
  63. data/lib/ds/recon/type/materials.rb +40 -0
  64. data/lib/ds/recon/type/named_subjects.rb +20 -0
  65. data/lib/ds/recon/type/names.rb +65 -0
  66. data/lib/ds/recon/type/places.rb +40 -0
  67. data/lib/ds/recon/type/recon_type.rb +136 -0
  68. data/lib/ds/recon/type/splits.rb +34 -0
  69. data/lib/ds/recon/type/subjects.rb +65 -0
  70. data/lib/ds/recon/type/titles.rb +38 -0
  71. data/lib/ds/recon/url_lookup.rb +52 -0
  72. data/lib/ds/recon.rb +292 -0
  73. data/lib/ds/source/base_source.rb +32 -0
  74. data/lib/ds/source/ds_csv.rb +18 -0
  75. data/lib/ds/source/ds_mets_xml.rb +20 -0
  76. data/lib/ds/source/marc_xml.rb +22 -0
  77. data/lib/ds/source/source_cache.rb +69 -0
  78. data/lib/ds/source/tei_xml.rb +22 -0
  79. data/lib/ds/source.rb +20 -0
  80. data/lib/ds/util/cache.rb +111 -0
  81. data/lib/ds/util/csv_validator.rb +209 -0
  82. data/lib/ds/util/csv_writer.rb +42 -0
  83. data/lib/ds/util/strings.rb +194 -0
  84. data/lib/ds/util.rb +37 -0
  85. data/lib/ds/version.rb +5 -0
  86. data/lib/ds.rb +237 -0
  87. metadata +246 -0
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DS
4
+ module Mapper
5
+ class DSMetsMapper < DS::Mapper::BaseMapper
6
+ attr_reader :iiif_lookup
7
+ attr_reader :ia_url_lookup
8
+
9
+
10
# Set up a mapper whose source loader is a DS::Source::DSMetsXML.
#
# @param source_dir [String] directory the manifest's source files are
#   read from (joined with each entry's filename in #extract_record)
# @param timestamp [Object] value written to +data_processed_at+ by
#   #map_record
def initialize(source_dir:, timestamp:)
  mets_source = DS::Source::DSMetsXML.new
  super(source_dir: source_dir, timestamp: timestamp, source: mets_source)
end
17
+
18
# Load the entry's source file and find the entry's record in it.
#
# @param entry [DS::Manifest::Entry] entry instance for a manifest row
# @return [Object] the value returned by the record locator
# @raise [RuntimeError] when the locator result is not +present?+; the
#   message includes the locator's collected errors
def extract_record(entry)
  record_finder = DS::Extractor::XmlRecordLocator.new
  document = source.load_source(File.join(source_dir, entry.filename))
  located = record_finder.locate_record(
    document,
    entry.institutional_id,
    entry.institutional_id_location_in_source
  )

  unless located.present?
    raise "Unable to locate record for #{entry.institutional_id} (errors: #{record_finder.errors.join(', ')})"
  end

  located
end
28
+
29
##
# Build the import row for one manifest entry.
#
# Values come from the manifest +entry+, from the record returned by
# #extract_record (read through DS::Extractor::DsMetsXmlExtractor), and
# from +build_term_maps+, whose result is merged in last.
#
# @param [DS::Manifest::Entry] entry entry instance for a manifest row
# @return [Hash] the mapped record
def map_record(entry)
  record    = extract_record(entry)
  extractor = DS::Extractor::DsMetsXmlExtractor

  # Record-derived values; list-valued results are pipe-joined.
  convention    = extractor.extract_cataloging_convention(record)
  date_recorded = extractor.extract_production_date_as_recorded(record).join('|')
  date_ranges   = extractor.extract_date_range(record, range_sep: '^').join('|')
  centuries     = DS.transform_dates_to_centuries(date_ranges)
  centuries_aat = DS.transform_centuries_to_aat(centuries)
  scribe_dated  = extractor.dated_by_scribe?(record)
  description   = extractor.extract_physical_description(record).join('|')
  notes         = extractor.extract_notes(record).join('|')
  thanks        = extractor.extract_acknowledgments(record).join('|')

  row = {
    ds_id:                              entry.ds_id,
    date_added:                         nil,
    date_last_updated:                  nil,
    dated:                              scribe_dated,
    cataloging_convention:              convention,
    source_type:                        entry.source_type,
    holding_institution_ds_qid:         entry.institution_ds_qid,
    holding_institution_as_recorded:    entry.institution_wikidata_label,
    holding_institution_id_number:      entry.institutional_id,
    holding_institution_shelfmark:      entry.call_number,
    link_to_holding_institution_record: entry.link_to_institutional_record,
    iiif_manifest:                      entry.iiif_manifest_url,
    production_date_as_recorded:        date_recorded,
    production_date:                    date_ranges,
    century:                            centuries,
    century_aat:                        centuries_aat,
    physical_description:               description,
    note:                               notes,
    acknowledgments:                    thanks,
    data_processed_at:                  timestamp,
    data_source_modified:               entry.record_last_updated,
    source_file:                        entry.filename
  }

  row.update(build_term_maps(extractor, record))
end
83
+ end
84
+ end
85
+ end
@@ -0,0 +1,87 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DS
4
+ module Mapper
5
+
6
+ class MarcMapper < DS::Mapper::BaseMapper
7
+
8
# Set up a mapper whose source loader is a DS::Source::MarcXML.
#
# @param source_dir [String] directory the manifest's source files are
#   read from (joined with each entry's filename in #extract_record)
# @param timestamp [Object] value written to +data_processed_at+ by
#   #map_record
def initialize(source_dir:, timestamp:)
  marc_source = DS::Source::MarcXML.new
  super(source_dir: source_dir, timestamp: timestamp, source: marc_source)
end
15
##
# Load the entry's source file and return the first record the locator
# finds for the entry.
#
# The manifest's +institutional_id_location_in_source+ is treated as an
# XPath template: every +ID_PLACEHOLDER+ in it is replaced with the
# entry's institutional id before the lookup.
#
# @param [DS::Manifest::Entry] entry +entry+ representing one
#   row in a manifest
# @return [Object] the first located record
# @raise [RuntimeError] when no record is found; the message includes
#   the locator's collected errors
def extract_record(entry)
  finder = DS::Extractor::XmlRecordLocator.new(
    namespaces: DS::Constants::XML_NAMESPACES
  )

  document = source.load_source(File.join(source_dir, entry.filename))
  id_xpath = entry.institutional_id_location_in_source.gsub('ID_PLACEHOLDER', entry.institutional_id)
  match    = finder.locate_record(document, entry.institutional_id, id_xpath).first

  if match.present?
    match
  else
    raise "Unable to locate record for #{entry.institutional_id} (errors: #{finder.errors.join(', ')})"
  end
end
31
+
32
##
# Build the import row for one manifest entry.
#
# Values come from the manifest +entry+, from the record returned by
# #extract_record (read through DS::Extractor::MarcXmlExtractor), and
# from +build_term_maps+, whose result is merged in last. The source
# type is fixed to +marc-xml+; the added/updated dates and the
# acknowledgments are emitted as empty strings.
#
# @param [DS::Manifest::Entry] entry entry instance for a manifest row
# @return [Hash] the mapped record
def map_record(entry)
  record    = extract_record(entry)
  extractor = DS::Extractor::MarcXmlExtractor

  # Record-derived values; list-valued results are pipe-joined.
  convention    = extractor.extract_cataloging_convention(record)
  date_recorded = extractor.extract_production_date_as_recorded(record).join('|')
  date_ranges   = extractor.extract_date_range(record, range_sep: '^').join('|')
  centuries     = DS.transform_dates_to_centuries(date_ranges)
  centuries_aat = DS.transform_centuries_to_aat(centuries)
  description   = extractor.extract_physical_description(record).join('|')
  notes         = extractor.extract_notes(record).join('|')

  row = {
    ds_id:                              entry.ds_id,
    date_added:                         '',
    date_last_updated:                  '',
    dated:                              entry.dated?,
    source_type:                        'marc-xml',
    cataloging_convention:              convention,
    holding_institution_ds_qid:         entry.institution_ds_qid,
    holding_institution_as_recorded:    entry.institution_wikidata_label,
    holding_institution_id_number:      entry.institutional_id,
    holding_institution_shelfmark:      entry.call_number,
    link_to_holding_institution_record: entry.link_to_institutional_record,
    iiif_manifest:                      entry.iiif_manifest_url,
    production_date:                    date_ranges,
    century:                            centuries,
    century_aat:                        centuries_aat,
    production_date_as_recorded:        date_recorded,
    physical_description:               description,
    note:                               notes,
    data_processed_at:                  timestamp,
    data_source_modified:               entry.record_last_updated,
    source_file:                        entry.filename,
    acknowledgments:                    ''
  }

  row.update(build_term_maps(extractor, record))
end
85
+ end
86
+ end
87
+ end
@@ -0,0 +1,79 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DS
4
+ module Mapper
5
+ class TeiXmlMapper < BaseMapper
6
+
7
# Set up a mapper whose source loader is a DS::Source::TeiXML.
#
# @param source_dir [String] directory the manifest's source files are
#   read from (joined with each entry's filename in #extract_record)
# @param timestamp [Object] value written to +data_processed_at+ by
#   #map_record
def initialize(source_dir:, timestamp:)
  tei_source = DS::Source::TeiXML.new
  super(source_dir: source_dir, timestamp: timestamp, source: tei_source)
end
14
+
15
# Load the entry's source file and find the entry's record in it.
#
# @param entry [DS::Manifest::Entry] entry instance for a manifest row
# @return [Object] the value returned by the record locator
# @raise [RuntimeError] when the locator result is not +present?+; the
#   message includes the locator's collected errors
def extract_record(entry)
  locator = DS::Extractor::XmlRecordLocator.new
  source_file_path = File.join(source_dir, entry.filename)
  xml = source.load_source(source_file_path)
  record = locator.locate_record(xml, entry.institutional_id, entry.institutional_id_location_in_source)
  return record if record.present?

  # Report the errors of the locator used above. The message previously
  # referenced an undefined +record_locator+, so a missing record surfaced
  # as a NameError instead of this error.
  raise "Unable to locate record for #{entry.institutional_id} (errors: #{locator.errors.join(', ')})"
end
24
+
25
# Build the import row for one manifest entry.
#
# Values come from the manifest +entry+, from the record returned by
# #extract_record (read through DS::Extractor::TeiXml), and from
# +build_term_maps+, whose result is merged in last. The source type is
# fixed to +tei-xml+ and the added/updated dates are emitted as empty
# strings.
#
# @param entry [DS::Manifest::Entry] entry instance for a manifest row
# @return [Hash] the mapped record
def map_record(entry)
  record    = extract_record(entry)
  extractor = DS::Extractor::TeiXml

  # Record-derived values; list-valued results are pipe-joined.
  convention    = extractor.extract_cataloging_convention(record)
  date_recorded = extractor.extract_production_date_as_recorded(record, range_sep: '-').join('|')
  date_ranges   = extractor.extract_date_range(record, range_sep: '^').join('|')
  centuries     = DS.transform_dates_to_centuries(date_ranges)
  centuries_aat = DS.transform_centuries_to_aat(centuries)
  thanks        = extractor.extract_acknowledgments(record).join('|')
  description   = extractor.extract_physical_description(record).join('|')
  notes         = extractor.extract_notes(record).join('|')

  # TODO: BiblioPhilly MSS have keywords (not subjects, genre); include them?

  row = {
    ds_id:                              entry.ds_id,
    date_added:                         '',
    date_last_updated:                  '',
    dated:                              entry.dated?,
    cataloging_convention:              convention,
    source_type:                        'tei-xml',
    holding_institution_ds_qid:         entry.institution_ds_qid,
    holding_institution_as_recorded:    entry.institution_wikidata_label,
    holding_institution_id_number:      entry.institutional_id,
    holding_institution_shelfmark:      entry.call_number,
    link_to_holding_institution_record: entry.link_to_institutional_record,
    iiif_manifest:                      entry.iiif_manifest_url,
    production_date_as_recorded:        date_recorded,
    production_date:                    date_ranges,
    century:                            centuries,
    century_aat:                        centuries_aat,
    physical_description:               description,
    acknowledgments:                    thanks,
    note:                               notes,
    data_processed_at:                  timestamp,
    data_source_modified:               entry.record_last_updated,
    source_file:                        entry.filename
  }

  row.update(build_term_maps(extractor, record))
end
77
+ end
78
+ end
79
+ end
data/lib/ds/mapper.rb ADDED
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+ require_relative 'mapper/base_mapper'
3
+ require_relative 'mapper/ds_csv_mapper'
4
+ require_relative 'mapper/ds_mets_mapper'
5
+ require_relative 'mapper/marc_mapper'
6
+ require_relative 'mapper/tei_xml_mapper'
7
+
8
+ module DS
9
+ # The DS mapper namespace contains classes and methods for mapping
10
+ # DS data records to DS import CSVs.
11
+ module Mapper
12
+ end
13
+ end
@@ -0,0 +1,56 @@
1
+ # frozen_string_literal: true
2
+
3
module Recon
  # Frozen lists of column names, one list per recon type (presumably the
  # header rows of the corresponding recon CSVs).
  module Constants
    GENRE_HEADERS = %w[
      genre_as_recorded vocab source_authority_uri authorized_label structured_value
    ].freeze

    LANGUAGES_HEADERS = %w[
      language_as_recorded language_code authorized_label structured_value
    ].freeze

    MATERIALS_HEADERS = %w[
      material_as_recorded
      authorized_label
      structured_value
    ].freeze

    NAMES_HEADERS = %w[
      name_as_recorded role name_agr source_authority_uri
      instance_of authorized_label structured_value
    ].freeze

    PLACES_HEADERS = %w[
      place_as_recorded
      authorized_label
      structured_value
    ].freeze

    SUBJECT_HEADERS = %w[
      subject_as_recorded subfield_codes vocab
      source_authority_uri authorized_label structured_value
    ].freeze

    TITLE_HEADERS = %w[
      title_as_recorded title_as_recorded_agr
      uniform_title_as_recorded uniform_title_as_recorded_agr
      authorized_label
    ].freeze
  end
end
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Recon
4
+ class DsCsvEnumerator < SourceEnumerator
5
+
6
# Read every file in +files+ as a CSV with a header row and pass each
# parsed row to the caller's block, file by file.
#
# @yield [row] yields each row in the CSV file
def each(&block)
  files.each do |csv_path|
    CSV.foreach(csv_path, headers: true) { |csv_row| yield csv_row }
  end
end
15
+ end
16
+ end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Recon
4
+ class DsMetsXmlEnumerator < SourceEnumerator
5
+
6
# Hand +files+ to +process_xml+ and pass everything it yields on to the
# caller's block.
#
# @yield [row] yields the parsed METS XML record
def each(&block)
  process_xml(files) { |mets_record| yield mets_record }
end
13
+ end
14
+ end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Recon
4
+ class MarcXmlEnumerator < SourceEnumerator
5
+
6
# Hand +files+ to +process_xml+ with namespaces removed and, for each
# document it yields, pass every node matching +//record+ to the
# caller's block.
#
# @yield [record] yields each +//record+ node
def each(&block)
  process_xml(files, remove_namespaces: true) do |document|
    marc_records = document.xpath('//record')
    marc_records.each { |marc_record| yield marc_record }
  end
end
13
+
14
+ end
15
+ end
@@ -0,0 +1,183 @@
1
+ # frozen_string_literal: true
2
+
3
+ # require_relative 'recon_config'
4
+ require_relative 'type/recon_type'
5
+
6
+ module Recon
7
+
8
+ ##
9
+ # A class to build recon CSV rows from DS data sources.
10
+
11
+ class ReconBuilder
12
+ attr_reader :source_type
13
+ attr_reader :files
14
+ attr_reader :out_dir
15
+
16
# A hash mapping DS data source types to their corresponding enumerator
# classes. Frozen so the lookup table cannot be changed at runtime.
#
#   DS::Constants::DS_CSV   => Recon::DsCsvEnumerator
#   DS::Constants::MARC_XML => Recon::MarcXmlEnumerator
#   DS::Constants::TEI_XML  => Recon::TeiXmlEnumerator
#   DS::Constants::DS_METS  => Recon::DsMetsXmlEnumerator
SOURCE_TYPE_ENUMERATORS = {
  DS::Constants::DS_CSV   => Recon::DsCsvEnumerator,
  DS::Constants::MARC_XML => Recon::MarcXmlEnumerator,
  DS::Constants::TEI_XML  => Recon::TeiXmlEnumerator,
  DS::Constants::DS_METS  => Recon::DsMetsXmlEnumerator
}.freeze

# A hash mapping DS data source types to their corresponding extractor
# classes. Frozen so the lookup table cannot be changed at runtime.
#
#   DS::Constants::MARC_XML => DS::Extractor::MarcXmlExtractor
#   DS::Constants::DS_CSV   => DS::Extractor::DsCsvExtractor
#   DS::Constants::DS_METS  => DS::Extractor::DsMetsXmlExtractor
#   DS::Constants::TEI_XML  => DS::Extractor::TeiXml
SOURCE_TYPE_EXTRACTORS = {
  DS::Constants::MARC_XML => DS::Extractor::MarcXmlExtractor,
  DS::Constants::DS_CSV   => DS::Extractor::DsCsvExtractor,
  DS::Constants::DS_METS  => DS::Extractor::DsMetsXmlExtractor,
  DS::Constants::TEI_XML  => DS::Extractor::TeiXml
}.freeze
55
+
56
# Store the builder's inputs; nothing is read or written here.
#
# @param [Symbol] source_type a valid DS data source type; e.g., DS::Constants::MARC_XML
# @param [Array] files an array of source file paths; e.g., +marc1.xml+, +marc2.xml+, etc.
# @param [String] out_dir a path to an output directory
def initialize(source_type:, files:, out_dir:)
  @source_type, @files, @out_dir = source_type, files, out_dir
end
64
+
65
# Return the source enumerator for +source_type+, building it from
# +files+ on first use and reusing it afterwards.
#
# @return [Recon::SourceEnumerator] a source enumerator; e.g., Recon::MarcXmlEnumerator
# @raise [RuntimeError] if +source_type+ has no entry in SOURCE_TYPE_ENUMERATORS
def enumerator
  unless @enumerator.present?
    enumerator_class = SOURCE_TYPE_ENUMERATORS[source_type]
    raise "Unknown source type: #{source_type}" unless enumerator_class

    @enumerator = enumerator_class.new(files)
  end

  @enumerator
end
72
+
73
# Return the extractor registered for +source_type+ in
# SOURCE_TYPE_EXTRACTORS, looked up on first use. Unlike #enumerator
# this does not raise for an unknown source type; the lookup is then nil.
#
# @return [DS::Extractor::MarcXmlExtractor, DS::Extractor::DsCsvExtractor, DS::Extractor::DsMetsXmlExtractor, DS::Extractor::TeiXml, nil]
def extractor
  return @extractor if @extractor

  @extractor = SOURCE_TYPE_EXTRACTORS[source_type]
end
77
+
78
##
# Yield one recon CSV row for every distinct term of the given set
# (like :places) found in the source records.
#
# For each record from #enumerator, every extractor method named by the
# recon type's +method_name+ (one name or a list) is called, provided
# the extractor responds to it. A term already seen is skipped, so each
# distinct term is yielded once.
#
# Example:
#
#     # for a ReconBuilder for set of TEI files
#     CSV headers: true do |csv|
#       csv << Recon::Type::Places.csv_headers
#       recon_builder.each_recon(:places) do |recon|
#         csv << recon
#       end
#     end
#
# @param [Symbol] set_name a recon set name, like :places
# @yield [Hash<Symbol, String>] a block that yields recon rows
def each_recon(set_name, &block)
  seen_terms = Set.new
  recon_type = Recon.find_recon_type(set_name)

  enumerator.each do |record|
    [recon_type.method_name].flatten.each do |extract_method|
      next unless extractor.respond_to?(extract_method.to_sym)

      extractor.send(extract_method.to_sym, record).each do |term|
        # Set#add? returns nil when the term is already in the set.
        next unless seen_terms.add?(term)

        yield build_recon(recon_type: recon_type, item: term)
      end
    end
  end
end
109
+
110
# Look up a recon type configuration in Recon::RECON_TYPES by set name.
#
# @param [String, Symbol] name the name of the recon type to find;
#   compared as a string against each type's +set_name+
# @return [Recon::Type::ReconType, nil] the recon type configuration if found, nil otherwise
def find_recon_type(name)
  wanted = name.to_s
  Recon::RECON_TYPES.find { |recon_type| recon_type.set_name == wanted }
end
117
+
118
# Replace delimiters in a value according to a delimiter map.
#
# Every mapping is applied, in the map's order, to the result of the
# previous one. (Previously each replacement restarted from the original
# value, so with several mappings only the last one took effect.)
#
# @param value [String] the value to be processed; converted with +to_s+
#   when replacements are made
# @param delimiter_map [Hash] a hash containing the old and new
#   delimiters, e.g., for <tt>{ "|" => ";" }</tt> all +|+s will be
#   replaced with +;+s
# @return [String] the processed value with delimiters replaced; +value+
#   itself, unchanged, when +delimiter_map+ is blank
def fix_delimiters(value, delimiter_map = {})
  return value if delimiter_map.blank?

  delimiter_map.reduce(value.to_s) do |text, (old_delim, new_delim)|
    text.gsub(old_delim, new_delim)
  end
end
132
+
133
# Build one recon CSV row per term by calling #build_recon on each term
# with the given recon type configuration.
#
# @param terms [Array<DS::Extractor::BaseTerm>] an array of terms to build recon rows for
# @param recon_type [Recon::Type::ReconType] a recon type configuration
# @return [Array<Hash<Symbol,String>>] an array of recon CSV rows
def build_all_recons(terms, recon_type)
  terms.map do |term|
    build_recon(item: term, recon_type: recon_type)
  end
end
142
+
143
# Build a single recon CSV row; e.g.,
#
#     { :language_as_recorded => "Arabic",
#       :language_code        => "",
#       :authorized_label     => "Arabic",
#       :structured_value     => "Q13955" }
#
# The row starts as +item.to_h+. For every column in the recon type's
# +lookup_columns+, the value from Recon.lookup_single (keyed by the
# recon type's key values for the row) is passed through
# #fix_delimiters and stored under that column.
#
# @param [DS::Extractor::BaseTerm] item a term like a DS::Extractor::Place
# @param [Recon::Type::ReconType] recon_type a recon type configuration like Recon::Type::Places
# @return [Hash<Symbol,String>] a recon CSV row
def build_recon(item:, recon_type:)
  row = item.to_h
  lookup_keys = recon_type.get_key_values(row)

  recon_type.lookup_columns.each_with_object(row) do |column, recon_row|
    looked_up = Recon.lookup_single(recon_type.set_name, key_values: lookup_keys, column: column)
    recon_row[column] = fix_delimiters(looked_up, recon_type.delimiter_map)
  end
end
162
+
163
+ private
164
+
165
##
# Transform the recon hash
#
# Returns a copy of +recon_hash+ in which the value stored under
# +:as_recorded+ has been moved to the key given by the type-specific
# +ReconType#as_recorded_column+, e.g., +language_as_recorded+. The
# input hash is not modified.
#
# @param recon_hash [Hash<Symbol,Object>] a hash of the CSV row
# @param recon_type [Recon::Type::ReconType] a ReconType like, ReconPlaces
# @return [Hash<Symbol,Object>]
def prep_row(recon_hash:, recon_type:)
  recon_hash.dup.tap do |copy|
    copy[recon_type.as_recorded_column] = copy.delete(:as_recorded)
  end
end
181
+
182
+ end
183
+ end
@@ -0,0 +1,37 @@
1
+ require 'git'
2
+ require 'logger'
3
+
4
+ module Recon
5
+ module ReconData
6
# Bring the local checkout of the recon data repository up to date.
#
# Working inside +local_dir+: clones the configured repository when the
# checkout directory does not exist yet, then fetches from +origin+,
# checks out the configured branch (default +main+) and pulls it. The
# result of each git call is printed. A Git::GitExecuteError during
# fetch/checkout/pull is logged as a warning rather than raised; its
# backtrace goes to STDERR only when +DS_VERBOSE+ is set.
def self.update!
  recon_settings = Settings.recon
  checkout_name  = recon_settings.git_local_name
  remote_url     = recon_settings.git_repo
  branch_name    = recon_settings.git_branch || 'main'
  log            = DS.logger

  Dir.chdir(local_dir) do
    unless File.exist?(checkout_name)
      puts Git.clone(remote_url, checkout_name, branch: branch_name, remote: 'origin', log: log)
    end

    repo = Git.open(checkout_name, log: log)
    begin
      puts repo.fetch('origin')
      puts repo.checkout(branch_name)
      puts repo.pull('origin', branch_name)
    rescue Git::GitExecuteError => git_error
      log.warn { "Error executing git command" }
      log.warn { git_error.message }
      STDERR.puts git_error.backtrace if ENV['DS_VERBOSE']
    end
  end
end
28
+
29
# @return [Object] the +local_dir+ value from the recon settings; the
#   directory #update! changes into before running git
def self.local_dir
  recon_settings = Settings.recon
  recon_settings.local_dir
end
32
# @return [String] the configured git checkout name joined onto DS.root
def self.repo_dir
  checkout_name = Settings.recon.git_local_name
  File.join(DS.root, checkout_name)
end
35
+
36
+ end
37
+ end
@@ -0,0 +1,92 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Recon
4
+ class ReconManager
5
+
6
+ attr_reader :out_dir
7
+ attr_reader :source_type
8
+ attr_reader :files
9
+
10
# Initialize the ReconManager with its inputs and an empty error table
# (errors are collected per recon set name by #add_errors).
#
# @param source_type [Symbol] a valid DS data source type; e.g., DS::Constants::MARC_XML
# @param out_dir [String] the output directory path
# @param files [Array<String>] an array of source file paths; e.g., +marc1.xml+, +marc2.xml+, etc.
# @return [void]
def initialize(source_type:, out_dir:, files:)
  @source_type, @out_dir, @files = source_type, out_dir, files
  @errors = {}
end
22
+
23
# Write one recon CSV per entry in Recon::RECON_TYPES, in that order.
#
# @return [Array<String>] the list of output files
def write_all_csvs
  Recon::RECON_TYPES.map { |recon_type| write_csv(recon_type) }
end
33
+
34
# Write a CSV file for a specific recon type.
#
# The file is named after the recon type's +set_name+ inside +out_dir+.
# The type's headers are written first, then every row produced by the
# recon builder. Each row is validated with its 1-based row number and
# any validation errors are collected; rows are written regardless.
#
# @param recon_type [Recon::Type::ReconType] the type of reconciliation data
# @return [String] the path to the output CSV file
# @raise [DSError] after the file is written, if errors are recorded
#   for this recon type
def write_csv(recon_type)
  outfile = File.join(out_dir, "#{recon_type.set_name}.csv")

  CSV.open(outfile, 'w+', headers: true) do |csv|
    csv << recon_type.recon_csv_headers

    row_num = 0
    recon_builder.each_recon(recon_type.set_name) do |recon|
      row_num += 1
      row_errors = Recon.validate_row(recon_type, recon, row_num: row_num)
      add_errors(recon_type, row_errors) unless row_errors.blank?
      csv << recon
    end
  end

  return outfile unless has_errors?(recon_type)

  raise DSError, "Error writing #{outfile}:\n#{errors_for_type(recon_type).join("\n")}"
end
54
+
55
# Return the Recon::ReconBuilder for this manager's output directory,
# source type and files. The builder is created on first use and the
# same instance is returned afterwards.
#
# @return [Recon::ReconBuilder] the memoized builder instance
def recon_builder
  return @recon_builder if @recon_builder

  @recon_builder = Recon::ReconBuilder.new(
    out_dir:     out_dir,
    source_type: source_type,
    files:       files
  )
end
63
+
64
# Append error messages to the list kept for the recon type's
# +set_name+, creating the list when this is the type's first error.
#
# @param recon_type [ReconType] the recon type to add errors to
# @param messages [Array<String>] the errors to add
# @return [void]
def add_errors(recon_type, messages)
  key = recon_type.set_name
  @errors[key] = (@errors[key] || []) + messages
  nil
end
74
+
75
# Report whether any errors have been recorded for the recon type.
#
# @param recon_type [ReconType] the recon type
# @return [Boolean] true if errors exist
def has_errors?(recon_type)
  type_errors = errors_for_type(recon_type)
  type_errors.present?
end
82
+
83
# Returns the list of errors for the specified recon type.
#
# Always returns an array, as documented: when nothing has been recorded
# for the type's +set_name+ the result is an empty array (previously
# nil), so callers can use +join+ or +each+ without a nil check.
#
# @param recon_type [ReconType] the recon type
# @return [Array<String>] the list of errors; empty if none were recorded
def errors_for_type(recon_type)
  @errors.fetch(recon_type.set_name, [])
end
90
+
91
+ end
92
+ end