ds-convert 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +294 -0
- data/Rakefile +12 -0
- data/config/settings.yml +150 -0
- data/exe/ds-convert +149 -0
- data/exe/ds-recon +275 -0
- data/exe/ds-validate-csv +40 -0
- data/exe/marc-mrc-to-xml.rb +80 -0
- data/lib/ds/cli.rb +102 -0
- data/lib/ds/constants.rb +166 -0
- data/lib/ds/converter/converter.rb +124 -0
- data/lib/ds/converter/writer.rb +50 -0
- data/lib/ds/converter.rb +7 -0
- data/lib/ds/csv_util.rb +43 -0
- data/lib/ds/data/berkeley-arks.txt +4000 -0
- data/lib/ds/data/getty-aat-centuries.csv +71 -0
- data/lib/ds/data/iiif_manifests.csv +122 -0
- data/lib/ds/data/legacy-iiif-manifests.csv +77 -0
- data/lib/ds/ds_error.rb +1 -0
- data/lib/ds/extractor/base_record_locator.rb +24 -0
- data/lib/ds/extractor/base_term.rb +79 -0
- data/lib/ds/extractor/csv_record_locator.rb +13 -0
- data/lib/ds/extractor/ds_csv_extractor.rb +695 -0
- data/lib/ds/extractor/ds_mets_xml_extractor.rb +1114 -0
- data/lib/ds/extractor/genre.rb +45 -0
- data/lib/ds/extractor/language.rb +31 -0
- data/lib/ds/extractor/marc_xml_extractor.rb +1172 -0
- data/lib/ds/extractor/material.rb +12 -0
- data/lib/ds/extractor/name.rb +50 -0
- data/lib/ds/extractor/place.rb +11 -0
- data/lib/ds/extractor/subject.rb +58 -0
- data/lib/ds/extractor/tei_xml_extractor.rb +687 -0
- data/lib/ds/extractor/title.rb +52 -0
- data/lib/ds/extractor/xml_record_locator.rb +38 -0
- data/lib/ds/extractor.rb +24 -0
- data/lib/ds/institutions.rb +55 -0
- data/lib/ds/manifest/base_id_validator.rb +76 -0
- data/lib/ds/manifest/constants.rb +67 -0
- data/lib/ds/manifest/ds_csv_id_validator.rb +15 -0
- data/lib/ds/manifest/entry.rb +133 -0
- data/lib/ds/manifest/manifest.rb +74 -0
- data/lib/ds/manifest/manifest_validator.rb +256 -0
- data/lib/ds/manifest/simple_xml_id_validator.rb +42 -0
- data/lib/ds/manifest.rb +30 -0
- data/lib/ds/mapper/base_mapper.rb +221 -0
- data/lib/ds/mapper/ds_csv_mapper.rb +77 -0
- data/lib/ds/mapper/ds_mets_mapper.rb +85 -0
- data/lib/ds/mapper/marc_mapper.rb +87 -0
- data/lib/ds/mapper/tei_xml_mapper.rb +79 -0
- data/lib/ds/mapper.rb +13 -0
- data/lib/ds/recon/constants.rb +56 -0
- data/lib/ds/recon/ds_csv_enumerator.rb +16 -0
- data/lib/ds/recon/ds_mets_xml_enumerator.rb +14 -0
- data/lib/ds/recon/marc_xml_enumerator.rb +15 -0
- data/lib/ds/recon/recon_builder.rb +183 -0
- data/lib/ds/recon/recon_data.rb +37 -0
- data/lib/ds/recon/recon_manager.rb +92 -0
- data/lib/ds/recon/source_enumerator.rb +21 -0
- data/lib/ds/recon/tei_xml_enumerator.rb +14 -0
- data/lib/ds/recon/type/all_subjects.rb +18 -0
- data/lib/ds/recon/type/genres.rb +50 -0
- data/lib/ds/recon/type/languages.rb +38 -0
- data/lib/ds/recon/type/materials.rb +40 -0
- data/lib/ds/recon/type/named_subjects.rb +20 -0
- data/lib/ds/recon/type/names.rb +65 -0
- data/lib/ds/recon/type/places.rb +40 -0
- data/lib/ds/recon/type/recon_type.rb +136 -0
- data/lib/ds/recon/type/splits.rb +34 -0
- data/lib/ds/recon/type/subjects.rb +65 -0
- data/lib/ds/recon/type/titles.rb +38 -0
- data/lib/ds/recon/url_lookup.rb +52 -0
- data/lib/ds/recon.rb +292 -0
- data/lib/ds/source/base_source.rb +32 -0
- data/lib/ds/source/ds_csv.rb +18 -0
- data/lib/ds/source/ds_mets_xml.rb +20 -0
- data/lib/ds/source/marc_xml.rb +22 -0
- data/lib/ds/source/source_cache.rb +69 -0
- data/lib/ds/source/tei_xml.rb +22 -0
- data/lib/ds/source.rb +20 -0
- data/lib/ds/util/cache.rb +111 -0
- data/lib/ds/util/csv_validator.rb +209 -0
- data/lib/ds/util/csv_writer.rb +42 -0
- data/lib/ds/util/strings.rb +194 -0
- data/lib/ds/util.rb +37 -0
- data/lib/ds/version.rb +5 -0
- data/lib/ds.rb +237 -0
- metadata +246 -0
@@ -0,0 +1,52 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DS
  module Extractor
    # A title term: the title as recorded together with its optional
    # vernacular form and uniform-title variants.
    class Title < BaseTerm
      # NOTE: +title_type+ is not assigned by the constructor; it remains
      # unset until a caller assigns it through the writer.
      attr_accessor :vernacular, :title_type, :uniform_title, :uniform_title_vernacular

      # Build a new Title.
      #
      # @param as_recorded the title as recorded (forwarded to the superclass initializer)
      # @param vernacular the vernacular title, or +nil+
      # @param uniform_title the uniform title, or +nil+
      # @param uniform_title_vernacular the vernacular uniform title, or +nil+
      def initialize(as_recorded:, vernacular: nil, uniform_title: nil, uniform_title_vernacular: nil)
        @vernacular               = vernacular
        @uniform_title            = uniform_title
        @uniform_title_vernacular = uniform_title_vernacular
        super(as_recorded: as_recorded)
      end

      # The title values as a positional list: as recorded, vernacular,
      # uniform title, vernacular uniform title. +title_type+ is left out.
      #
      # @return [Array] the four title values
      def to_a
        [as_recorded, vernacular, uniform_title, uniform_title_vernacular]
      end

      # The title values keyed by column-style names.
      #
      # Keys, in order: +:title_as_recorded+, +:as_recorded+ (both hold the
      # title as recorded), +:title_as_recorded_agr+,
      # +:uniform_title_as_recorded+, +:uniform_title_as_recorded_agr+.
      #
      # @return [Hash] the title as a hash
      def to_h
        keys = %i[
          title_as_recorded
          as_recorded
          title_as_recorded_agr
          uniform_title_as_recorded
          uniform_title_as_recorded_agr
        ]
        values = [as_recorded, as_recorded, vernacular, uniform_title, uniform_title_vernacular]
        keys.zip(values).to_h
      end
    end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DS
  module Extractor
    # Locates records in a parsed XML document by evaluating an XPath
    # expression built from an ID location template.
    #
    # The template contains the literal token +ID_PLACEHOLDER+, which is
    # replaced by the record ID before the expression is evaluated.
    class XmlRecordLocator < DS::Extractor::BaseRecordLocator

      attr_accessor :namespaces

      # @param namespaces the namespace bindings passed to the first XPath
      #   attempt; defaults to +DS::Constants::XML_NAMESPACES+
      def initialize namespaces: DS::Constants::XML_NAMESPACES
        @namespaces = namespaces
        super()
      end

      # Find the record(s) matching +id+, trying progressively looser
      # namespace handling: with the configured namespaces, without passing
      # namespaces, and finally after stripping namespaces from +xml+.
      #
      # NOTE: the last attempt calls +remove_namespaces!+ and therefore
      # mutates the +xml+ object passed in.
      #
      # @param xml the parsed XML document or node to search
      # @param id the record ID substituted for +ID_PLACEHOLDER+
      # @param id_location [String] XPath template containing +ID_PLACEHOLDER+
      # @return the result of the first attempt that is +present?+, otherwise
      #   the result of the final attempt
      def locate_record xml, id, id_location
        # Block form: the ID is inserted literally. With a replacement
        # *string*, sequences such as "\1" or "\\" inside the ID would be
        # interpreted as back-references; +to_s+ also accepts non-String IDs.
        xpath = id_location.gsub('ID_PLACEHOLDER') { id.to_s }

        # try with namespaces
        record = try_locate_record xml, xpath, namespaces: namespaces
        return record if record.present?

        # try without providing namespaces
        record = try_locate_record xml, xpath
        return record if record.present?

        # strip namespaces and try one last time
        xml.remove_namespaces!
        try_locate_record xml, xpath
      end

      # Evaluate +xpath+ against +xml+.
      #
      # Every XPath syntax error message is recorded with +add_error+
      # (presumably inherited from the base locator -- confirm). An
      # "undefined namespace prefix" error is then tolerated and yields an
      # empty result so the caller can retry; any other syntax error is
      # re-raised.
      #
      # @param xml the parsed XML document or node to search
      # @param xpath [String] the XPath expression
      # @param namespaces namespace bindings, or +nil+
      # @return the XPath result, or +[]+ for an undefined namespace prefix
      # @raise [Nokogiri::XML::XPath::SyntaxError] for any other XPath error
      def try_locate_record xml, xpath, namespaces: nil
        xml.xpath xpath, namespaces
      rescue Nokogiri::XML::XPath::SyntaxError => e
        add_error e.message
        raise unless e.message =~ /undefined namespace prefix/i
        []
      end
    end
  end
end
|
data/lib/ds/extractor.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'extractor/base_term'
|
4
|
+
require_relative 'extractor/genre'
|
5
|
+
require_relative 'extractor/material'
|
6
|
+
require_relative 'extractor/name'
|
7
|
+
require_relative 'extractor/place'
|
8
|
+
require_relative 'extractor/subject'
|
9
|
+
require_relative 'extractor/title'
|
10
|
+
require_relative 'extractor/language'
|
11
|
+
require_relative 'extractor/base_record_locator'
|
12
|
+
require_relative 'extractor/xml_record_locator'
|
13
|
+
require_relative 'extractor/csv_record_locator'
|
14
|
+
|
15
|
+
module DS
|
16
|
+
# Module for DS Extractor classes, which are responsible for extracting
|
17
|
+
# import CSV rows from source records.
|
18
|
+
#
|
19
|
+
# Extractors are used by {DS::Mapper::BaseMapper} instances to extract
|
20
|
+
# data from source records and by {Recon::ReconBuilder} instances
|
21
|
+
# to extract data from DS data sources for recon CSVs.
|
22
|
+
module Extractor
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module DS
  ##
  # Module for accessing the configured institutions. Values come from
  # `config/institutions.yml`, read here through +Settings.institutions+.
  #
  # File contents look like:
  #
  #     ---
  #     institutions:
  #       Q814779:
  #         - Beinecke Rare Book & Manuscript Library
  #         - beinecke
  #       Q995265:
  #         - Bryn Mawr College
  #         - brynmawr
  #
  # +DS.configure!+ must be invoked before this module is accessed.
  module Institutions
    # Memoized name => QID lookup table; built on first use.
    @@names_to_qids = nil

    ##
    # Return the contents of `config/institutions.yml` as a hash with the
    # institution names as keys and the Wikidata QIDs (as Strings) as values.
    #
    # The table is built once and cached. If the same name is listed under
    # more than one QID, the last one wins.
    #
    # @return [Hash]
    def self.names_to_qids
      @@names_to_qids ||= Settings.institutions.each_with_object({}) do |(qid, names), lookup|
        names.each { |name| lookup[name] = qid.to_s }
      end
    end

    ##
    # Return the QID for the given institution name/alias.
    #
    # The alias is tried as given, then stripped of surrounding whitespace,
    # then stripped and downcased.
    #
    # @param [String] inst_alias a name of the institution
    # @return [String, nil] the institution Wikidata QID, or +nil+ if the
    #   alias is not configured
    def self.find_qid inst_alias
      lookup   = names_to_qids
      stripped = inst_alias.to_s.strip
      lookup[inst_alias] || lookup[stripped] || lookup[stripped.downcase]
    end

    ##
    # Return the preferred name of the institution for the given alias.
    #
    # @param [String] inst_alias a name of the institution
    # @return [String, nil] the first listed name of the institution, or
    #   +nil+ if the alias is not configured
    def self.preferred_name inst_alias
      qid = find_qid inst_alias
      # An unknown alias has no QID; previously this fell through to
      # +nil.to_sym+ and raised NoMethodError.
      return nil if qid.nil?

      Settings.institutions[qid.to_sym].first
    end
  end
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DS
  module Manifest
    ##
    # A {DS::Manifest::BaseIdValidator} is a base class for a
    # cacheable ID validator for sources. The validator is responsible
    # for opening and caching source files and determining that exactly one
    # record is found for each source +id+ at the specified
    # +id_location+ in the parsed source.
    #
    # The motivation for this class is to handle ID validation for
    # source types that can have multiple records per source file,
    # saving the time required to parse the source file for each check.
    #
    # Concrete subclasses of {DS::Manifest::BaseIdValidator} must implement
    #
    # - +#locate_record+, required by this class
    #
    class BaseIdValidator

      # Error messages accumulated by every +valid?+ call on this instance.
      attr_reader :errors
      attr_reader :source

      ##
      # Create a new ID Validator
      #
      # @param source [DS::Source::BaseSource] the source to validate
      # @return [void]
      def initialize source
        @source = source
        @errors = []
      end

      # Checks that exactly one record is found for +id+ in the given file.
      # When the count is not one, an error message is appended to {#errors}.
      #
      # @param file_path [String] The path to the file.
      # @param id [String] The id to check.
      # @param id_location [String] The location of the id.
      # @return [Boolean] true if exactly one record is found, false otherwise.
      def valid? file_path, id, id_location
        records = locate_record file_path, id, id_location
        # A locator that returns nil found nothing; count it as zero records
        # instead of failing on +nil.size+.
        count = records.nil? ? 0 : records.size

        return true if count == 1
        handle_count_error count, id, id_location
        false
      end

      # Locates a record based on the given source path, ID, and ID location.
      #
      # @param source_path [String] the path to the source file
      # @param id [String] the ID of the record
      # @param id_location [String] the location of the ID within the record
      # @raise [NotImplementedError] this method is not implemented and should be overridden
      # @return [Array<Object>] an array of objects for each record
      def locate_record source_path, id, id_location
        raise NotImplementedError, "#{self.class.name} must implement #locate_record"
      end

      # Record an error for a record count other than one.
      #
      # @param count [Integer] the number of records found
      # @param inst_id [String] the ID that was searched for
      # @param location_in_source [String] the ID location that was searched
      # @return [nil]
      def handle_count_error count, inst_id, location_in_source
        return if count == 1

        if count > 1
          add_error "ERROR: Multiple records (#{count}) found for id: #{inst_id} (location: #{location_in_source})"
        elsif count == 0
          add_error "ERROR: No records found for id: #{inst_id} (location: #{location_in_source})"
        end
        nil
      end

      # Append +message+ to {#errors}.
      #
      # @param message [String] the error message
      # @return [Array<String>] the updated error list
      def add_error message
        @errors << message
      end
    end
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
module DS
  module Manifest
    # Column names of the manifest CSV, plus groupings of those columns by
    # the kind of check or handling they share.
    module Constants
      include DS

      # Individual manifest column headers.
      INSTITUTION_DS_QID                  = 'holding_institution_ds_qid'
      FILENAME                            = 'filename'
      INSTITUTION_WIKIDATA_LABEL          = 'holding_institution_wikidata_label'
      SOURCE_TYPE                         = 'source_data_type'
      DS_ID                               = 'ds_id'
      DATED                               = 'dated'
      INSTITUTIONAL_ID                    = 'holding_institution_institutional_id'
      INSTITUTIONAL_ID_LOCATION_IN_SOURCE = 'institutional_id_location_in_source'
      RECORD_LAST_UPDATED                 = 'record_last_updated'
      CALL_NUMBER                         = 'call_number'
      TITLE                               = 'title'
      IIIF_MANIFEST_URL                   = 'iiif_manifest_url'
      LINK_TO_INSTITUTIONAL_RECORD        = 'link_to_institutional_record'
      MANIFEST_GENERATED_AT               = 'manifest_generated_at'

      # Every manifest column, in column order.
      MANIFEST_COLUMNS = [
        INSTITUTION_DS_QID, INSTITUTION_WIKIDATA_LABEL, FILENAME, SOURCE_TYPE,
        DS_ID, DATED, INSTITUTIONAL_ID, INSTITUTIONAL_ID_LOCATION_IN_SOURCE,
        RECORD_LAST_UPDATED, CALL_NUMBER, TITLE, IIIF_MANIFEST_URL,
        LINK_TO_INSTITUTIONAL_RECORD, MANIFEST_GENERATED_AT
      ].freeze

      # Columns listed as required values.
      REQUIRED_VALUES = [
        INSTITUTION_DS_QID, FILENAME, INSTITUTION_WIKIDATA_LABEL, SOURCE_TYPE,
        INSTITUTIONAL_ID, INSTITUTIONAL_ID_LOCATION_IN_SOURCE,
        RECORD_LAST_UPDATED, CALL_NUMBER, MANIFEST_GENERATED_AT
      ].freeze

      # Columns grouped as URIs.
      URI_COLUMNS = [LINK_TO_INSTITUTIONAL_RECORD, IIIF_MANIFEST_URL].freeze

      # Columns grouped as QIDs.
      QID_COLUMNS = [INSTITUTION_DS_QID].freeze

      # Columns grouped as date-times.
      DATE_TIME_COLUMNS = [RECORD_LAST_UPDATED, MANIFEST_GENERATED_AT].freeze
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DS
  module Manifest
    # ID validator for DS CSV sources: loads the CSV through the validator's
    # source and delegates the lookup to a {DS::Extractor::CsvRecordLocator}.
    class DsCsvIdValidator < BaseIdValidator
      # Find the row(s) of the CSV at +source_path+ that match +id+.
      #
      # @param source_path [String] path to the CSV source file
      # @param id [String] the ID of the record
      # @param id_location [String] where the ID is found in the CSV
      # @return whatever the CSV record locator returns for the lookup
      def locate_record(source_path, id, id_location)
        record_locator = DS::Extractor::CsvRecordLocator.new
        loaded_csv = source.load_source(source_path)
        # Rewind before searching; presumably the loaded CSV is a shared
        # reader that an earlier lookup may have advanced -- confirm.
        loaded_csv.rewind
        record_locator.locate_record(loaded_csv, id, id_location)
      end
    end
  end
end
|
@@ -0,0 +1,133 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative './constants'
|
4
|
+
module DS
  module Manifest
    ##
    # The manifest Entry provides information to validate delivered data
    # and to drive the data extraction process. Specifically, each
    # line of the manifest:
    #
    # 1. Provides information often not present in a standard location
    #    in the source record, like shelfmark, source type (MARC XML,
    #    TEI XML, etc.), link to a IIIF manifest, and link to the
    #    institution's record in an OPAC or on the institution's
    #    website
    #
    # 2. Gives the file name for the record present in the delivered
    #    set of records
    #
    # 3. Provides information needed to validate the source record
    #    (its presence in the delivered data, correspondence of the
    #    source file(s) to identifying information, etc.)
    #
    class Entry
      include DS::Manifest::Constants

      attr_reader :row
      attr_reader :manifest

      ##
      # @param [CSV::Row] row a manifest CSV row
      # @param [DS::Manifest::Manifest] manifest the parent manifest
      def initialize row, manifest
        @row      = row
        @manifest = manifest
      end

      # Raw access to a manifest column.
      #
      # @param key [String] a manifest column name
      # @return the value of the column in this entry's row
      def [] key
        row[key]
      end

      # @return the value of the 'holding_institution_ds_qid' column
      def institution_ds_qid
        row[INSTITUTION_DS_QID]
      end

      # @return the value of the 'filename' column
      def filename
        row[FILENAME]
      end

      # @return the value of the 'holding_institution_wikidata_label' column
      def institution_wikidata_label
        row[INSTITUTION_WIKIDATA_LABEL]
      end

      # @return the value of the 'source_data_type' column
      def source_type
        row[SOURCE_TYPE]
      end

      # @return the value of the 'ds_id' column
      def ds_id
        row[DS_ID]
      end

      # @return the raw value of the 'dated' column; see {#dated?} for the
      #   boolean interpretation
      def dated
        row[DATED]
      end

      # @return the value of the 'holding_institution_institutional_id' column
      def institutional_id
        row[INSTITUTIONAL_ID]
      end

      # @return the value of the 'institutional_id_location_in_source' column
      def institutional_id_location_in_source
        row[INSTITUTIONAL_ID_LOCATION_IN_SOURCE]
      end

      # @return the value of the 'record_last_updated' column
      def record_last_updated
        row[RECORD_LAST_UPDATED]
      end

      # @return the value of the 'call_number' column
      def call_number
        row[CALL_NUMBER]
      end

      # @return the value of the 'title' column
      def title
        row[TITLE]
      end

      # The 'iiif_manifest_url' column, normalized to a pipe-separated list.
      #
      # There may be multiple manifests, separated by whitespace, semicolons
      # or pipes. Empty segments (for example from a leading separator) are
      # dropped so the result never starts with or contains an empty item.
      #
      # @return [String] the URLs joined with '|'; '' if the column is empty
      def iiif_manifest_url
        return '' unless row[IIIF_MANIFEST_URL]

        row[IIIF_MANIFEST_URL].split(/[\s;|]+/).reject(&:empty?).join('|')
      end

      # @return the value of the 'link_to_institutional_record' column
      def link_to_institutional_record
        row[LINK_TO_INSTITUTIONAL_RECORD]
      end

      # @return the value of the 'manifest_generated_at' column
      def manifest_generated_at
        row[MANIFEST_GENERATED_AT]
      end

      # The path of the parent manifest CSV.
      #
      # {DS::Manifest::Manifest} exposes its path as +csv_path+; a +path+
      # method is used when the manifest object provides one.
      #
      # @return [String, false] the manifest path, or +false+ if there is no
      #   manifest
      def manifest_path
        return false unless manifest.present?

        manifest.respond_to?(:path) ? manifest.path : manifest.csv_path
      end

      # @return [Boolean] true if the 'dated' column is 'true', ignoring
      #   case and surrounding whitespace
      def dated?
        dated.to_s.strip.downcase == 'true'
      end

      # @return [Hash] the entry's values keyed by symbol; +:dated+ holds
      #   the boolean from {#dated?}, not the raw column value
      def to_h
        {
          institution_ds_qid:           institution_ds_qid,
          institution_wikidata_label:   institution_wikidata_label,
          ds_id:                        ds_id,
          call_number:                  call_number,
          institutional_id:             institutional_id,
          title:                        title,
          link_to_institutional_record: link_to_institutional_record,
          iiif_manifest_url:            iiif_manifest_url,
          record_last_updated:          record_last_updated,
          source_type:                  source_type,
          filename:                     filename,
          dated:                        dated?,
          manifest_generated_at:        manifest_generated_at,
        }
      end

    end
  end
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'csv'
|
4
|
+
|
5
|
+
module DS
  module Manifest
    ##
    # A Manifest is a collection of {DS::Manifest::Entry} instances.
    #
    # This class loads a manifest CSV and provides an {#each} method
    # to iterate over the entries in the manifest.
    #
    class Manifest
      include Enumerable

      attr_reader :csv_path
      attr_reader :source_dir

      ##
      # Create a new Manifest instance. If +dir+ is not provided, the
      # directory containing the source files must be the same as the
      # directory of the manifest CSV.
      #
      # @param [String] csv_path manifest CSV path
      # @param [String] dir optional path to the directory containing the
      #     source file(s); if omitted, the manifest CSV's directory is used
      # @return [DS::Manifest::Manifest] a new Manifest instance
      def initialize csv_path, dir = nil
        @csv_path   = csv_path
        @source_dir = get_source_dir dir
      end

      ##
      # The path of the manifest CSV; same value as {#csv_path}.
      # @return [String]
      def path
        csv_path
      end

      ##
      # The headers from the parsed Manifest CSV. Works for a manifest that
      # has a header line but no data rows.
      #
      # @return [Array<String>] the header names; empty if the file has no
      #   header line
      def headers
        reader = csv
        # Reading one row forces the header line to be parsed, even when no
        # data row follows it.
        reader.shift
        parsed = reader.headers
        # Before any line has been read CSV#headers is +true+, not a list.
        parsed.is_a?(Array) ? parsed : []
      end

      ##
      # Iterate over the manifest rows. Returns an Enumerator when no block
      # is given.
      #
      # @yield [DS::Manifest::Entry] entry representation of the manifest row
      def each &block
        return enum_for(:each) unless block_given?

        csv.each do |row|
          yield DS::Manifest::Entry.new row, self
        end
      end

      ##
      # Return the String path of the directory expected to contain
      # the source records. If +dir+ is present, return
      # +dir+; otherwise, return the directory of the manifest
      # CSV.
      #
      # @param [String] dir a source directory path or +nil+
      # @return [String] the directory containing source files
      def get_source_dir dir
        return dir if dir.present?
        File.dirname csv.path
      end

      ##
      # Return the CSV reader for the manifest at +csv_path+. The file is
      # opened on first use and the same reader is reused afterwards.
      #
      # NOTE: the reader is rewound on every call, so calling this (or
      # {#headers}) while an {#each} iteration is in progress restarts that
      # iteration's position.
      #
      # @return [CSV] the manifest reader, positioned at the start
      def csv
        @csv ||= CSV.open csv_path, 'r', headers: true
        @csv.rewind
        @csv
      end
    end
  end
end
|