RubyGems - ds-convert - Versions diffs - 0.1.1 - Mend

ds-convert 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87) hide show

checksums.yaml +7 -0
data/README.md +294 -0
data/Rakefile +12 -0
data/config/settings.yml +150 -0
data/exe/ds-convert +149 -0
data/exe/ds-recon +275 -0
data/exe/ds-validate-csv +40 -0
data/exe/marc-mrc-to-xml.rb +80 -0
data/lib/ds/cli.rb +102 -0
data/lib/ds/constants.rb +166 -0
data/lib/ds/converter/converter.rb +124 -0
data/lib/ds/converter/writer.rb +50 -0
data/lib/ds/converter.rb +7 -0
data/lib/ds/csv_util.rb +43 -0
data/lib/ds/data/berkeley-arks.txt +4000 -0
data/lib/ds/data/getty-aat-centuries.csv +71 -0
data/lib/ds/data/iiif_manifests.csv +122 -0
data/lib/ds/data/legacy-iiif-manifests.csv +77 -0
data/lib/ds/ds_error.rb +1 -0
data/lib/ds/extractor/base_record_locator.rb +24 -0
data/lib/ds/extractor/base_term.rb +79 -0
data/lib/ds/extractor/csv_record_locator.rb +13 -0
data/lib/ds/extractor/ds_csv_extractor.rb +695 -0
data/lib/ds/extractor/ds_mets_xml_extractor.rb +1114 -0
data/lib/ds/extractor/genre.rb +45 -0
data/lib/ds/extractor/language.rb +31 -0
data/lib/ds/extractor/marc_xml_extractor.rb +1172 -0
data/lib/ds/extractor/material.rb +12 -0
data/lib/ds/extractor/name.rb +50 -0
data/lib/ds/extractor/place.rb +11 -0
data/lib/ds/extractor/subject.rb +58 -0
data/lib/ds/extractor/tei_xml_extractor.rb +687 -0
data/lib/ds/extractor/title.rb +52 -0
data/lib/ds/extractor/xml_record_locator.rb +38 -0
data/lib/ds/extractor.rb +24 -0
data/lib/ds/institutions.rb +55 -0
data/lib/ds/manifest/base_id_validator.rb +76 -0
data/lib/ds/manifest/constants.rb +67 -0
data/lib/ds/manifest/ds_csv_id_validator.rb +15 -0
data/lib/ds/manifest/entry.rb +133 -0
data/lib/ds/manifest/manifest.rb +74 -0
data/lib/ds/manifest/manifest_validator.rb +256 -0
data/lib/ds/manifest/simple_xml_id_validator.rb +42 -0
data/lib/ds/manifest.rb +30 -0
data/lib/ds/mapper/base_mapper.rb +221 -0
data/lib/ds/mapper/ds_csv_mapper.rb +77 -0
data/lib/ds/mapper/ds_mets_mapper.rb +85 -0
data/lib/ds/mapper/marc_mapper.rb +87 -0
data/lib/ds/mapper/tei_xml_mapper.rb +79 -0
data/lib/ds/mapper.rb +13 -0
data/lib/ds/recon/constants.rb +56 -0
data/lib/ds/recon/ds_csv_enumerator.rb +16 -0
data/lib/ds/recon/ds_mets_xml_enumerator.rb +14 -0
data/lib/ds/recon/marc_xml_enumerator.rb +15 -0
data/lib/ds/recon/recon_builder.rb +183 -0
data/lib/ds/recon/recon_data.rb +37 -0
data/lib/ds/recon/recon_manager.rb +92 -0
data/lib/ds/recon/source_enumerator.rb +21 -0
data/lib/ds/recon/tei_xml_enumerator.rb +14 -0
data/lib/ds/recon/type/all_subjects.rb +18 -0
data/lib/ds/recon/type/genres.rb +50 -0
data/lib/ds/recon/type/languages.rb +38 -0
data/lib/ds/recon/type/materials.rb +40 -0
data/lib/ds/recon/type/named_subjects.rb +20 -0
data/lib/ds/recon/type/names.rb +65 -0
data/lib/ds/recon/type/places.rb +40 -0
data/lib/ds/recon/type/recon_type.rb +136 -0
data/lib/ds/recon/type/splits.rb +34 -0
data/lib/ds/recon/type/subjects.rb +65 -0
data/lib/ds/recon/type/titles.rb +38 -0
data/lib/ds/recon/url_lookup.rb +52 -0
data/lib/ds/recon.rb +292 -0
data/lib/ds/source/base_source.rb +32 -0
data/lib/ds/source/ds_csv.rb +18 -0
data/lib/ds/source/ds_mets_xml.rb +20 -0
data/lib/ds/source/marc_xml.rb +22 -0
data/lib/ds/source/source_cache.rb +69 -0
data/lib/ds/source/tei_xml.rb +22 -0
data/lib/ds/source.rb +20 -0
data/lib/ds/util/cache.rb +111 -0
data/lib/ds/util/csv_validator.rb +209 -0
data/lib/ds/util/csv_writer.rb +42 -0
data/lib/ds/util/strings.rb +194 -0
data/lib/ds/util.rb +37 -0
data/lib/ds/version.rb +5 -0
data/lib/ds.rb +237 -0
metadata +246 -0

data/lib/ds/recon/source_enumerator.rb ADDED Viewed

@@ -0,0 +1,21 @@
+# frozen_string_literal: true
+module Recon
+  class SourceEnumerator
+    include DS::Util
+    include Enumerable
+    attr_accessor :files
+    # Initialize the SourceEnumerator with the given files.
+    # @param [Array] files an array of source file paths
+    def initialize files
+      @files = *files
+    end
+    ##
+    # @yield record a record of the SourceEnumerator's type (MARC XML, CSV::Row, etc.)
+    def each &block
+      raise NotImplementedError
+    end
+  end
+end

data/lib/ds/recon/tei_xml_enumerator.rb ADDED Viewed

@@ -0,0 +1,14 @@
+# frozen_string_literal: true
+module Recon
+  class TeiXmlEnumerator < SourceEnumerator
+    def each &block
+      process_xml files, remove_namespaces: true do |xml|
+        xml.xpath('//TEI').each do |record|
+          yield record
+        end
+      end
+    end
+  end
+end

data/lib/ds/recon/type/all_subjects.rb ADDED Viewed

@@ -0,0 +1,18 @@
+require 'nokogiri'
+module Recon
+  module Type
+    ##
+    # Lookup subjects and named subjects for import CSV output
+    #
+    class AllSubjects < Recon::Type::Subjects
+      extend DS::Util
+      SET_NAME = :'all-subjects'
+      METHOD_NAME = %i{ extract_all_subjects  }
+    end
+  end
+end

data/lib/ds/recon/type/genres.rb ADDED Viewed

@@ -0,0 +1,50 @@
+require 'nokogiri'
+module Recon
+  module Type
+    ##
+    # Extract genre terms for reconciliation CSV output.
+    #
+    # Return a two-dimensional array, each row is a term; and each row has
+    # three columns: term, vocab, and authority number.
+    #
+    class Genres
+      extend DS::Util
+      include ReconType
+      SET_NAME = :genres
+      RECON_CSV_HEADERS = %i{
+      genre_as_recorded
+      vocab
+      source_authority_uri
+      authorized_label
+      structured_value
+      ds_qid
+    }
+      LOOKUP_COLUMNS = %i{
+      authorized_label
+      structured_value
+      ds_qid
+    }
+      KEY_COLUMNS = %i{
+      genre_as_recorded
+      vocab
+    }
+      AS_RECORDED_COLUMN = :genre_as_recorded
+      DELIMITER_MAP = { '|' => ';' }
+      METHOD_NAME = %i{ extract_genres }
+      BALANCED_COLUMNS = {
+        genres: %i{ structured_value authorized_label }
+      }
+    end
+  end
+end

data/lib/ds/recon/type/languages.rb ADDED Viewed

@@ -0,0 +1,38 @@
+module Recon
+  module Type
+    class Languages
+      extend DS::Util
+      include Recon::Type::ReconType
+      SET_NAME = :languages
+      RECON_CSV_HEADERS = %i{
+      language_as_recorded
+      language_code
+      authorized_label
+      structured_value
+      ds_qid
+    }
+      LOOKUP_COLUMNS = %i{
+      authorized_label
+      structured_value
+      ds_qid
+    }
+      KEY_COLUMNS = %i{
+      language_as_recorded
+    }
+      AS_RECORDED_COLUMN = :language_as_recorded
+      DELIMITER_MAP = {}
+      METHOD_NAME = %i{ extract_languages }
+      BALANCED_COLUMNS = { languages: %w{ structured_value authorized_label } }
+    end
+  end
+end

data/lib/ds/recon/type/materials.rb ADDED Viewed

@@ -0,0 +1,40 @@
+require 'nokogiri'
+module Recon
+  module Type
+    class Materials
+      extend DS::Util
+      include ReconType
+      SET_NAME = :materials
+      RECON_CSV_HEADERS = %i{
+      material_as_recorded
+      authorized_label
+      structured_value
+      ds_qid
+    }
+      LOOKUP_COLUMNS = %i{
+      authorized_label
+      structured_value
+      ds_qid
+    }
+      KEY_COLUMNS = %i{
+      material_as_recorded
+    }
+      METHOD_NAME = %i{ extract_materials }
+      AS_RECORDED_COLUMN = :material_as_recorded
+      DELIMITER_MAP = { '|' => ';' }
+      BALANCED_COLUMNS = { materials: %w{ structured_value authorized_label } }
+    end
+  end
+end

data/lib/ds/recon/type/named_subjects.rb ADDED Viewed

@@ -0,0 +1,20 @@
+require 'nokogiri'
+module Recon
+  module Type
+    ##
+    # Extract named subjects for reconciliation CSV output.
+    #
+    # Return a two-dimensional array, each row is a term; and each row has
+    # two columns: subject and authority number.
+    #
+    class NamedSubjects < Recon::Type::Subjects
+      extend DS::Util
+      SET_NAME = :'named-subjects'
+      METHOD_NAME = %i{ extract_named_subjects }
+    end
+  end
+end

data/lib/ds/recon/type/names.rb ADDED Viewed

@@ -0,0 +1,65 @@
+require 'nokogiri'
+module Recon
+  module Type
+    class Names
+      extend DS::Util
+      include ReconType
+      SET_NAME = :names
+      RECON_CSV_HEADERS = %i{
+      name_as_recorded
+      role
+      name_agr
+      source_authority_uri
+      instance_of
+      authorized_label
+      structured_value
+      ds_qid
+    }
+      LOOKUP_COLUMNS = %i{
+      authorized_label
+      structured_value
+      source_authority_uri
+      instance_of
+      ds_qid
+    }
+      KEY_COLUMNS = %i{
+      name_as_recorded
+    }
+      AS_RECORDED_COLUMN = :name_as_recorded
+      DELIMITER_MAP = {}
+      METHOD_NAME = %i{ extract_authors extract_artists extract_scribes extract_former_owners }
+      BALANCED_COLUMNS = { names: %i{ structured_value authorized_label instance_of } }
+    end
+    class Authors < Names
+      METHOD_NAME = %i{ extract_authors }.freeze
+    end
+    class Artists < Names
+      METHOD_NAME = %i{ extract_artists }.freeze
+    end
+    class AssociatedAgents < Names
+      METHOD_NAME = %i{ extract_associated_agents }.freeze
+    end
+    class FormerOwners < Names
+      METHOD_NAME = %i{ extract_former_owners }.freeze
+    end
+    class Scribes < Names
+      METHOD_NAME = %i{ extract_scribes }.freeze
+    end
+  end
+end

data/lib/ds/recon/type/places.rb ADDED Viewed

@@ -0,0 +1,40 @@
+require 'nokogiri'
+module Recon
+  module Type
+    class Places
+      extend DS::Util
+      include ReconType
+      SET_NAME = :places
+      RECON_CSV_HEADERS = %i{ place_as_recorded authorized_label structured_value ds_qid}
+      LOOKUP_COLUMNS = %i{
+      authorized_label
+      structured_value
+      ds_qid
+    }
+    KEY_COLUMNS = %i{ place_as_recorded }
+    AS_RECORDED_COLUMN = :place_as_recorded
+    DELIMITER_MAP = { '|' => ';' }
+    METHOD_NAME = %i{ extract_places }
+    BALANCED_COLUMNS = { places: %i{ structured_value authorized_label } }
+    def self.lookup places, from_column: 'structured_value'
+      places.map { |place|
+        key_values = get_key_values place.to_h
+        place_uris = Recon.lookup_single SET_NAME, key_values: key_values, column: from_column
+        place_uris.to_s.gsub '|', ';'
+      }
+    end
+    end
+  end
+end

data/lib/ds/recon/type/recon_type.rb ADDED Viewed

@@ -0,0 +1,136 @@
+# frozen_string_literal: true
+module Recon
+  module Type
+    ##
+    # The Recon::Type::ReconType module should be included in all
+    # Recon::Type classes. It provides access to recon type
+    # configuration information. Its methods support the lookup and
+    # enrichment of DS::Extractor::BaseTerm object values.
+    #
+    # ReconType methods expose the recon CSV columns, the columns
+    # retrieved from the DS data dictionaries, the lookup key columns,
+    # the import CSV as-recorded column (e.g., author_as_recorded),
+    # and, for validation purposes, the balanced columns; that is,
+    # those columns in the recon CSVs that must have equal numbers of
+    # subfields in each row.
+    #
+    # Classes that include Recon::Type::ReconType should define these
+    # constants
+    #
+    #   SET_NAME :: the name of the recon set; e.g., :places
+    #   RECON_CSV_HEADERS :: the recon CSV headers; e.g., [:place_as_recorded, :authorized_label, :structured_value, :ds_qid]
+    #   LOOKUP_COLUMNS :: the columns to extract from the data dictionaries; e.g., [:authorized_label, :structured_value, :ds_qid]
+    #   KEY_COLUMNS :: the key columns in the recon CSV; e.g., [:place_as_recorded]
+    #   AS_RECORDED_COLUMN :: the column in the recon CSV that holds the as-recorded value; e.g., :author_as_recorded
+    #   DELIMITER_MAP :: a map of delimiters to replace in the recon CSV values: { ORIGINAL => REPLACEMENT }; e.g., { '|' => ';' }
+    #   METHOD_NAME :: the names of the DS::Extractor methods; e.g., [:extract_places]
+    #   BALANCED_COLUMNS :: the columns that must have equal numbers of subfields; e.g., { places: [:structured_value, :authorized_label] }
+    #
+    module ReconType
+      # Hook run when the module is included: extends the including
+      # class with ClassMethods, so the methods below are called on the
+      # class itself (e.g., Recon::Type::Places.set_name).
+      def self.included base
+        base.extend ClassMethods
+      end
+      module ClassMethods
+        # Returns the set name of the recon set; e.g., :places
+        #
+        # Used to find a recon type configuration by name; either
+        # the ReconType (like Recon::Type::Places) or the path to the
+        # recon data dictionary CSV in the ds-data git repo as defined
+        # in config/settings.yml:
+        #
+        #   ds:
+        #     recon:
+        #       ...
+        #       sets:
+        #         - name: :places
+        #           repo_path: terms/reconciled/places.csv
+        #           key_column: place_as_recorded
+        #           ...
+        #
+        # @return [Symbol] the set name
+        def set_name
+          self::SET_NAME
+        end
+        # Returns the recon CSV headers; e.g., [:place_as_recorded, :authorized_label, :structured_value, :ds_qid]
+        #
+        # @return [Array<Symbol>] the recon CSV headers
+        def recon_csv_headers
+          self::RECON_CSV_HEADERS
+        end
+        # Returns the columns that lookups should pull from the data dictionaries; e.g., [:authorized_label, :structured_value, :ds_qid]
+        #
+        # @return [Array<Symbol>] the lookup columns
+        def lookup_columns
+          self::LOOKUP_COLUMNS
+        end
+        # Returns the columns used to make the lookup key for the data dictionary; e.g., [:genre_as_recorded, :vocab]
+        #
+        # @return [Array<Symbol>] the key columns
+        def key_columns
+          self::KEY_COLUMNS
+        end
+        # Returns the column in the recon CSV that holds the as-recorded value; e.g., :author_as_recorded
+        #
+        # @return [Symbol] the import CSV as recorded column
+        def as_recorded_column
+          self::AS_RECORDED_COLUMN
+        end
+        # Returns the delimiter replacement map: { ORIGINAL => REPLACEMENT }; e.g., { '|' => ';' }
+        #
+        # @return [Hash<String,String>] the delimiter map
+        def delimiter_map
+          self::DELIMITER_MAP
+        end
+        # Returns the names of the DS::Extractor methods; e.g., [:extract_places]
+        #
+        # @return [Array<Symbol>] the method names
+        def method_name
+          self::METHOD_NAME
+        end
+        # Returns the balanced columns configuration (the
+        # BALANCED_COLUMNS constant) of the including class.
+        #
+        # Balanced columns should have equal numbers of fields and
+        # subfields in each row; e.g., if fields are delimited by '|'
+        # and subfields by ';', then the following are balanced:
+        #
+        # structured_value,authorized_label
+        # a|b;c,d|e;f
+        # 1|2|3,x|y|z
+        # r,s
+        #
+        # @return [Hash] the balanced columns; per the constant's
+        #   documentation above, a Hash keyed by set name whose values
+        #   are the lists of balanced column names
+        #
+        # @example
+        #   # given BALANCED_COLUMNS = { places: [:structured_value, :authorized_label] }
+        #   balanced_columns #=> { places: [:structured_value, :authorized_label] }
+        def balanced_columns
+          self::BALANCED_COLUMNS
+        end
+        ##
+        # Return the values of the key columns in the given row.
+        #
+        # @param row [Hash<Symbol,String>] The row to extract values from.
+        # @return [Array<String>] The values of the key columns in the given row.
+        def get_key_values row
+          key_columns.map { |key| row[key] }
+        end
+        ##
+        # Return the values of the lookup columns in the given row, in
+        # the order given by +lookup_columns+.
+        #
+        # @param row [Hash<Symbol,String>] The row to extract values from.
+        # @return [Array<String>] The values of the lookup columns in the given row.
+        def lookup_values row
+          lookup_columns.map { |key| row[key] }
+        end
+      end
+    end
+  end
+end

data/lib/ds/recon/type/splits.rb ADDED Viewed

@@ -0,0 +1,34 @@
+# frozen_string_literal: true
+module Recon
+  module Type
+    class Splits
+      extend DS::Util
+      include ReconType
+      SET_NAME = :splits
+      RECON_CSV_HEADERS = %i{ as_recorded authorized_label }
+      LOOKUP_COLUMNS = %i{ authorized_label }
+      KEY_COLUMNS = %i{ as_recorded }
+      AS_RECORDED_COLUMN = :as_recorded
+      DELIMITER_MAP = {}
+      METHOD_NAME = []
+      BALANCED_COLUMNS = {}
+      def self._lookup_single as_recorded, from_column:
+        key_values = [as_recorded]
+        Recon.lookup_single(:splits, key_values: key_values , column: from_column)
+      end
+    end
+  end
+end

data/lib/ds/recon/type/subjects.rb ADDED Viewed

@@ -0,0 +1,65 @@
+require 'nokogiri'
+module Recon
+  module Type
+    ##
+    # Extract subjects for reconciliation CSV output.
+    #
+    # NOTE: Each source subject extraction method should return a two dimensional
+    # array:
+    #
+    #     [["Islamic law--Early works to 1800", ""],
+    #       ["Malikites--Early works to 1800", ""],
+    #       ["Islamic law", ""],
+    #       ["Malikites", ""],
+    #       ["Arabic language--Grammar--Early works to 1800", ""],
+    #       ["Arabic language--Grammar", ""],
+    #       ...
+    #       ]
+    #
+    # The two values are `subject_as_recorded` and `source_authority_uri`. The
+    # second of these is present when the source record provides an accompanying
+    # URI. This is rare. Sources the lack a URI should return the as recorded
+    # value and `""` (the empty string) for the `source_authority_uri` as shown
+    # above.
+    #
+    class Subjects
+      extend DS::Util
+      include ReconType
+      SET_NAME = :subjects
+      RECON_CSV_HEADERS = %i{
+      subject_as_recorded
+      subfield_codes
+      vocab
+      source_authority_uri
+      authorized_label
+      structured_value
+      ds_qid
+    }.freeze
+      LOOKUP_COLUMNS = %i{
+      authorized_label
+      structured_value
+      ds_qid
+    }
+      KEY_COLUMNS = %i{
+      subject_as_recorded
+      subfield_codes
+      vocab
+    }
+      METHOD_NAME = %i{ extract_subjects }
+      BALANCED_COLUMNS = { subjects: %i{ structured_value authorized_label } }
+      AS_RECORDED_COLUMN = :subject_as_recorded
+      DELIMITER_MAP = { '|' => ';' }
+    end
+  end
+end

data/lib/ds/recon/type/titles.rb ADDED Viewed

@@ -0,0 +1,38 @@
+require 'nokogiri'
+module Recon
+  module Type
+    class Titles
+      extend DS::Util
+      include ReconType
+      SET_NAME = :titles
+      METHOD_NAME = %i{ extract_titles }
+      RECON_CSV_HEADERS = %i{
+      title_as_recorded
+      title_as_recorded_agr
+      uniform_title_as_recorded
+      uniform_title_as_recorded_agr
+      authorized_label
+      ds_qid
+    }
+      LOOKUP_COLUMNS = %i{
+      authorized_label
+      ds_qid
+    }
+      KEY_COLUMNS = %i{ title_as_recorded uniform_title_as_recorded }
+      AS_RECORDED_COLUMN = :title_as_recorded
+      DELIMITER_MAP = { '|' => ';' }
+      BALANCED_COLUMNS = {}
+    end
+  end
+end

data/lib/ds/recon/url_lookup.rb ADDED Viewed

@@ -0,0 +1,52 @@
+module Recon
+  class URLLookup
+    attr_reader :lookup_set
+    attr_reader :url_hash
+    ##
+    # The name of the lookup set in `config/recon.yml`. For example, for
+    #
+    #     ---
+    #     recon:
+    #       # ...
+    #       iiif_manifests: iiif/legacy-iiif-manifests.csv
+    #
+    #  the +lookup_set+ is 'iiif_manifests'.
+    #
+    # @param [String] lookup_set the name of the recon setting
+    def initialize lookup_set
+      @lookup_set = lookup_set
+      @url_hash = {}
+    end
+    def find_url holding_inst_as_recorded, shelfmark
+      key = url_key holding_inst_as_recorded, shelfmark
+      urls[key]
+    end
+    @url_hash = nil
+    def urls
+      return url_hash unless url_hash.empty?
+      recon_repo = File.join DS.root, 'data', Settings.recon.git_local_name
+      csv_file   = File.join recon_repo, Settings.recon[lookup_set]
+      CSV.readlines(csv_file, headers: true).each { |row|
+        key = url_key row['holding_institution'], row['shelfmark']
+        url_hash[key] = row['url']
+      }
+      url_hash
+    end
+    def url_key holder, shelfmark
+      qid = DS::Institutions.find_qid holder
+      raise DSError, "No QID found for #{holder}" if qid.blank?
+      normalize_key qid, shelfmark
+    end
+    def normalize_key *strings
+      strings.join.downcase.gsub(%r{\s+}, '')
+    end
+  end
+end