RubyGems - dwc-archive - Versions diffs - 0.9.6 → 1.1.1 - Mend

dwc-archive 0.9.6 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64)

checksums.yaml +7 -0
data/.gitignore +31 -0
data/.rspec +3 -0
data/.rubocop.yml +23 -0
data/.ruby-version +1 -0
data/.travis.yml +4 -5
data/CHANGELOG +15 -7
data/Gemfile +3 -15
data/LICENSE +1 -1
data/README.md +135 -111
data/Rakefile +13 -54
data/dwc-archive.gemspec +37 -0
data/features/step_definitions/dwc-creator_steps.rb +5 -5
data/features/step_definitions/dwc-reader_steps.rb +47 -28
data/features/support/env.rb +1 -1
data/lib/dwc_archive.rb +121 -0
data/lib/dwc_archive/archive.rb +59 -0
data/lib/dwc_archive/classification_normalizer.rb +382 -0
data/lib/dwc_archive/core.rb +25 -0
data/lib/{dwc-archive → dwc_archive}/errors.rb +2 -0
data/lib/dwc_archive/expander.rb +85 -0
data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
data/lib/dwc_archive/generator.rb +90 -0
data/lib/dwc_archive/generator_eml_xml.rb +116 -0
data/lib/dwc_archive/generator_meta_xml.rb +72 -0
data/lib/dwc_archive/gnub_taxon.rb +14 -0
data/lib/dwc_archive/ingester.rb +106 -0
data/lib/dwc_archive/metadata.rb +56 -0
data/lib/dwc_archive/taxon_normalized.rb +23 -0
data/lib/dwc_archive/version.rb +6 -0
data/lib/dwc_archive/xml_reader.rb +89 -0
data/spec/files/{file with characters(3).gz → file with characters(3).tar.gz} +0 -0
data/spec/files/generator_eml.xml +47 -0
data/spec/files/generator_meta.xml +19 -0
data/spec/lib/classification_normalizer_spec.rb +214 -0
data/spec/lib/core_spec.rb +100 -0
data/spec/lib/darwin_core_spec.rb +249 -0
data/spec/lib/generator_eml_xml_spec.rb +22 -0
data/spec/lib/generator_meta_xml_spec.rb +22 -0
data/spec/lib/generator_spec.rb +124 -0
data/spec/lib/gnub_taxon_spec.rb +32 -0
data/spec/lib/metadata_spec.rb +89 -0
data/spec/lib/taxon_normalized_spec.rb +142 -0
data/spec/lib/xml_reader_spec.rb +11 -11
data/spec/spec_helper.rb +78 -6
metadata +180 -92
data/.rvmrc +0 -1
data/Gemfile.lock +0 -155
data/VERSION +0 -1
data/lib/dwc-archive.rb +0 -95
data/lib/dwc-archive/.expander.rb.swo +0 -0
data/lib/dwc-archive/archive.rb +0 -37
data/lib/dwc-archive/classification_normalizer.rb +0 -424
data/lib/dwc-archive/core.rb +0 -17
data/lib/dwc-archive/expander.rb +0 -80
data/lib/dwc-archive/generator.rb +0 -75
data/lib/dwc-archive/generator_eml_xml.rb +0 -84
data/lib/dwc-archive/generator_meta_xml.rb +0 -50
data/lib/dwc-archive/ingester.rb +0 -101
data/lib/dwc-archive/metadata.rb +0 -42
data/lib/dwc-archive/utf_regex_ruby18.rb +0 -10
data/lib/dwc-archive/xml_reader.rb +0 -64
data/spec/lib/dwc-archive_spec.rb +0 -250
data/spec/spec.opts +0 -1

data/lib/dwc_archive/core.rb ADDED

@@ -0,0 +1,25 @@
+# frozen_string_literal: true
+class DarwinCore
+  # Represents core of the DarwinCore Archive
+  class Core
+    include DarwinCore::Ingester
+    attr_reader :id
+    # rubocop:disable Metrics/MethodLength
+    def initialize(dwc)
+      @dwc = dwc
+      @archive = @dwc.archive
+      @path = @archive.files_path
+      root_key = @archive.meta.keys[0]
+      @data = @archive.meta[root_key][:core]
+      unless @data
+        raise DarwinCore::CoreFileError,
+              "Cannot find core in meta.xml, is meta.xml valid?"
+      end
+      @id = @data[:id][:attributes]
+      init_attributes
+    end
+  end
+  # rubocop:enable Metrics/MethodLength
+end

data/lib/{dwc-archive → dwc_archive}/errors.rb RENAMED

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 class DarwinCore
   class Error < RuntimeError; end
   class FileNotFoundError < Error; end

data/lib/dwc_archive/expander.rb ADDED

@@ -0,0 +1,85 @@
+# frozen_string_literal: true
+class DarwinCore
+  # Unpacks compressed archives into a temp directory
+  class Expander
+    def initialize(archive_path, tmp_dir)
+      @archive_path = archive_path
+      @tmp_dir = tmp_dir
+      @dir_path = DarwinCore.random_path(tmp_dir)
+      @unpacker = init_unpacker
+    end
+    def unpack
+      clean
+      raise DarwinCore::FileNotFoundError unless File.exist?(@archive_path)
+      success = @unpacker.call(@dir_path, @archive_path) if @unpacker
+      if @unpacker && success && $CHILD_STATUS.exitstatus.zero?
+        success
+      else
+        clean
+        raise DarwinCore::UnpackingError
+      end
+    end
+    def path
+      @path ||= files_path
+    end
+    def clean
+      DarwinCore.clean(@dir_path)
+    end
+    def files
+      DarwinCore.files(path)
+    end
+    private
+    def init_unpacker
+      return tar_unpacker if @archive_path =~ /tar.gz$/i
+      return zip_unpacker if @archive_path =~ /zip$/i
+      nil
+    end
+    def tar_unpacker
+      proc do |tmp_path, archive_path|
+        FileUtils.mkdir tmp_path
+        path = esc(archive_path)
+        system("tar -zxf #{path} -C #{tmp_path} > /dev/null 2>&1")
+      end
+    end
+    def zip_unpacker
+      proc do |tmp_path, archive_path|
+        path = esc(archive_path)
+        system("unzip -qq -d #{tmp_path} #{path} > /dev/null 2>&1")
+      end
+    end
+    def esc(a_str)
+      "'" + a_str.gsub(92.chr, '\\\\\\').gsub("'", "\\\\'") + "'"
+    end
+    def path_entries(dir)
+      Dir.entries(dir).reject { |e| e.match(/[\.]{1,2}$/) }.sort
+    end
+    def files_path
+      entries = path_entries(@dir_path)
+      entries.include?("meta.xml") ? @dir_path : search_for_file_path(entries)
+    end
+    def search_for_file_path(entries)
+      res = nil
+      entries.each do |e|
+        check_path = File.join(@dir_path, e)
+        next unless FileTest.directory?(check_path) &&
+                    path_entries(check_path).include?("meta.xml")
+        res = check_path
+        break
+      end
+      res
+    end
+  end
+end

data/lib/{dwc-archive → dwc_archive}/extension.rb RENAMED

@@ -1,8 +1,11 @@
+# frozen_string_literal: true
 class DarwinCore
+  # Represents extensions of DarwinCore Archive
   class Extension
     include DarwinCore::Ingester
     attr_reader :coreid
-    alias :id :coreid
+    alias id coreid
     def initialize(dwc, data)
       @dwc = dwc
@@ -10,8 +13,7 @@ class DarwinCore
       @path = @archive.files_path
       @data = data
       @coreid = @data[:coreid][:attributes]
-      get_attributes(DarwinCore::ExtensionFileError)
+      init_attributes
     end
   end
 end

data/lib/dwc_archive/generator.rb ADDED

@@ -0,0 +1,90 @@
+# frozen_string_literal: true
+class DarwinCore
+  # Creates csv files for core and extensions
+  class Generator
+    attr_reader :eml_xml_data, :path
+    def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
+      @dwc_path = dwc_path
+      @path = DarwinCore.random_path(tmp_dir)
+      FileUtils.mkdir(@path)
+      @meta_xml_data = { extensions: [] }
+      @eml_xml_data = { id: nil, title: nil, authors: [], abstrac: nil,
+                        citation: nil, url: nil }
+      @write = "w:utf-8"
+    end
+    def clean
+      DarwinCore.clean(@path)
+    end
+    def add_core(data, file_name, keep_headers = true)
+      opts = { type: "core", data: data, file_name: file_name,
+               keep_headers: keep_headers }
+      prepare_csv_file(opts)
+    end
+    def add_extension(data, file_name, keep_headers = true,
+                      row_type = "http://rs.tdwg.org/dwc/terms/Taxon")
+      opts = { type: "extension", data: data, file_name: file_name,
+               keep_headers: keep_headers, row_type: row_type }
+      prepare_csv_file(opts)
+    end
+    def add_meta_xml
+      meta = DarwinCore::Generator::MetaXml.new(@meta_xml_data, @path)
+      meta.create
+    end
+    def add_eml_xml(data)
+      @eml_xml_data = data
+      eml = DarwinCore::Generator::EmlXml.new(@eml_xml_data, @path)
+      eml.create
+    end
+    def files
+      DarwinCore.files(@path)
+    end
+    def pack
+      a = "cd #{@path}; tar -zcf #{@dwc_path} *"
+      system(a)
+    end
+    private
+    def prepare_csv_file(opts)
+      c = CSV.open(File.join(@path, opts[:file_name]), @write)
+      attributes = prepare_attributes(opts)
+      if opts[:type] == "core"
+        @meta_xml_data[:core] = attributes
+      else
+        @meta_xml_data[:extensions] << attributes
+      end
+      opts[:data].each { |d| c << d }
+      c.close
+    end
+    def prepare_attributes(opts)
+      header = opts[:data].shift
+      fields = init_fields(header, opts[:type])
+      opts[:data].unshift(fields) if opts[:keep_headers]
+      ignore_header_lines = opts[:keep_headers] ? 1 : 0
+      res = { fields: header, ignoreHeaderLines: ignore_header_lines,
+              location: opts[:file_name] }
+      res[:rowType] = opts[:row_type] if opts[:row_type]
+      res
+    end
+    def init_fields(header, file_type)
+      header.map do |f|
+        f = f.strip
+        err = "No header in #{file_type} data, or header fields are not urls"
+        raise DarwinCore::GeneratorError, err unless f =~ %r{^http://}
+        f.split("/")[-1]
+      end
+    end
+  end
+end

data/lib/dwc_archive/generator_eml_xml.rb ADDED

@@ -0,0 +1,116 @@
+class DarwinCore
+  class Generator
+    # Creates EML file with meta information about archive
+    class EmlXml
+      SCHEMA_DATA = {
+        :"xml:lang" =>           "en",
+        :"xmlns:eml" =>          "eml://ecoinformatics.org/eml-2.1.1",
+        :"xmlns:md" =>           "eml://ecoinformatics.org/methods-2.1.1",
+        :"xmlns:proj" =>         "eml://ecoinformatics.org/project-2.1.1",
+        :"xmlns:d" =>            "eml://ecoinformatics.org/dataset-2.1.1",
+        :"xmlns:res" =>          "eml://ecoinformatics.org/resource-2.1.1",
+        :"xmlns:dc" =>           "http://purl.org/dc/terms/",
+        :"xmlns:xsi" =>          "http://www.w3.org/2001/XMLSchema-instance",
+        :"xsi:schemaLocation" => "eml://ecoinformatics.org/eml-2.1.1 "\
+          "http://rs.gbif.org/schema/eml-gbif-profile/1.0.1/eml.xsd"
+      }
+      def initialize(data, path)
+        @data = data
+        @path = path
+        @write = "w:utf-8"
+      end
+      def create
+        schema_data = {
+          packageId: "#{@data[:id]}/#{timestamp}",
+          system: @data[:system] || "http://globalnames.org"
+        }.merge(SCHEMA_DATA)
+        builder = Nokogiri::XML::Builder.new do |xml|
+          xml.eml(schema_data) do
+            build_body(xml)
+          end
+        end
+        save_eml(builder)
+      end
+      private
+      def build_body(xml)
+        build_dataset(xml)
+        build_additional_metadata(xml)
+        xml.parent.namespace = xml.parent.namespace_definitions.first
+      end
+      def save_eml(builder)
+        data = builder.to_xml
+        f = open(File.join(@path, "eml.xml"), @write)
+        f.write(data)
+        f.close
+      end
+      def build_dataset(xml)
+        xml.dataset(id: @data[:id]) do
+          xml.title(@data[:title])
+          xml.license(@data[:license])
+          contacts = []
+          build_authors(xml, contacts)
+          build_metadata_providers(xml)
+          xml.pubDate(Time.now.to_s)
+          build_abstract(xml)
+          build_contacts(xml, contacts)
+        end
+      end
+      def build_abstract(xml)
+        xml.abstract { xml.para(@data[:abstract]) }
+      end
+      def build_contacts(xml, contacts)
+        contacts.each { |contact| xml.contact { xml.references(contact) } }
+      end
+      def build_metadata_providers(xml)
+        @data[:metadata_providers].each do |a|
+          xml.metadataProvider { build_person(xml, a) }
+        end if @data[:metadata_providers]
+      end
+      def build_authors(xml, contacts)
+        @data[:authors].each_with_index do |a, i|
+          creator_id = i + 1
+          contacts << creator_id
+          xml.creator(id: creator_id, scope: "document") do
+            build_person(xml, a)
+          end
+        end
+      end
+      def build_additional_metadata(xml)
+        xml.additionalMetadata do
+          xml.metadata do
+            xml.citation(@data[:citation])
+            xml.resourceLogoUrl(@data[:logo_url]) if @data[:logo_url]
+          end
+        end
+      end
+      def build_person(xml, data)
+        a = data
+        xml.individualName do
+          xml.givenName(a[:first_name])
+          xml.surName(a[:last_name])
+        end
+        xml.organizationName(a[:organization]) if a[:organization]
+        xml.positionName(a[:position]) if a[:position]
+        xml.onlineUrl(a[:url]) if a[:url]
+        xml.electronicMailAddress(a[:email])
+      end
+      def timestamp
+        t = Time.now.getutc.to_a[0..5].reverse
+        t[0..2] * ("-") + "::" + t[-3..-1] * (":")
+      end
+    end
+  end
+end

data/lib/dwc_archive/generator_meta_xml.rb ADDED

@@ -0,0 +1,72 @@
+class DarwinCore
+  class Generator
+    # Creates DarwinCore meta file
+    class MetaXml
+      def initialize(data, path)
+        @data = data
+        @path = path
+        @write = "w:utf-8"
+      end
+      def create
+        schema_uri = "http://rs.tdwg.org/dwc/terms/xsd/archive/ "\
+          "http://darwincore.googlecode.com/svn/trunk/text/tdwg_dwc_text.xsd"
+        builder = Nokogiri::XML::Builder.new do |xml|
+          opts = { encoding: "UTF-8", fieldsTerminatedBy: ",",
+                   fieldsEnclosedBy: '"', linesTerminatedBy: "\n",
+                   rowType: "http://rs.tdwg.org/dwc/terms/Taxon" }
+          build_archive(xml, opts, schema_uri)
+        end
+        save_meta(builder)
+      end
+      private
+      def save_meta(builder)
+        meta_xml_data = builder.to_xml
+        meta_file = open(File.join(@path, "meta.xml"), @write)
+        meta_file.write(meta_xml_data)
+        meta_file.close
+      end
+      def build_archive(xml, opts, schema_uri)
+        xml.archive(xmlns: "http://rs.tdwg.org/dwc/text/",
+                    :"xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance",
+                    :"xsi:schemaLocation" => schema_uri) do
+          build_core(xml, opts)
+          build_extensions(xml, opts)
+        end
+      end
+      def build_core(xml, opts)
+        xml.core(opts.merge(ignoreHeaderLines:
+                              @data[:core][:ignoreHeaderLines])) do
+          xml.files { xml.location(@data[:core][:location]) }
+          taxon_id, fields = find_taxon_id(@data[:core][:fields])
+          xml.id_(index: taxon_id[1])
+          fields.each { |f| xml.field(term: f[0], index: f[1]) }
+        end
+      end
+      def build_extensions(xml, opts)
+        @data[:extensions].each do |e|
+          xml.extension(opts.merge(ignoreHeaderLines: e[:ignoreHeaderLines],
+                                   rowType: e[:rowType])) do
+            xml.files { xml.location(e[:location]) }
+            taxon_id, fields = find_taxon_id(e[:fields])
+            xml.coreid(index: taxon_id[1])
+            fields.each { |f| xml.field(term: f[0], index: f[1]) }
+          end
+        end
+      end
+      def find_taxon_id(data)
+        fields = []
+        data.each_with_index { |f, i| fields << [f.strip, i] }
+        taxon_id, fields = fields.partition { |f| f[0].match(/\/taxonid$/i) }
+        fail DarwinCore::GeneratorError if taxon_id.size != 1
+        [taxon_id[0], fields]
+      end
+    end
+  end
+end

data/lib/dwc_archive/gnub_taxon.rb ADDED

@@ -0,0 +1,14 @@
+# frozen_string_literal: true
+class DarwinCore
+  # Covers special case of Global Names Usage Bank data
+  class GnubTaxon < TaxonNormalized
+    attr_accessor :uuid, :uuid_path
+    def initialize
+      super
+      @uuid = nil
+      @uuid_path = []
+    end
+  end
+end

data/lib/dwc_archive/ingester.rb ADDED

@@ -0,0 +1,106 @@
+# encoding: utf-8
+class DarwinCore
+  # This module abstracts information for reading csv file to be used
+  # in several classes which need such functionality
+  module Ingester
+    attr_reader :data, :properties, :encoding, :fields_separator, :size
+    attr_reader :file_path, :fields, :line_separator, :quote_character,
+                :ignore_headers
+    def size
+      @size ||= init_size
+    end
+    def read(batch_size = 10_000)
+      DarwinCore.logger_write(@dwc.object_id, "Reading #{name} data")
+      res = []
+      errors = []
+      args = define_csv_args
+      min_size = @fields.map { |f| f[:index].to_i || 0 }.sort[-1] + 1
+      csv = CSV.new(open(@file_path), args)
+      csv.each_with_index do |r, i|
+        next if @ignore_headers && i == 0
+        min_size > r.size ? errors << r : process_csv_row(res, errors, r)
+        next if i == 0 || i % batch_size != 0
+        DarwinCore.logger_write(@dwc.object_id,
+                                format("Ingested %s records from %s",
+                                       i, name))
+        next unless block_given?
+        yield [res, errors]
+        res = []
+        errors = []
+      end
+      yield [res, errors] if block_given?
+      [res, errors]
+    end
+    private
+    def define_csv_args
+      args = { col_sep: @field_separator }
+      @quote_character = "\b" if @quote_character.empty?
+      args.merge(quote_char: @quote_character)
+    end
+    def name
+      self.class.to_s.split("::")[-1].downcase
+    end
+    def process_csv_row(result, errors, row)
+      str = row.join("")
+      str = str.force_encoding("utf-8")
+      if str.encoding.name == "UTF-8" && str.valid_encoding?
+        result << row.map { |f| f.nil? ? nil : f.force_encoding("utf-8") }
+      else
+        errors << row
+      end
+    end
+    def init_attributes
+      @properties = @data[:attributes]
+      init_encoding
+      @field_separator = init_field_separator
+      @quote_character = @properties[:fieldsEnclosedBy] || ""
+      @line_separator = @properties[:linesTerminatedBy] || "\n"
+      @ignore_headers = @properties[:ignoreHeaderLines] &&
+                        [1, true].include?(@properties[:ignoreHeaderLines])
+      init_file_path
+      init_fields
+    end
+    def init_encoding
+      @encoding = @properties[:encoding] || "UTF-8"
+      accepted_encoding = ["utf-8", "utf8", "utf-16", "utf16"].
+                          include?(@encoding.downcase)
+      fail(
+        DarwinCore::EncodingError,
+        "No support for encodings other than utf-8 or utf-16 at the moment"
+      ) unless accepted_encoding
+    end
+    def init_file_path
+      file = @data[:location] ||
+             @data[:attributes][:location] ||
+             @data[:files][:location]
+      @file_path = File.join(@path, file)
+      fail DarwinCore::FileNotFoundError, "No file data" unless @file_path
+    end
+    def init_fields
+      @data[:field] = [data[:field]] if data[:field].class != Array
+      @fields = @data[:field].map { |f| f[:attributes] }
+      fail DarwinCore::InvalidArchiveError,
+           "No data fields are found" if @fields.empty?
+    end
+    def init_field_separator
+      res = @properties[:fieldsTerminatedBy] || ","
+      res = "\t" if res == "\\t"
+      res
+    end
+    def init_size
+      `wc -l #{@file_path}`.match(/^\s*([\d]+)\s/)[1].to_i
+    end
+  end
+end