RubyGems - dwc-archive - Versions diffs - 0.9.11 → 1.0.1 - Mend

dwc-archive 0.9.11 → 1.0.1

Files changed (53) hide show

checksums.yaml +5 -5
data/.rspec +2 -1
data/.rubocop.yml +23 -0
data/.ruby-version +1 -1
data/.travis.yml +2 -3
data/CHANGELOG +2 -0
data/Gemfile +3 -1
data/README.md +110 -106
data/Rakefile +13 -36
data/dwc-archive.gemspec +24 -19
data/features/step_definitions/dwc-creator_steps.rb +5 -5
data/features/step_definitions/dwc-reader_steps.rb +47 -28
data/features/support/env.rb +1 -1
data/lib/dwc_archive.rb +121 -0
data/lib/dwc_archive/archive.rb +59 -0
data/lib/dwc_archive/classification_normalizer.rb +392 -0
data/lib/dwc_archive/core.rb +25 -0
data/lib/{dwc-archive → dwc_archive}/errors.rb +2 -0
data/lib/dwc_archive/expander.rb +88 -0
data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
data/lib/dwc_archive/generator.rb +90 -0
data/lib/{dwc-archive → dwc_archive}/generator_eml_xml.rb +40 -33
data/lib/{dwc-archive → dwc_archive}/generator_meta_xml.rb +21 -20
data/lib/dwc_archive/gnub_taxon.rb +14 -0
data/lib/dwc_archive/ingester.rb +106 -0
data/lib/dwc_archive/metadata.rb +56 -0
data/lib/dwc_archive/taxon_normalized.rb +23 -0
data/lib/dwc_archive/version.rb +6 -0
data/lib/dwc_archive/xml_reader.rb +89 -0
data/spec/files/generator_eml.xml +1 -1
data/spec/lib/classification_normalizer_spec.rb +96 -105
data/spec/lib/core_spec.rb +43 -41
data/spec/lib/darwin_core_spec.rb +111 -132
data/spec/lib/generator_eml_xml_spec.rb +12 -11
data/spec/lib/generator_meta_xml_spec.rb +12 -11
data/spec/lib/generator_spec.rb +73 -74
data/spec/lib/gnub_taxon_spec.rb +14 -16
data/spec/lib/metadata_spec.rb +50 -41
data/spec/lib/taxon_normalized_spec.rb +62 -65
data/spec/lib/xml_reader_spec.rb +9 -12
data/spec/spec_helper.rb +55 -49
metadata +92 -77
data/.rvmrc +0 -1
data/lib/dwc-archive.rb +0 -107
data/lib/dwc-archive/archive.rb +0 -40
data/lib/dwc-archive/classification_normalizer.rb +0 -427
data/lib/dwc-archive/core.rb +0 -19
data/lib/dwc-archive/expander.rb +0 -85
data/lib/dwc-archive/generator.rb +0 -86
data/lib/dwc-archive/ingester.rb +0 -101
data/lib/dwc-archive/metadata.rb +0 -48
data/lib/dwc-archive/version.rb +0 -3
data/lib/dwc-archive/xml_reader.rb +0 -80

@@ -1,19 +0,0 @@
-class DarwinCore
-  class Core
-    include DarwinCore::Ingester
-    attr_reader :id
-    def initialize(dwc)
-      @dwc = dwc
-      @archive = @dwc.archive
-      @path = @archive.files_path
-      root_key = @archive.meta.keys[0]
-      @data = @archive.meta[root_key][:core]
-      raise DarwinCore::CoreFileError.
-        new('Cannot find core in meta.xml, is meta.xml valid?') unless @data
-      @id = @data[:id][:attributes]
-      get_attributes(DarwinCore::CoreFileError)
-    end
-  end
-end

data/lib/dwc-archive/expander.rb DELETED

@@ -1,85 +0,0 @@
-class DarwinCore
-  class Expander
-    def initialize(archive_path, tmp_dir)
-      @archive_path = archive_path
-      @tmp_dir = tmp_dir
-      @path = File.join(tmp_dir, 'dwc_' + rand(10_000_000_000).to_s)
-      @unpacker = get_unpacker
-    end
-    def unpack
-      clean
-      raise DarwinCore::FileNotFoundError unless File.exists?(@archive_path)
-      success = @unpacker.call(@path, @archive_path) if @unpacker
-      (@unpacker && success && $?.exitstatus == 0) ?
-        success :
-        (clean; raise DarwinCore::UnpackingError)
-    end
-    def path
-      @files_path ||= files_path
-    end
-    def clean
-      FileUtils.rm_rf(@path) if FileTest.exists?(@path)
-    end
-    def files
-      return nil unless path && FileTest.exists?(path)
-      Dir.entries(path).select {|e| e !~ /[\.]{1,2}$/}.sort
-    end
-    private
-    def esc(a_str)
-      "'" + a_str.gsub(92.chr, '\\\\\\').gsub("'", "\\\\'") + "'"
-    end
-    def get_unpacker
-      file_command = IO.popen("file -z " + esc(@archive_path))
-      file_type    = file_command.read
-      file_command.close
-      if file_type.match(/tar.*gzip/i)
-        return proc do |tmp_path, archive_path|
-          FileUtils.mkdir tmp_path
-          path = esc(archive_path)
-          system("tar -zxf #{path} -C #{tmp_path} > /dev/null 2>&1")
-        end
-      end
-      if file_type.match(/Zip/)
-        return proc do |tmp_path, archive_path|
-          path = esc(archive_path)
-          system("unzip -qq -d #{tmp_path} #{path} > /dev/null 2>&1")
-        end
-      end
-      return nil
-    end
-    def path_entries(dir)
-      Dir.entries(dir).select {|e| e !~ /[\.]{1,2}$/}.sort
-    end
-    def files_path
-      res = nil
-      entries = path_entries(@path)
-      if entries.include?('meta.xml')
-        res = @path
-      else
-        entries.each do |e|
-          check_path = File.join(@path, e)
-          if FileTest.directory?(check_path)
-            if path_entries(check_path).include?('meta.xml')
-              res = check_path
-              break
-            end
-          end
-        end
-      end
-      res
-    end
-  end
-end

data/lib/dwc-archive/generator.rb DELETED

@@ -1,86 +0,0 @@
-class DarwinCore
-  class Generator
-    attr_reader :eml_xml_data
-    #TODO refactor -- for now copying expander methods
-    def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
-      @dwc_path = dwc_path
-      @path = File.join(tmp_dir, 'dwc_' + rand(10000000000).to_s)
-      FileUtils.mkdir(@path)
-      @meta_xml_data = {:extensions => []}
-      @eml_xml_data = {:id => nil, :title => nil,
-        :authors => [], :abstract => nil, :citation => nil, :url => nil}
-      @write = 'w:utf-8'
-    end
-    #TODO refactor!
-    def clean
-      FileUtils.rm_rf(@path) if FileTest.exists?(@path)
-    end
-    def add_core(data, file_name, keep_headers = true)
-      c = CSV.open(File.join(@path,file_name), @write)
-      header = data.shift
-      fields = get_fields(header, 'core')
-      data.unshift(fields) if keep_headers
-      ignore_header_lines = keep_headers ? 1 : 0
-      @meta_xml_data[:core] = { fields: header,
-                                ignoreHeaderLines: ignore_header_lines,
-                                location:file_name }
-      data.each {|d| c << d}
-      c.close
-    end
-    def add_extension(data, file_name,
-                      keep_headers = true,
-                      row_type = 'http://rs.tdwg.org/dwc/terms/Taxon')
-      c = CSV.open(File.join(@path,file_name), @write)
-      header = data.shift
-      fields = get_fields(header, 'extension')
-      data.unshift(fields) if keep_headers
-      ignore_header_lines = keep_headers ? 1 : 0
-      @meta_xml_data[:extensions] << { fields: header,
-                                       ignoreHeaderLines: ignore_header_lines,
-                                       location: file_name,
-                                       rowType: row_type }
-      data.each { |d| c << d }
-      c.close
-    end
-    def add_meta_xml
-      meta = DarwinCore::Generator::MetaXml.new(@meta_xml_data, @path)
-      meta.create
-    end
-    def add_eml_xml(data)
-      @eml_xml_data = data
-      eml = DarwinCore::Generator::EmlXml.new(@eml_xml_data, @path)
-      eml.create
-    end
-    def path
-      @path
-    end
-    def files
-      return nil unless @path && FileTest.exists?(@path)
-      Dir.entries(@path).select {|e| e !~ /[\.]{1,2}$/}.sort
-    end
-    def pack
-      a = "cd #{@path}; tar -zcf #{@dwc_path} *"
-      system(a)
-    end
-    private
-    def get_fields(header, file_type)
-      header.map do |f|
-        f.strip!
-        err = "No header in %s data, or header fields are not urls" % file_type
-        raise DarwinCore::GeneratorError.new(err) unless f.match(/^http:\/\//)
-        f.split('/')[-1]
-      end
-    end
-  end
-end

data/lib/dwc-archive/ingester.rb DELETED

@@ -1,101 +0,0 @@
-# encoding: utf-8
-class DarwinCore
-  module Ingester
-    attr_reader :data, :properties, :encoding, :fields_separator, :size
-    attr_reader :file_path, :fields, :line_separator,
-                :quote_character, :ignore_headers
-    def size
-      @size ||= get_size
-    end
-    def read(batch_size = 10000)
-      DarwinCore.logger_write(@dwc.object_id, "Reading %s data" % name)
-      res = []
-      errors = []
-      index_fix = 1
-      args = {:col_sep => @field_separator}
-      @quote_character = "\b" if @quote_character.empty?
-      args.merge!({:quote_char => @quote_character})
-      min_size = @fields.map {|f| f[:index].to_i || 0}.sort[-1] + 1
-      csv = CSV.new(open(@file_path), args)
-      csv.each_with_index do |r, i|
-        index_fix = 0; next if @ignore_headers && i == 0
-        min_size > r.size ? errors << r : process_csv_row(res, errors, r)
-        if (i + index_fix) % batch_size == 0
-          DarwinCore.logger_write(@dwc.object_id,
-                                  "Ingested %s records from %s" %
-                                  [(i + index_fix), name])
-          if block_given?
-            yield [res, errors]
-            res = []
-            errors = []
-          end
-        end
-      end
-      yield [res, errors] if block_given?
-      [res, errors]
-    end
-    private
-    def name
-      self.class.to_s.split('::')[-1].downcase
-    end
-    def process_csv_row(result, errors, row)
-      str = row.join('')
-      str = str.force_encoding('utf-8')
-      if str.encoding.name == 'UTF-8' && str.valid_encoding?
-        result << row.map { |f| f.nil? ? nil : f.force_encoding('utf-8') }
-      else
-        errors << row
-      end
-    end
-    def get_attributes(exception)
-      @properties = @data[:attributes]
-      @encoding = @properties[:encoding] || 'UTF-8'
-      err_msg = 'No support for encodings other ' +
-        'than utf-8 or utf-16 at the moment'
-      encodings = ['utf-8', 'utf8', 'utf-16', 'utf16']
-      unless encodings.include? @encoding.downcase
-        raise DarwinCore::EncodingError.new(err_msg)
-      end
-      @field_separator = get_field_separator
-      @quote_character = @properties[:fieldsEnclosedBy] || ''
-      @line_separator = @properties[:linesTerminatedBy] || '\n'
-      @ignore_headers = @properties[:ignoreHeaderLines] ?
-                        [1, true].include?(@properties[:ignoreHeaderLines]) :
-                        false
-      @file_path = get_file_path
-      raise DarwinCore::FileNotFoundError.new("No file data") unless @file_path
-      @fields = get_fields
-      if @fields.empty?
-        raise DarwinCore::InvalidArchiveError.new("No data fields are found")
-      end
-    end
-    def get_file_path
-      file = @data[:location] ||
-             @data[:attributes][:location] ||
-             @data[:files][:location]
-      File.join(@path, file)
-    end
-    def get_fields
-      @data[:field] = [data[:field]] if data[:field].class != Array
-      @data[:field].map {|f| f[:attributes]}
-    end
-    def get_field_separator
-      res = @properties[:fieldsTerminatedBy] || ','
-      res = "\t" if res == "\\t"
-      res
-    end
-    def get_size
-      `wc -l #{@file_path}`.match(/^\s*([\d]+)\s/)[1].to_i
-    end
-  end
-end

data/lib/dwc-archive/metadata.rb DELETED

@@ -1,48 +0,0 @@
-class DarwinCore
-  class Metadata
-    def initialize(archive = nil)
-      @archive = archive
-      @metadata = @archive.eml
-    end
-    def data
-      @metadata
-    end
-    def id
-      @metadata[:eml][:dataset][:attributes][:id] rescue nil
-    end
-    def package_id
-      @metadata.data[:eml][:attributes][:packageId] rescue nil
-    end
-    def title
-      @metadata[:eml][:dataset][:title] rescue nil
-    end
-    def authors
-      return nil unless defined?(@metadata[:eml][:dataset][:creator])
-      @metadata[:eml][:dataset][:creator] =
-        [@metadata[:eml][:dataset][:creator]] unless
-          @metadata[:eml][:dataset][:creator].class == Array
-      @metadata[:eml][:dataset][:creator].map do |c|
-        { first_name: c[:individualName][:givenName],
-          last_name: c[:individualName][:surName],
-          email: c[:electronicMailAddress] }
-      end
-    end
-    def abstract
-      @metadata[:eml][:dataset][:abstract] rescue nil
-    end
-    def citation
-      @metadata[:eml][:additionalMetadata][:metadata][:citation] rescue nil
-    end
-    def url
-      @metadata[:eml][:dataset][:distribution][:online][:url] rescue nil
-    end
-  end
-end

data/lib/dwc-archive/version.rb DELETED

@@ -1,3 +0,0 @@
-class DarwinCore
-  VERSION = "0.9.11"
-end

data/lib/dwc-archive/xml_reader.rb DELETED

@@ -1,80 +0,0 @@
-# USAGE: Hash.from_xml:(YOUR_XML_STRING)
-# modified from
-# http://stackoverflow.com/questions/1230741/
-# convert-a-nokogiri-document-to-a-ruby-hash/1231297#1231297
-class DarwinCore
-  module XmlReader
-    class << self
-      def from_xml(xml_io)
-        result = Nokogiri::XML(xml_io)
-        return { result.root.name.to_sym => xml_node_to_hash(result.root)}
-      end
-      private
-      def xml_node_to_hash(node)
-        # If we are at the root of the document, start the hash
-        if node.element?
-          prepare_node_element(node)
-        else
-          return prepare(node.content.to_s)
-        end
-      end
-      def add_attributes(node, result_hash)
-        if node.attributes != {}
-          result_hash[:attributes] = {}
-          node.attributes.keys.each do |key|
-            result_hash[:attributes][node.attributes[key].name.to_sym] =
-              prepare(node.attributes[key].value)
-          end
-        end
-      end
-      def prepare_node_element(node)
-        result_hash = {}
-        add_attributes(node, result_hash)
-        if node.children.size > 0
-          result_hash = add_children(node, result_hash)
-        end
-        result_hash
-      end
-      def add_children(node, result_hash)
-        node.children.each do |child|
-          result = xml_node_to_hash(child)
-          if child.name == "text"
-            text = handle_text(child, result)
-            return text if text
-          elsif result_hash[child.name.to_sym]
-            handle_child_node(child, result_hash, result)
-          else
-            result_hash[child.name.to_sym] = prepare(result)
-          end
-        end
-        result_hash
-      end
-      def handle_child_node(child, result_hash, result)
-        if result_hash[child.name.to_sym].is_a?(Object::Array)
-          result_hash[child.name.to_sym] << prepare(result)
-        else
-          result_hash[child.name.to_sym] =
-            [result_hash[child.name.to_sym]] << prepare(result)
-        end
-      end
-      def handle_text(child, result)
-        unless child.next_sibling || child.previous_sibling
-          prepare(result)
-        end
-      end
-      def prepare(data)
-        (data.class == String && data.to_i.to_s == data) ? data.to_i : data
-      end
-    end
-  end
-end