RubyGems - dwc-archive - Versions diffs - 0.9.6 → 0.9.10 - Mend

dwc-archive 0.9.6 → 0.9.10

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (41)

checksums.yaml +7 -0
data/.gitignore +30 -0
data/.rspec +2 -0
data/.ruby-version +1 -0
data/.travis.yml +6 -4
data/CHANGELOG +2 -0
data/Gemfile +1 -15
data/README.md +17 -5
data/Rakefile +6 -24
data/] +40 -0
data/dwc-archive.gemspec +33 -0
data/lib/dwc-archive.rb +33 -21
data/lib/dwc-archive/archive.rb +5 -2
data/lib/dwc-archive/classification_normalizer.rb +4 -0
data/lib/dwc-archive/core.rb +2 -2
data/lib/dwc-archive/expander.rb +6 -2
data/lib/dwc-archive/generator.rb +18 -8
data/lib/dwc-archive/generator_eml_xml.rb +16 -14
data/lib/dwc-archive/generator_meta_xml.rb +19 -11
data/lib/dwc-archive/ingester.rb +1 -1
data/lib/dwc-archive/metadata.rb +8 -2
data/lib/dwc-archive/version.rb +3 -0
data/lib/dwc-archive/xml_reader.rb +9 -9
data/spec/lib/classification_normalizer_spec.rb +223 -0
data/spec/lib/core_spec.rb +98 -0
data/spec/lib/darwin_core_spec.rb +279 -0
data/spec/lib/generator_eml_xml_spec.rb +21 -0
data/spec/lib/generator_meta_xml_spec.rb +21 -0
data/spec/lib/generator_spec.rb +116 -0
data/spec/lib/gnub_taxon_spec.rb +34 -0
data/spec/lib/metadata_spec.rb +80 -0
data/spec/lib/taxon_normalized_spec.rb +145 -0
data/spec/lib/xml_reader_spec.rb +13 -10
data/spec/spec_helper.rb +72 -3
metadata +133 -62
data/Gemfile.lock +0 -155
data/VERSION +0 -1
data/lib/dwc-archive/.expander.rb.swo +0 -0
data/lib/dwc-archive/utf_regex_ruby18.rb +0 -10
data/spec/lib/dwc-archive_spec.rb +0 -250
data/spec/spec.opts +0 -1

data/lib/dwc-archive/generator_meta_xml.rb CHANGED Viewed

@@ -8,23 +8,31 @@ class DarwinCore
       end
       def create
+        schema_uri =  'http://rs.tdwg.org/dwc/terms/xsd/archive/' +
+          ' http://darwincore.googlecode.com/svn/trunk/text/tdwg_dwc_text.xsd'
         builder = Nokogiri::XML::Builder.new do |xml|
-          opts = { :encoding => "UTF-8", :fieldsTerminatedBy => ",", :fieldsEnclosedBy => '"', :linesTerminatedBy => "\n", :rowType => "http://rs.tdwg.org/dwc/terms/Taxon" }
-          xml.archive(:xmlns => "http://rs.tdwg.org/dwc/text/",
-            "xmlns:xsi" =>"http://www.w3.org/2001/XMLSchema-instance",
-            "xsi:schemaLocation" => "http://rs.tdwg.org/dwc/terms/xsd/archive/ http://darwincore.googlecode.com/svn/trunk/text/tdwg_dwc_text.xsd") do
-            xml.core(opts.merge(:ignoreHeaderLines => @data[:core][:ignoreHeaderLines])) do
+          opts = { encoding: 'UTF-8',
+                   fieldsTerminatedBy: ',',
+                   fieldsEnclosedBy: '"',
+                   linesTerminatedBy: "\n",
+                   rowType: 'http://rs.tdwg.org/dwc/terms/Taxon' }
+          xml.archive(xmlns: 'http://rs.tdwg.org/dwc/text/',
+            :'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
+            :'xsi:schemaLocation' => schema_uri) do
+            xml.core(opts.merge(ignoreHeaderLines:
+                                  @data[:core][:ignoreHeaderLines])) do
               xml.files { xml.location(@data[:core][:location]) }
               taxon_id, fields = find_taxon_id(@data[:core][:fields])
-              xml.id_(:index => taxon_id[1])
-              fields.each { |f| xml.field(:term => f[0], :index => f[1]) }
+              xml.id_(index: taxon_id[1])
+              fields.each { |f| xml.field(term: f[0], index: f[1]) }
             end
             @data[:extensions].each do |e|
-              xml.extension(opts.merge(:ignoreHeaderLines => e[:ignoreHeaderLines], :rowType => e[:rowType])) do
+              xml.extension(opts.merge(ignoreHeaderLines: e[:ignoreHeaderLines],
+                                       rowType: e[:rowType])) do
                 xml.files { xml.location(e[:location]) }
                 taxon_id, fields = find_taxon_id(e[:fields])
-                xml.coreid(:index => taxon_id[1])
-                fields.each { |f| xml.field(:term => f[0], :index => f[1]) }
+                xml.coreid(index: taxon_id[1])
+                fields.each { |f| xml.field(term: f[0], index: f[1]) }
               end
             end
           end
@@ -39,7 +47,7 @@ class DarwinCore
       def find_taxon_id(data)
         fields = []
         data.each_with_index { |f, i| fields << [f.strip, i] }
-        taxon_id, fields = fields.partition { |f| f[0].match(/\/taxonid$/i) }
+        taxon_id, fields = fields.partition { |f| f[0].match(%r|/taxonid$|i) }
         raise DarwinCore::GeneratorError if taxon_id.size != 1
         [taxon_id[0], fields]
       end

data/lib/dwc-archive/ingester.rb CHANGED Viewed

@@ -62,7 +62,7 @@ class DarwinCore
         raise DarwinCore::EncodingError.new(err_msg)
       end
       @field_separator = get_field_separator
-      @quote_character = @properties[:fieldsEnclosedBy] || ""
+      @quote_character = @properties[:fieldsEnclosedBy] || ''
       @line_separator = @properties[:linesTerminatedBy] || '\n'
       @ignore_headers = @properties[:ignoreHeaderLines] ?
                         [1, true].include?(@properties[:ignoreHeaderLines]) :

data/lib/dwc-archive/metadata.rb CHANGED Viewed

@@ -23,8 +23,14 @@ class DarwinCore
     def authors
       return nil unless defined?(@metadata[:eml][:dataset][:creator])
-      @metadata[:eml][:dataset][:creator] = [@metadata[:eml][:dataset][:creator]] unless @metadata[:eml][:dataset][:creator].class == Array
-      @metadata[:eml][:dataset][:creator].map {|c| {:first_name => c[:individualName][:givenName], :last_name => c[:individualName][:surName], :email => c[:electronicMailAddress]}}
+      @metadata[:eml][:dataset][:creator] =
+        [@metadata[:eml][:dataset][:creator]] unless
+          @metadata[:eml][:dataset][:creator].class == Array
+      @metadata[:eml][:dataset][:creator].map do |c|
+        { first_name: c[:individualName][:givenName],
+          last_name: c[:individualName][:surName],
+          email: c[:electronicMailAddress] }
+      end
     end
     def abstract

data/lib/dwc-archive/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+class DarwinCore
+  VERSION = "0.9.10"
+end

data/lib/dwc-archive/xml_reader.rb CHANGED Viewed

@@ -1,16 +1,14 @@
 # USAGE: Hash.from_xml:(YOUR_XML_STRING)
 require 'nokogiri'
-# modified from http://stackoverflow.com/questions/1230741/convert-a-nokogiri-document-to-a-ruby-hash/1231297#1231297
+# modified from
+# http://stackoverflow.com/questions/1230741/
+# convert-a-nokogiri-document-to-a-ruby-hash/1231297#1231297
 class DarwinCore
   module XmlReader
     class << self
       def from_xml(xml_io)
-        begin
-          result = Nokogiri::XML(xml_io)
-          return { result.root.name.to_sym => xml_node_to_hash(result.root)}
-        rescue Exception => e
-          raise e
-        end
+        result = Nokogiri::XML(xml_io)
+        return { result.root.name.to_sym => xml_node_to_hash(result.root)}
       end
       private
@@ -22,7 +20,8 @@ class DarwinCore
           if node.attributes != {}
             result_hash[:attributes] = {}
             node.attributes.keys.each do |key|
-              result_hash[:attributes][node.attributes[key].name.to_sym] = prepare(node.attributes[key].value)
+              result_hash[:attributes][node.attributes[key].
+                name.to_sym] = prepare(node.attributes[key].value)
             end
           end
           if node.children.size > 0
@@ -37,7 +36,8 @@ class DarwinCore
                 if result_hash[child.name.to_sym].is_a?(Object::Array)
                   result_hash[child.name.to_sym] << prepare(result)
                 else
-                  result_hash[child.name.to_sym] = [result_hash[child.name.to_sym]] << prepare(result)
+                  result_hash[child.name.to_sym] =
+                    [result_hash[child.name.to_sym]] << prepare(result)
                 end
               else
                 result_hash[child.name.to_sym] = prepare(result)

data/spec/lib/classification_normalizer_spec.rb ADDED Viewed

@@ -0,0 +1,223 @@
+require_relative '../spec_helper'
+# encoding: utf-8
+describe DarwinCore::ClassificationNormalizer do
+  subject(:dwca) { DarwinCore.new(file_path) }
+  subject(:normalizer) { DarwinCore::ClassificationNormalizer.new(dwca) }
+  let(:file_dir) { File.expand_path('../../files', __FILE__) }
+  let(:file_path) { File.join(file_dir, file_name) }
+  describe '.new' do
+    let(:file_path) { File.join(file_dir, 'data.tar.gz') }
+    it { expect(normalizer.is_a? DarwinCore::ClassificationNormalizer).
+      to be_true }
+  end
+  describe '#normalize' do
+    let(:file_name) { 'data.tar.gz' }
+    it 'returns normalized data' do
+      res = normalizer.normalize
+      expect(res).to be normalizer.normalized_data
+    end
+    context 'flat list' do
+      let(:file_path) { File.join(file_dir, 'flat_list.tar.gz') }
+      it 'returns flat list' do
+        normalizer.normalize
+        expect(normalizer.normalized_data).to be_kind_of Hash
+        expect(normalizer.normalized_data.size).to be > 0
+      end
+    end
+    context 'synonyms from core' do
+      let(:file_name) { 'synonyms_in_core_accepted_name_field.tar.gz' }
+      it 'ingests synonyms using accepted_name field' do
+        res = normalizer.normalize
+        syn = res.select { |k,v| !v.synonyms.empty? }.
+          map { |k,v| v }
+        expect(syn.size).to be > 0
+        expect(syn[0].synonyms[0]).to be_kind_of DarwinCore::SynonymNormalized
+      end
+    end
+    context 'synonyms from extension' do
+      let(:file_name) { 'synonyms_in_extension.tar.gz' }
+      it 'ingests synonyms from extension' do
+        res = normalizer.normalize
+        syn = res.select { |k,v| !v.synonyms.empty? }.
+          map { |k,v| v }
+        expect(syn.size).to be > 0
+        expect(syn[0].synonyms[0]).to be_kind_of DarwinCore::SynonymNormalized
+      end
+    end
+    context 'synonyms are not extensions' do
+      let(:file_name) { 'not_synonym_in_extension.tar.gz' }
+      it 'does not ingest synonyms' do
+        res = normalizer.normalize
+        syn = res.select { |k,v| !v.synonyms.empty? }.
+          map { |k,v| v }
+        expect(syn).to be_empty
+      end
+    end
+    context 'with_extensions flag set on false' do
+      let(:file_name) { 'synonyms_in_extension.tar.gz' }
+      it 'should not harvest extensions' do
+        res = normalizer.normalize(with_extensions: false)
+        syn = res.select { |k,v| !v.synonyms.empty? }.
+          map { |k,v| v }
+        expect(syn).to be_empty
+      end
+    end
+    context 'linnean classification in file (class, order etc fields)' do
+      let(:file_name) { 'linnean.tar.gz' }
+      it 'assembles classification' do
+        res = normalizer.normalize
+        expect(res.first[1]).to be_kind_of DarwinCore::TaxonNormalized
+        expect(res.first[1].linnean_classification_path).
+          to eq [["Animalia", :kingdom],
+                 ["Arthropoda", :phylum],
+                 ["Insecta", :class],
+                 ["Diptera", :order],
+                 ["Cecidomyiidae", :family],
+                 ["Resseliella", :genus]]
+      end
+    end
+    context 'no linnean fields are given' do
+      it 'returns empty linnean classification' do
+        res = normalizer.normalize
+        expect(res.first[1]).to be_kind_of DarwinCore::TaxonNormalized
+        expect(res.first[1].linnean_classification_path).to be_empty
+      end
+    end
+    context 'in the presence of scientificNameAuthorship field' do
+      let(:file_name) { 'sci_name_authorship.tar.gz' }
+      it 'returns normalized data' do
+        normalizer.normalize
+        expect(normalizer.darwin_core.file_name).
+          to eq 'sci_name_authorship.tar.gz'
+        expect(normalizer.normalized_data).to be_kind_of Hash
+        expect(normalizer.normalized_data.size).to be > 0
+        tn = normalizer.normalized_data['leptogastrinae:tid:2688']
+        expect(tn.current_name).to eq 'Leptogaster fornicata Martin, 1957'
+        expect(tn.current_name_canonical).to eq 'Leptogaster fornicata'
+      end
+    end
+    context 'when scientificNameAuthorship duplicates author info' do
+      let(:file_name) { 'sci_name_authorship_dup.tar.gz' }
+      it 'returns normalized data' do
+        normalizer.normalize
+        expect(normalizer.darwin_core.file_name).
+          to eq 'sci_name_authorship_dup.tar.gz'
+        expect(normalizer.normalized_data).to be_kind_of Hash
+        expect(normalizer.normalized_data.size).to be > 0
+        tn = normalizer.normalized_data['leptogastrinae:tid:2688']
+        expect(tn.current_name).to eq 'Leptogaster fornicata Martin, 1957'
+        expect(tn.current_name_canonical).to eq 'Leptogaster fornicata'
+      end
+    end
+    context 'coreid is empty' do
+      let(:file_name) { 'empty_coreid.tar.gz' }
+      it 'should ingest information' do
+        res = normalizer.normalize
+        expect(normalizer.darwin_core.file_name).
+          to eq 'empty_coreid.tar.gz'
+        tn = res['Taxon9']
+        expect(tn.current_name).to eq 'Amanita phalloides'
+      end
+    end
+    context 'vernacular locality info' do
+      let(:file_name) { 'language_locality.tar.gz' }
+      it 'should ingest locality and language' do
+        res = normalizer.normalize
+        tn = res['leptogastrinae:tid:42']
+        vn = tn.vernacular_names[0]
+        expect(vn.language).to eq 'en'
+        expect(vn.locality).to eq 'New England'
+      end
+    end
+  end
+  describe '#name_strings' do
+    let(:file_path) { File.join(file_dir, 'flat_list.tar.gz') }
+    context 'before running #normalize' do
+      it 'is empty' do
+        expect(normalizer.name_strings).to be_empty
+      end
+    end
+    context 'after running #normalize' do
+      let(:normalized) { normalizer.tap { |n| n.normalize } }
+      context 'default attibutes' do
+        it 'returns array' do
+          expect(normalized.name_strings).to be_kind_of Array
+          expect(normalized.name_strings.size).to be > 1
+        end
+      end
+      context 'with_hash attribute' do
+        it 'returns hash' do
+          strings = normalized.name_strings(with_hash:true)
+          expect(strings).to be_kind_of Hash
+          expect(strings.size).to be > 1
+          expect(strings.values.uniq).to eq [1]
+        end
+      end
+    end
+  end
+  describe '#vernacular_name_strings' do
+    let(:file_path) { File.join(file_dir, 'flat_list.tar.gz') }
+    context 'before running #normalize' do
+      subject(:vern) { normalizer.vernacular_name_strings }
+      it 'is empty' do
+        expect(vern).to be_empty
+      end
+    end
+    context 'after running #normalize' do
+      let(:normalized) { normalizer.tap { |n| n.normalize } }
+      subject(:vern) { normalized.vernacular_name_strings }
+      subject(:vern_w_hash) { normalized.
+        vernacular_name_strings(with_hash: true) }
+      context 'default attibutes' do
+        it 'returns array' do
+          expect(vern).to be_kind_of Array
+          expect(vern.size).to be > 0
+        end
+      end
+      context 'with_hash attribute' do
+        it 'returns hash' do
+          expect(vern_w_hash).to be_kind_of Hash
+          expect(vern_w_hash.size).to be > 0
+          expect(vern_w_hash.values.uniq).to eq [1]
+        end
+      end
+    end
+  end
+end

data/spec/lib/core_spec.rb ADDED Viewed

@@ -0,0 +1,98 @@
+require_relative '../spec_helper'
+describe DarwinCore::Core do
+  subject(:dwca) { DarwinCore.new(file_path) }
+  subject(:core) { DarwinCore::Core.new(dwca) }
+  let(:file_path) { File.join(File.expand_path('../../files', __FILE__),
+                             file_name) }
+  let(:file_name) { 'data.tar.gz' }
+  describe '.new' do
+    it 'creates new core' do
+      expect(core).to be_kind_of DarwinCore::Core
+    end
+  end
+  describe '#id' do
+    it 'returns core id' do
+      expect(core.id[:index]).to eq 0
+      expect(core.id[:term]).to eq 'http://rs.tdwg.org/dwc/terms/TaxonID'
+    end
+    context 'no coreid' do
+      let(:file_name) { 'empty_coreid.tar.gz' }
+      it 'does not return coreid' do
+        expect(core.id[:index]).to eq 0
+        expect(core.id[:term]).to be_nil
+      end
+    end
+  end
+  describe '#data' do
+    it 'gers core data' do
+      expect(core.data).to be_kind_of Hash
+    end
+  end
+  describe '#properties' do
+    it 'gers core properties' do
+      expect(core.properties).to be_kind_of Hash
+      expect(core.properties.keys).to match_array [:encoding,
+        :fieldsTerminatedBy, :linesTerminatedBy, :fieldsEnclosedBy,
+        :ignoreHeaderLines, :rowType ]
+    end
+  end
+  describe '#encoding' do
+    it 'returns encoding of the data' do
+      expect(core.encoding).to eq 'UTF-8'
+    end
+  end
+  describe '#fields_separator' do
+    it 'returns separator of fields for csv files' do
+      expect(core.fields_separator).to be_nil
+    end
+  end
+  describe '#size' do
+    it 'returns number of lines in the core' do
+      expect(core.size).to eq 588
+    end
+  end
+  describe '#file_path' do
+    it 'returns file path of core file' do
+      expect(core.file_path).to match 'DarwinCore.txt'
+    end
+  end
+  describe '#fields' do
+    it 'returns fields of the core file' do
+      expect(core.fields.size).to eq 7
+      expect(core.fields).to be_kind_of Array
+      expect(core.fields[0]).to be_kind_of Hash
+    end
+  end
+  describe '#line_separator' do
+    it 'returns characters separating lines in csv file' do
+      expect(core.line_separator).to eq "\\n"
+    end
+  end
+  describe '#quote_character' do
+    it 'returns quote character for the csv file' do
+      expect(core.quote_character).to eq ''
+    end
+  end
+  describe '#ignore headers' do
+    it 'returns true if headers should not be included into data' do
+      expect(core.ignore_headers).to eq true
+    end
+  end
+end