RubyGems - dwc-archive - Versions diffs - 0.3.1 → 0.4.0 - Mend

dwc-archive 0.3.1 → 0.4.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (14)

data/README.rdoc +9 -0
data/VERSION +1 -1
data/features/dwca-reader.feature +6 -0
data/features/step_definitions/dwc-reader_steps.rb +9 -3
data/lib/dwc-archive.rb +11 -0
data/lib/dwc-archive/classification_normalizer.rb +139 -0
data/lib/dwc-archive/extension.rb +1 -0
data/lib/dwc-archive/ingester.rb +5 -10
data/spec/files/flat_list.tar.gz +0 -0
data/spec/files/synonyms_in_core_accepted_name_field.tar.gz +0 -0
data/spec/files/synonyms_in_extension.tar.gz +0 -0
data/spec/lib/dwc-archive_spec.rb +60 -1
data/spec/lib/ruby_extenstions_spec.rb +1 -1
metadata +8 -9

data/README.rdoc CHANGED Viewed

@@ -44,6 +44,15 @@ Update to latest rubygems (v >= 1.3.6) which adds gemcutter sources by default.
     end
     results << [tail_data, tail_errors]
+    # normalize names in classification collecting together synonyms, canonical names,
+    # vernacular names and associating paths to taxons in a classification
+    # distributed as DwCA file
+    # NOTE: this functionality requires biodiversity gem for ruby 1.8.x and
+    # biodiversity19 gem for ruby 1.9.x
+    result = dwc.normalize_classification
     DarwinCore.clean_all   # remove all expanded archives
 == Note on Patches/Pull Requests

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.3.1
1	+ 0.4.0

data/features/dwca-reader.feature CHANGED Viewed

@@ -54,3 +54,9 @@ Feature: Reading of a Darwing Core Archive
     When I create a new DarwinCore instance
     Then I can read its core content using block
     Then I can read extensions content using block
+  Scenario: Normalizing classification
+    Given path to a dwc file "data.tar.gz"
+    When I create a new DarwinCore instance
+    Then I am able to use DarwinCore#normalize_classification method
+    And get normalized classification in expected format

data/features/step_definitions/dwc-reader_steps.rb CHANGED Viewed

@@ -129,9 +129,6 @@ Then /^I can read its content into memory$/ do
   core_data.class.should == Array
   core_data.size.should == 584
   core_errors.size.should == 3
-  core_data, core_errors = @dwc.core.read(5)
-  core_data.size.should == 5
-  core_errors.size.should == 0
 end
 Then /^I can read extensions content into memory$/ do
@@ -164,3 +161,12 @@ Then /^I can read extensions content using block$/ do
   res.should == [[1,0]]
 end
+Then /^I am able to use DarwinCore\#normalize_classification method$/ do
+  @normalized_classification = @dwc.normalize_classification
+end
+Then /^get normalized classification in expected format$/ do
+  @normalized_classification.class.should == Hash
+  key = @normalized_classification.keys[0]
+  @normalized_classification[key].class.should == DarwinCore::TaxonNormalized
+end

data/lib/dwc-archive.rb CHANGED Viewed

@@ -22,6 +22,7 @@ require 'dwc-archive/metadata'
 require 'dwc-archive/generator'
 require 'dwc-archive/generator_meta_xml'
 require 'dwc-archive/generator_eml_xml'
+require 'dwc-archive/classification_normalizer'
 class DarwinCore
   attr_reader :archive, :core, :metadata, :extensions
@@ -29,6 +30,11 @@ class DarwinCore
   DEFAULT_TMP_DIR = "/tmp"
+  def self.nil_field?(field)
+    return true if [nil, '', '/N'].include?(field)
+    false
+  end
   def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
     @archive = DarwinCore::Archive.new(dwc_path, tmp_dir)
     @core = DarwinCore::Core.new(@archive)
@@ -36,6 +42,11 @@ class DarwinCore
     @extensions = get_extensions
   end
+  def normalize_classification
+    return nil unless core.fields.map { |f| f[:term].split('/')[-1].downcase }.include? 'highertaxonid'
+    DarwinCore::ClassificationNormalizer.new(self).normalize
+  end
   def self.clean_all(tmp_dir = DEFAULT_TMP_DIR)
     Dir.entries(tmp_dir).each do |entry|
       path = File.join(tmp_dir, entry)

data/lib/dwc-archive/classification_normalizer.rb ADDED Viewed

@@ -0,0 +1,139 @@
+# encoding: utf-8
+require 'biodiversity'
+class DarwinCore
+  class TaxonNormalized
+    attr_accessor :id, :parent_id, :classification_path, :current_name, :current_name_canonical, :synonyms, :vernacular_names, :rank, :status
+    def initialize
+      @id = @parent_id = @classification_path = @current_name = @current_name_canonical = @rank = @status = nil
+      @synonyms = []
+      @vernacular_names = []
+    end
+  end
+  class SynonymNormalized < Struct.new(:name, :canonical_name, :status);end
+  class VernacularNormalized < Struct.new(:name, :language);end
+  class ClassificationNormalizer
+    def initialize(dwc_instance)
+      @dwc = dwc_instance
+      @core = get_fields(@dwc.core)
+      @extensions = @dwc.extensions.map { |e| [e, get_fields(e)] }
+      @res = {}
+      @parser = ScientificNameParser.new
+    end
+    def normalize
+      injest_core
+      calculate_classification_path
+      injest_extensions
+      @res
+    end
+  private
+    def canonical_name(a_scientific_name)
+      if R19
+        a_scientific_name.force_encoding('utf-8')
+      end
+      begin
+        parsed_name = @parser.parse(a_scientific_name)[:scientificName]
+      rescue
+        @parser = ScientificNameParser.new
+        parsed_name = @parser.parse(a_scientific_name)[:scientificName]
+      end
+      parsed_name[:parsed] ? parsed_name[:canonical] : a_scientific_name
+    end
+    def get_fields(element)
+      data = element.fields.inject({}) { |res, f| res[f[:term].split('/')[-1].downcase.to_sym] = f[:index].to_i; res }
+      data[:id] = element.id[:index]
+      data
+    end
+    def status_synonym?(status)
+      status && !!status.match(/^syn/)
+    end
+    def add_synonym_from_core(taxon_id, row)
+      taxon = @res[row[taxon_id]] ? @res[row[taxon_id]] : @res[row[taxon_id]] = DarwinCore::TaxonNormalized.new
+      taxon.synonyms << SynonymNormalized.new(
+        row[@core[:scientificname]],
+        canonical_name(row[@core[:scientificname]]),
+        row[@core[:taxonomicstatus]])
+    end
+    def injest_core
+      raise RuntimeError, "Darwin Core core fields must contain taxon id and scientific name" unless (@core[:id] && @core[:scientificname])
+      @dwc.core.read[0].each do |r|
+        #core has AcceptedNameUsageId
+        if @core[:acceptednameusageid] && r[@core[:acceptednameusageid]] && r[@core[:acceptednameusageid]] != r[@core[:id]]
+          add_synonym_from_core(@core[:acceptednameusageid], r)
+        elsif !@core[:acceptednameusageid] && status_synonym?(r[@core[:taxonomicstatus]])
+          add_synonym_from_core(@core[:highertaxonid], r)
+        else
+          taxon = @res[r[@core[:id]]] ? @res[r[@core[:id]]] : @res[r[@core[:id]]] = DarwinCore::TaxonNormalized.new
+          taxon.id = r[@core[:id]]
+          taxon.current_name = r[@core[:scientificname]]
+          taxon.current_name_canonical = canonical_name(r[@core[:scientificname]])
+          taxon.parent_id = r[@core[:highertaxonid]]
+          taxon.rank = r[@core[:taxonrank]]
+          taxon.status = r[@core[:taxonomicstatus]]
+        end
+      end
+    end
+    def calculate_classification_path
+      @res.each do |taxon_id, taxon|
+        next if taxon.classification_path
+        get_classification_path(taxon)
+      end
+    end
+    def get_classification_path(taxon)
+      return if taxon.classification_path
+      if DarwinCore.nil_field?(taxon.parent_id)
+        taxon.classification_path = [taxon.current_name_canonical]
+      else
+         parent_cp = @res[taxon.parent_id].classification_path
+        if parent_cp
+          taxon.classification_path = parent_cp + [taxon.current_name_canonical]
+        else
+          get_classification_path(@res[taxon.parent_id])
+          taxon.classification_path = @res[taxon.parent_id].classification_path + [taxon.current_name_canonical]
+        end
+      end
+    end
+    def injest_extensions
+      @extensions.each do |e|
+        ext, fields = *e
+        injest_synonyms(e) if fields.keys.include? :scientificname
+        injest_vernaculars(e) if fields.keys.include? :vernacularname
+      end
+    end
+    def injest_synonyms(extension)
+      ext, fields = *extension
+      ext.read[0].each do |r|
+        @res[r[fields[:id]]].synonyms << SynonymNormalized.new(
+          r[fields[:scientificname]],
+          canonical_name(r[fields[:scientificname]]),
+          r[fields[:taxonomicstatus]])
+      end
+    end
+    def injest_vernaculars(extension)
+      ext, fields = *extension
+      ext.read[0].each do |r|
+        @res[r[fields[:id]]].vernacular_names << VernacularNormalized.new(
+          r[fields[:vernacularname]],
+          r[fields[:languagecode]])
+      end
+    end
+  end
+end

data/lib/dwc-archive/extension.rb CHANGED Viewed

@@ -2,6 +2,7 @@ class DarwinCore
   class Extension
     include DarwinCore::Ingester
     attr_reader :coreid
+    alias :id :coreid
     def initialize(archive, data)
       @archive = archive

data/lib/dwc-archive/ingester.rb CHANGED Viewed

@@ -2,8 +2,7 @@ class DarwinCore
   module Ingester
     attr_reader :data, :properties, :encoding, :fields_separator
     attr_reader :file_path, :fields, :line_separator, :quote_character, :ignore_headers
-    def read(batch_size = nil)
+    def read(batch_size = 10000)
       res = []
       errors = []
       index_fix = 1
@@ -13,14 +12,10 @@ class DarwinCore
       CSV.open(@file_path, args).each_with_index do |r, i|
         index_fix = 0; next if @ignore_headers && i == 0
         min_size > r.size ? errors << r : process_csv_row(res, errors, r)
-        if batch_size.to_i > 0 && (i + index_fix) % batch_size == 0
-          if block_given?
-            yield [res, errors]
-            res = []
-            errors = []
-          else
-            return [res, errors]
-          end
+        if block_given? && (i + index_fix) % batch_size == 0
+          yield [res, errors]
+          res = []
+          errors = []
         end
       end
       [res, errors]

data/spec/files/flat_list.tar.gz ADDED Viewed

Binary file

data/spec/files/synonyms_in_core_accepted_name_field.tar.gz ADDED Viewed

Binary file

data/spec/files/synonyms_in_extension.tar.gz ADDED Viewed

Binary file

data/spec/lib/dwc-archive_spec.rb CHANGED Viewed

@@ -1,10 +1,24 @@
-require File.dirname(__FILE__) + "/../spec_helper"
+require File.expand_path(File.dirname(__FILE__) + "/../spec_helper")
 describe DarwinCore do
   before(:all) do
     @file_dir = File.join(File.dirname(__FILE__), '..', 'files')
   end
+  describe "::nil_field?" do
+    it "should return true for entries which normally mean nil" do
+      [nil, '/N', ''].each do |i|
+        DarwinCore.nil_field?(i).should be_true
+      end
+    end
+    it "should return false for fields that are not nil" do
+      [0, '0', '123', 123, 'dsdfs434343/N'].each do |i|
+        DarwinCore.nil_field?(i).should be_false
+      end
+    end
+  end
   describe ".new" do
     it "should create DarwinCore instance out of archive file" do
       ['data.zip', 'data.tar.gz', 'minimal.tar.gz', 'junk_dir_inside.zip'].each do |file|
@@ -35,4 +49,49 @@ describe DarwinCore do
       dwc.archive.valid?.should be_true
     end
   end
+  describe ".normalize_classification" do
+    it "should return nil if file has no parent id information" do
+      file = File.join(@file_dir, 'flat_list.tar.gz')
+      dwc = DarwinCore.new(file)
+      dwc.normalize_classification.should be_nil
+    end
+    it "should traverse DarwinCore files and assemble data for every node in memory" do
+      file = File.join(@file_dir, 'data.tar.gz')
+      dwc = DarwinCore.new(file)
+      norm = dwc.normalize_classification
+      norm.class.should == Hash
+      norm['leptogastrinae:tid:2857'].class.should == DarwinCore::TaxonNormalized
+    end
+    it "should be able to assemble vernacular names from an extension" do
+      file = File.join(@file_dir, 'data.tar.gz')
+      dwc = DarwinCore.new(file)
+      norm = dwc.normalize_classification
+      norm.select { |k,v| !v.vernacular_names.empty? }.map { |k,v| v.vernacular_names }.size.should > 0
+    end
+    it "should be able to assemble synonyms from extension" do
+      file = File.join(@file_dir, 'synonyms_in_extension.tar.gz')
+      dwc = DarwinCore.new(file)
+      norm = dwc.normalize_classification
+      norm.select { |k,v| !v.synonyms.empty? }.map { |k,v| v.synonyms }.size.should > 0
+    end
+    it "should be able to assemble synonyms from extension" do
+      file = File.join(@file_dir, 'synonyms_in_core_accepted_name_field.tar.gz')
+      dwc = DarwinCore.new(file)
+      norm = dwc.normalize_classification
+      norm.select { |k,v| !v.synonyms.empty? }.map { |k,v| v.synonyms }.size.should > 0
+    end
+    it "should be able to assemble synonyms from extension" do
+      file = File.join(@file_dir, 'data.tar.gz')
+      dwc = DarwinCore.new(file)
+      norm = dwc.normalize_classification
+      norm.select { |k,v| !v.synonyms.empty? }.map { |k,v| v.synonyms }.size.should > 0
+    end
+  end
 end

data/spec/lib/ruby_extenstions_spec.rb CHANGED Viewed

@@ -1,4 +1,4 @@
-require File.dirname(__FILE__) + "/../spec_helper"
+require File.expand_path(File.dirname(__FILE__) + "/../spec_helper")
 describe "Hash" do
   it "should parse xml to hash" do

metadata CHANGED Viewed

@@ -1,13 +1,12 @@
 --- !ruby/object:Gem::Specification
 name: dwc-archive
 version: !ruby/object:Gem::Version
-  hash: 17
   prerelease: false
   segments:
   - 0
-  - 3
-  - 1
-  version: 0.3.1
+  - 4
+  - 0
+  version: 0.4.0
 platform: ruby
 authors:
 - Dmitry Mozzherin
@@ -15,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-08-11 00:00:00 -04:00
+date: 2010-09-09 00:00:00 -04:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -26,7 +25,6 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        hash: 13
         segments:
         - 1
         - 2
@@ -42,7 +40,6 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        hash: 3
         segments:
         - 0
         version: "0"
@@ -72,6 +69,7 @@ files:
 - lib/dwc-archive.rb
 - lib/dwc-archive/.expander.rb.swo
 - lib/dwc-archive/archive.rb
+- lib/dwc-archive/classification_normalizer.rb
 - lib/dwc-archive/core.rb
 - lib/dwc-archive/errors.rb
 - lib/dwc-archive/expander.rb
@@ -88,10 +86,13 @@ files:
 - spec/files/data.zip
 - spec/files/eml.xml
 - spec/files/file with characters(3).gz
+- spec/files/flat_list.tar.gz
 - spec/files/invalid.tar.gz
 - spec/files/junk_dir_inside.zip
 - spec/files/meta.xml
 - spec/files/minimal.tar.gz
+- spec/files/synonyms_in_core_accepted_name_field.tar.gz
+- spec/files/synonyms_in_extension.tar.gz
 - spec/files/uncompressed
 - spec/lib/dwc-archive_spec.rb
 - spec/lib/ruby_extenstions_spec.rb
@@ -111,7 +112,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      hash: 3
       segments:
       - 0
       version: "0"
@@ -120,7 +120,6 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      hash: 3
       segments:
       - 0
       version: "0"