RubyGems - dwc-archive - Versions diffs - 0.9.6 → 0.9.10 - Mend

dwc-archive 0.9.6 → 0.9.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

checksums.yaml +7 -0
data/.gitignore +30 -0
data/.rspec +2 -0
data/.ruby-version +1 -0
data/.travis.yml +6 -4
data/CHANGELOG +2 -0
data/Gemfile +1 -15
data/README.md +17 -5
data/Rakefile +6 -24
data/] +40 -0
data/dwc-archive.gemspec +33 -0
data/lib/dwc-archive.rb +33 -21
data/lib/dwc-archive/archive.rb +5 -2
data/lib/dwc-archive/classification_normalizer.rb +4 -0
data/lib/dwc-archive/core.rb +2 -2
data/lib/dwc-archive/expander.rb +6 -2
data/lib/dwc-archive/generator.rb +18 -8
data/lib/dwc-archive/generator_eml_xml.rb +16 -14
data/lib/dwc-archive/generator_meta_xml.rb +19 -11
data/lib/dwc-archive/ingester.rb +1 -1
data/lib/dwc-archive/metadata.rb +8 -2
data/lib/dwc-archive/version.rb +3 -0
data/lib/dwc-archive/xml_reader.rb +9 -9
data/spec/lib/classification_normalizer_spec.rb +223 -0
data/spec/lib/core_spec.rb +98 -0
data/spec/lib/darwin_core_spec.rb +279 -0
data/spec/lib/generator_eml_xml_spec.rb +21 -0
data/spec/lib/generator_meta_xml_spec.rb +21 -0
data/spec/lib/generator_spec.rb +116 -0
data/spec/lib/gnub_taxon_spec.rb +34 -0
data/spec/lib/metadata_spec.rb +80 -0
data/spec/lib/taxon_normalized_spec.rb +145 -0
data/spec/lib/xml_reader_spec.rb +13 -10
data/spec/spec_helper.rb +72 -3
metadata +133 -62
data/Gemfile.lock +0 -155
data/VERSION +0 -1
data/lib/dwc-archive/.expander.rb.swo +0 -0
data/lib/dwc-archive/utf_regex_ruby18.rb +0 -10
data/spec/lib/dwc-archive_spec.rb +0 -250
data/spec/spec.opts +0 -1

data/Gemfile.lock DELETED Viewed

@@ -1,155 +0,0 @@
-GEM
-  remote: https://rubygems.org/
-  specs:
-    abstract (1.0.0)
-    actionpack (3.0.8)
-      activemodel (= 3.0.8)
-      activesupport (= 3.0.8)
-      builder (~> 2.1.2)
-      erubis (~> 2.6.6)
-      i18n (~> 0.5.0)
-      rack (~> 1.2.1)
-      rack-mount (~> 0.6.14)
-      rack-test (~> 0.5.7)
-      tzinfo (~> 0.3.23)
-    activemodel (3.0.8)
-      activesupport (= 3.0.8)
-      builder (~> 2.1.2)
-      i18n (~> 0.5.0)
-    activesupport (3.0.8)
-    archive-tar-minitar (0.5.2)
-    awesome_print (1.1.0)
-    binding_of_caller (0.7.1)
-      debug_inspector (>= 0.0.1)
-    biodiversity (3.1.0)
-      parallel
-      parallel (~> 0.6)
-      rake (~> 10.0)
-      treetop
-      treetop (~> 1.4)
-      unicode_utils (~> 1.4)
-    builder (2.1.2)
-    coderay (1.0.9)
-    columnize (0.3.6)
-    coolline (0.4.2)
-    cucumber (1.3.1)
-      builder (>= 2.1.2)
-      diff-lcs (>= 1.1.3)
-      gherkin (~> 2.12.0)
-      multi_json (~> 1.3)
-    debug_inspector (0.0.2)
-    debugger (1.5.0)
-      columnize (>= 0.3.1)
-      debugger-linecache (~> 1.2.0)
-      debugger-ruby_core_source (~> 1.2.0)
-    debugger-linecache (1.2.0)
-    debugger-ruby_core_source (1.2.0)
-    diff-lcs (1.2.4)
-    diffy (2.1.4)
-    erubis (2.6.6)
-      abstract (>= 1.0.0)
-    gherkin (2.12.0)
-      multi_json (~> 1.3)
-    git (1.2.5)
-    grit (2.5.0)
-      diff-lcs (~> 1.1)
-      mime-types (~> 1.15)
-      posix-spawn (~> 0.3.6)
-    hirb (0.7.1)
-    i18n (0.5.0)
-    jazz_hands (0.5.0)
-      awesome_print (~> 1.1.0)
-      coderay (~> 1.0.9)
-      coolline (>= 0.4.0)
-      hirb (~> 0.7.1)
-      pry (~> 0.9.12)
-      pry-debugger (~> 0.2.2)
-      pry-doc (~> 0.4.4)
-      pry-git (~> 0.2.3)
-      pry-rails (~> 0.2.2)
-      pry-remote (>= 0.1.7)
-      pry-stack_explorer (~> 0.4.9)
-      railties (>= 3.0, < 5.0)
-    jeweler (1.8.4)
-      bundler (~> 1.0)
-      git (>= 1.2.5)
-      rake
-      rdoc
-    json (1.7.7)
-    method_source (0.8.1)
-    mime-types (1.23)
-    multi_json (1.7.3)
-    nokogiri (1.5.9)
-    parallel (0.7.0)
-    parsley-store (0.3.2)
-      biodiversity (~> 3.1.0)
-      jeweler (~> 1.8)
-      redis (~> 3.0)
-    polyglot (0.3.3)
-    posix-spawn (0.3.6)
-    pry (0.9.12.1)
-      coderay (~> 1.0.5)
-      method_source (~> 0.8)
-      slop (~> 3.4)
-    pry-debugger (0.2.2)
-      debugger (~> 1.3)
-      pry (~> 0.9.10)
-    pry-doc (0.4.5)
-      pry (>= 0.9)
-      yard (>= 0.8)
-    pry-git (0.2.3)
-      diffy
-      grit
-      pry (>= 0.9.8)
-    pry-rails (0.2.2)
-      pry (>= 0.9.10)
-    pry-remote (0.1.7)
-      pry (~> 0.9)
-      slop (~> 3.0)
-    pry-stack_explorer (0.4.9)
-      binding_of_caller (>= 0.7)
-      pry (~> 0.9.11)
-    rack (1.2.8)
-    rack-mount (0.6.14)
-      rack (>= 1.0.0)
-    rack-test (0.5.7)
-      rack (>= 1.0)
-    railties (3.0.8)
-      actionpack (= 3.0.8)
-      activesupport (= 3.0.8)
-      rake (>= 0.8.7)
-      thor (~> 0.14.4)
-    rake (10.0.4)
-    rdoc (4.0.1)
-      json (~> 1.4)
-    redis (3.0.4)
-    rspec (2.13.0)
-      rspec-core (~> 2.13.0)
-      rspec-expectations (~> 2.13.0)
-      rspec-mocks (~> 2.13.0)
-    rspec-core (2.13.1)
-    rspec-expectations (2.13.0)
-      diff-lcs (>= 1.1.3, < 2.0)
-    rspec-mocks (2.13.1)
-    slop (3.4.4)
-    thor (0.14.6)
-    treetop (1.4.14)
-      polyglot
-      polyglot (>= 0.3.1)
-    tzinfo (0.3.37)
-    unicode_utils (1.4.0)
-    yard (0.8.6.1)
-PLATFORMS
-  ruby
-DEPENDENCIES
-  archive-tar-minitar (~> 0.5)
-  bundler (~> 1.3)
-  cucumber (~> 1.3)
-  debugger (~> 1.3)
-  jazz_hands (~> 0.5)
-  jeweler (~> 1.8)
-  nokogiri (~> 1.5)
-  parsley-store (~> 0.3.2)
-  rspec (~> 2.13)

data/VERSION DELETED Viewed

	@@ -1 +0,0 @@
1	- 0.9.6

data/lib/dwc-archive/.expander.rb.swo DELETED Viewed

Binary file

data/lib/dwc-archive/utf_regex_ruby18.rb DELETED Viewed

@@ -1,10 +0,0 @@
-UTF8RGX = /\A(
-      [\x09\x0A\x0D\x20-\x7E]            # ASCII
-    | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
-    |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
-    | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
-    |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
-    |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
-    | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
-    |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
-  )*\z/x

data/spec/lib/dwc-archive_spec.rb DELETED Viewed

@@ -1,250 +0,0 @@
-# encoding: utf-8
-require File.expand_path(File.dirname(__FILE__) + "/../spec_helper")
-describe DarwinCore do
-  before(:all) do
-    @file_dir = File.join(File.dirname(__FILE__), '..', 'files')
-  end
-  describe "VERSION" do
-    it "should return VERSION number" do
-      DarwinCore::VERSION.split('.').join('').to_i.should > 41
-    end
-  end
-  describe "::nil_field?" do
-    it "should return true for entries which normally mean nil" do
-      [nil, '/N', ''].each do |i|
-        DarwinCore.nil_field?(i).should be_true
-      end
-    end
-    it "should return false for fields that are not nil" do
-      [0, '0', '123', 123, 'dsdfs434343/N'].each do |i|
-        DarwinCore.nil_field?(i).should be_false
-      end
-    end
-  end
-  describe ".new" do
-    it "should create DarwinCore instance out of archive file" do
-      ['data.zip', 'data.tar.gz', 'minimal.tar.gz', 'junk_dir_inside.zip'].each do |file|
-        file = File.join(@file_dir, file)
-        dwc = DarwinCore.new(file)
-        dwc.archive.valid?.should be_true
-      end
-    end
-    it "should raise an error if archive file does not exist" do
-      file = 'not_a_file'
-      lambda { DarwinCore.new(file) }.should raise_error(DarwinCore::FileNotFoundError)
-    end
-    it "should raise an error if archive is broken" do
-      file = File.join(@file_dir, 'broken.tar.gz')
-      lambda { DarwinCore.new(file) }.should raise_error(DarwinCore::UnpackingError)
-    end
-    it "should raise an error if archive is invalid" do
-      file = File.join(@file_dir, 'invalid.tar.gz')
-      lambda { DarwinCore.new(file) }.should raise_error(DarwinCore::InvalidArchiveError)
-    end
-    it "should raise an error if archive is not in utf-8" do
-      file = File.join(@file_dir, 'latin1.tar.gz')
-      lambda { DarwinCore.new(file) }.should raise_error(DarwinCore::EncodingError)
-    end
-    it "should work with files that have non-alfanumeric characters and spaces" do
-      file = File.join(@file_dir, 'file with characters(3).gz')
-      dwc = DarwinCore.new(file)
-      dwc.archive.valid?.should be_true
-    end
-  end
-  describe ".normalize_classification" do
-    it "should return flat list if file has no parent id information" do
-      file = File.join(@file_dir, 'flat_list.tar.gz')
-      dwc = DarwinCore.new(file)
-      cn = DarwinCore::ClassificationNormalizer.new(dwc)
-      cn.normalize
-      cn.normalized_data.should_not be_nil
-      cn.normalized_data.size.should > 0
-    end
-    it "should return array or hash of name_strings back" do
-      file = File.join(@file_dir, 'data.tar.gz')
-      dwc = DarwinCore.new(file)
-      cn = DarwinCore::ClassificationNormalizer.new(dwc)
-      cn.normalize
-      name_strings = cn.name_strings
-      name_strings.is_a?(Array).should be_true
-      name_strings.size.should > 1
-      name_strings = cn.name_strings(with_hash: true)
-      name_strings.size.should > 1
-      name_strings.is_a?(Hash).should be_true
-      name_strings.is_a?(Hash).should be_true
-      name_strings.values.uniq.should == [1]
-      vernacular_name_strings = cn.vernacular_name_strings
-      vernacular_name_strings.is_a?(Array).should be_true
-      vernacular_name_strings.size.should > 0
-      vernacular_name_strings = cn.vernacular_name_strings(with_hash: true)
-      vernacular_name_strings.size.should > 0
-      vernacular_name_strings.is_a?(Hash).should be_true
-      vernacular_name_strings.values.uniq.should == [1]
-    end
-    it "should traverse DarwinCore files and assemble data for every node in memory" do
-      file = File.join(@file_dir, 'data.tar.gz')
-      dwc = DarwinCore.new(file)
-      norm = dwc.normalize_classification
-      norm.class.should == Hash
-      path_encodings = []
-      norm.each do |taxon_id, taxon|
-        taxon.classification_path.each {|p| path_encodings << p.encoding}
-      end
-      path_encodings.uniq!
-      path_encodings.size.should == 1
-      path_encodings[0].to_s.should == "UTF-8"
-      norm['leptogastrinae:tid:2857'].class.should == DarwinCore::TaxonNormalized
-      norm['leptogastrinae:tid:2857'].source.should == 'http://leptogastrinae.lifedesks.org/pages/2857'
-    end
-    it "should assemble synonyms from core" do
-      file = File.join(@file_dir, 'data.tar.gz')
-      dwc = DarwinCore.new(file)
-      norm = dwc.normalize_classification
-      syn = norm.values.select {|n| n.synonyms.size > 0}[0].synonyms[0]
-      syn.id.should == 'leptogastrinae:tid:127'
-      syn.name.should == "Leptogastridae"
-      syn.source.should == 'http://leptogastrinae.lifedesks.org/pages/127'
-    end
-    it "should be able to assemble vernacular names from an extension" do
-      file = File.join(@file_dir, 'data.tar.gz')
-      dwc = DarwinCore.new(file)
-      norm = dwc.normalize_classification
-      norm.select { |k,v| !v.vernacular_names.empty? }.map { |k,v| v.vernacular_names }.size.should > 0
-    end
-    it "should be able to assemble synonyms from extension" do
-      file = File.join(@file_dir, 'synonyms_in_extension.tar.gz')
-      dwc = DarwinCore.new(file)
-      norm = dwc.normalize_classification
-      norm.select { |k,v| !v.synonyms.empty? }.map { |k,v| v.synonyms }.size.should > 0
-    end
-    it "should not assemble synonyms from extension with scientificName, and file name not matching 'synonym'" do
-      file = File.join(@file_dir, 'not_synonym_in_extension.tar.gz')
-      dwc = DarwinCore.new(file)
-      norm = dwc.normalize_classification
-      norm.select { |k,v| !v.synonyms.empty? }.map { |k,v| v.synonyms }.size.should == 0
-    end
-    it "should not attempt to assemble extensions with with_extensions opts set to false" do
-      file = File.join(@file_dir, 'data.tar.gz')
-      dwc = DarwinCore.new(file)
-      cn = DarwinCore::ClassificationNormalizer.new(dwc)
-      norm = cn.normalize(:with_extensions => false)
-      norm.select { |k,v| !v.vernacular_names.empty? }.size.should == 0
-      norm = cn.normalize()
-      norm.select { |k,v| !v.vernacular_names.empty? }.size.should > 0
-      file = File.join(@file_dir, 'synonyms_in_extension.tar.gz')
-      dwc = DarwinCore.new(file)
-      cn = DarwinCore::ClassificationNormalizer.new(dwc)
-      norm = cn.normalize(:with_extensions => false)
-      norm.select { |k,v| !v.synonyms.empty? }.size.should == 0
-      norm = cn.normalize()
-      norm.select { |k,v| !v.synonyms.empty? }.size.should > 0
-    end
-    it "should assemble linnean classification if terms for it exists" do
-      file = File.join(@file_dir, 'linnean.tar.gz')
-      dwc = DarwinCore.new(file)
-      cn = DarwinCore::ClassificationNormalizer.new(dwc)
-      norm = cn.normalize
-      cn.normalized_data.first.last.linnean_classification_path.should == [["Animalia", :kingdom], ["Arthropoda", :phylum], ["Insecta", :class], ["Diptera", :order], ["Cecidomyiidae", :family], ["Resseliella", :genus]]
-    end
-    it "should keep linnean classification empty if terms are not there" do
-      file = File.join(@file_dir, 'data.tar.gz')
-      dwc = DarwinCore.new(file)
-      cn = DarwinCore::ClassificationNormalizer.new(dwc)
-      norm = cn.normalize
-      cn.normalized_data.first.last.linnean_classification_path.should == []
-    end
-    it "should be able to assemble synonyms from core" do
-      file = File.join(@file_dir, 'synonyms_in_core_accepted_name_field.tar.gz')
-      dwc = DarwinCore.new(file)
-      norm = dwc.normalize_classification
-      norm.select { |k,v| !v.synonyms.empty? }.map { |k,v| v.synonyms }.size.should > 0
-    end
-    it "should be able to assemble synonyms from extension" do
-      file = File.join(@file_dir, 'data.tar.gz')
-      dwc = DarwinCore.new(file)
-      norm = dwc.normalize_classification
-      nodes_with_syn = norm.select { |k,v| !v.synonyms.empty? }
-      nodes_with_syn.map { |k,v| v.synonyms }.size.should > 0
-      nodes_with_syn.first[1].synonyms.first.status.should == 'synonym'
-    end
-    it "should be able work with files which have scientificNameAuthorship" do
-      file = File.join(@file_dir, 'sci_name_authorship.tar.gz')
-      dwc = DarwinCore.new(file)
-      cn = DarwinCore::ClassificationNormalizer.new(dwc)
-      norm = cn.normalize
-      path_encodings = norm.map {|taxon_id, taxon| taxon.classification_path}.flatten.map { |name| name.encoding.to_s }.uniq
-      path_encodings.size.should == 1
-      path_encodings[0].should == "UTF-8"
-      taxa = norm.select{|k,v| v.current_name_canonical.match " "}.select{|k,v| [v.current_name.split(" ").size >  v.current_name_canonical.split(" ").size]}
-      taxa.size.should == 507
-      syn = norm.select{|k,v| v.synonyms.size > 0}.map {|k,v| v.synonyms}.flatten.select {|s| s.name.split(" ").size  > s.canonical_name.split(" ").size}
-      syn.size.should == 50
-    end
-    it "should be able work with files which repeat scientificNameAuthorship value in scientificName field" do
-      file = File.join(@file_dir, 'sci_name_authorship_dup.tar.gz')
-      dwc = DarwinCore.new(file)
-      norm = dwc.normalize_classification
-      taxa = norm.select{|k,v| v.current_name_canonical.match " "}.select{|k,v| [v.current_name.split(" ").size >  v.current_name_canonical.split(" ").size]}
-      taxa.size.should == 507
-      syn = norm.select{|k,v| v.synonyms.size > 0}.map {|k,v| v.synonyms}.flatten.select {|s| s.name.split(" ").size  > s.canonical_name.split(" ").size}
-      syn.size.should == 50
-    end
-    it "should be able open files where coreid is empty" do
-      file = File.join(@file_dir, 'empty_coreid.tar.gz')
-      dwc = DarwinCore.new(file)
-      norm = dwc.normalize_classification
-      taxa = norm.select{|k,v| v.current_name_canonical.match " "}.select{|k,v| [v.current_name.split(" ").size >  v.current_name_canonical.split(" ").size]}
-      taxa.size.should == 2
-    end
-    it "should be able to get language and locality fields for vernacular names" do
-      file = File.join(@file_dir, 'language_locality.tar.gz')
-      dwc = DarwinCore.new(file)
-      cn = DarwinCore::ClassificationNormalizer.new(dwc)
-      cn.normalize
-      vn = cn.normalized_data['leptogastrinae:tid:42'].vernacular_names.first
-      vn.language.should == 'en'
-      vn.locality.should == 'New England'
-    end
-    it 'should be able to get uuids from gnub dataset' do
-      file = File.join(@file_dir, 'gnub.tar.gz')
-      dwc = DarwinCore.new(file)
-      cn = DarwinCore::ClassificationNormalizer.new(dwc)
-      cn.normalize
-      vn = cn.normalized_data['9c399f90-cfb8-5a7f-9a21-18285a473488']
-      vn.class.should == DarwinCore::GnubTaxon
-      vn.uuid.should == '8faa91f6-663f-4cfe-b785-0ab4e9415a51'
-      vn.uuid_path.should == [
-        "9a9f9eeb-d5f9-4ff6-b6cb-a5ad345e33c3",
-        "bf4c91c0-3d1f-44c7-9d3b-249382182a26",
-        "8faa91f6-663f-4cfe-b785-0ab4e9415a51"]
-    end
-  end
-end

data/spec/spec.opts DELETED Viewed

	@@ -1 +0,0 @@
1	- --color