RubyGems - dwc-archive - Versions diffs - 0.9.10 → 1.1.2 - Mend

dwc-archive 0.9.10 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

checksums.yaml +5 -5
data/.gitignore +1 -0
data/.rspec +2 -1
data/.rubocop.yml +23 -0
data/.ruby-version +1 -1
data/.travis.yml +4 -7
data/CHANGELOG +14 -8
data/Gemfile +3 -1
data/LICENSE +1 -1
data/README.md +119 -107
data/Rakefile +13 -36
data/dwc-archive.gemspec +23 -19
data/features/step_definitions/dwc-creator_steps.rb +5 -5
data/features/step_definitions/dwc-reader_steps.rb +47 -28
data/features/support/env.rb +1 -1
data/lib/dwc_archive.rb +124 -0
data/lib/dwc_archive/archive.rb +60 -0
data/lib/dwc_archive/classification_normalizer.rb +382 -0
data/lib/dwc_archive/core.rb +25 -0
data/lib/{dwc-archive → dwc_archive}/errors.rb +10 -0
data/lib/dwc_archive/expander.rb +88 -0
data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
data/lib/dwc_archive/generator.rb +91 -0
data/lib/dwc_archive/generator_eml_xml.rb +116 -0
data/lib/dwc_archive/generator_meta_xml.rb +72 -0
data/lib/dwc_archive/gnub_taxon.rb +14 -0
data/lib/dwc_archive/ingester.rb +106 -0
data/lib/dwc_archive/metadata.rb +57 -0
data/lib/dwc_archive/taxon_normalized.rb +23 -0
data/lib/dwc_archive/version.rb +6 -0
data/lib/dwc_archive/xml_reader.rb +90 -0
data/spec/files/{file with characters(3).gz → file with characters(3).tar.gz} +0 -0
data/spec/files/generator_eml.xml +47 -0
data/spec/files/generator_meta.xml +19 -0
data/spec/lib/classification_normalizer_spec.rb +96 -105
data/spec/lib/core_spec.rb +43 -41
data/spec/lib/darwin_core_spec.rb +108 -138
data/spec/lib/generator_eml_xml_spec.rb +12 -11
data/spec/lib/generator_meta_xml_spec.rb +12 -11
data/spec/lib/generator_spec.rb +77 -69
data/spec/lib/gnub_taxon_spec.rb +15 -17
data/spec/lib/metadata_spec.rb +50 -41
data/spec/lib/taxon_normalized_spec.rb +62 -65
data/spec/lib/xml_reader_spec.rb +9 -12
data/spec/spec_helper.rb +54 -51
metadata +105 -88
data/.rvmrc +0 -1
data/] +0 -40
data/lib/dwc-archive.rb +0 -107
data/lib/dwc-archive/archive.rb +0 -40
data/lib/dwc-archive/classification_normalizer.rb +0 -428
data/lib/dwc-archive/core.rb +0 -17
data/lib/dwc-archive/expander.rb +0 -84
data/lib/dwc-archive/generator.rb +0 -85
data/lib/dwc-archive/generator_eml_xml.rb +0 -86
data/lib/dwc-archive/generator_meta_xml.rb +0 -58
data/lib/dwc-archive/ingester.rb +0 -101
data/lib/dwc-archive/metadata.rb +0 -48
data/lib/dwc-archive/version.rb +0 -3
data/lib/dwc-archive/xml_reader.rb +0 -64

data/features/step_definitions/dwc-creator_steps.rb CHANGED

@@ -19,7 +19,7 @@ end
 Then /^these data should be saved as "([^\"]*)" file$/ do |file_name|
   file = File.join(@gen.path, file_name)
-  @gen.files.include?(file_name).should be_true
+  @gen.files.include?(file_name).should be true
   csv = CSV.open(file).count.should == 4
 end
@@ -51,7 +51,7 @@ end
 Then /^data are saved as "([^\"]*)" and "([^\"]*)"$/ do |file_name_1, file_name_2|
   [file_name_1, file_name_2].each do |file_name|
     file = File.join(@gen.path, file_name)
-    @gen.files.include?(file_name).should be_true
+    @gen.files.include?(file_name).should be true
     csv = CSV.open(file).count.should > 1
   end
 end
@@ -86,7 +86,7 @@ end
 Then /^there should be "([^\"]*)" file with core and extensions informations$/ do |file_name|
   meta = File.join(@gen.path, file_name)
-  @gen.files.include?(file_name).should be_true
+  @gen.files.include?(file_name).should be true
   dom = Nokogiri::XML(open(File.join(@gen.path, file_name)))
   dom.xpath('//xmlns:core//xmlns:location').text.should == 'darwin_core.txt'
   dom.xpath('//xmlns:extension[1]//xmlns:location').text.should == 'vernacular.txt'
@@ -94,7 +94,7 @@ end
 Then /^there should be "([^\"]*)" file with authoriship information$/ do |file_name|
   eml = File.join(@gen.path, file_name)
-  @gen.files.include?(file_name).should be_true
+  @gen.files.include?(file_name).should be true
 end
 Given /^a path to a new file \- "([^\"]*)"$/ do |file_name|
@@ -107,6 +107,6 @@ end
 Then /^there should be a valid new archive file$/ do
   dwc = DarwinCore.new('/tmp/dwc.tar.gz')
-  dwc.archive.valid?.should be_true
+  dwc.archive.valid?.should be true
 end

data/features/step_definitions/dwc-reader_steps.rb CHANGED

@@ -1,5 +1,6 @@
 Given /^path to a dwc file "([^\"]*)"$/ do |arg1|
-  @dwca_file = File.expand_path(File.dirname(__FILE__) + "../../../spec/files/" + arg1)
+  @dwca_file = File.expand_path(File.dirname(__FILE__) +
+                                "../../../spec/files/" + arg1)
   @tmp_dir = "/tmp"
 end
@@ -8,11 +9,12 @@ When /^I create a new DarwinCore::Archive instance$/ do
 end
 Then /^I should find that the archive is valid$/ do
-  @dwca.valid?.should be_true
+  @dwca.valid?.should be true
 end
 Then /^I should see what files the archive has$/ do
-  @dwca.files.should == ["DarwinCore.txt", "VernacularName.txt", "eml.xml", "meta.xml", "metadata.txt"]
+  @dwca.files.should == ["DarwinCore.txt", "VernacularName.txt", "eml.xml",
+                         "meta.xml", "metadata.txt"]
 end
 When /^I delete expanded files$/ do
@@ -20,7 +22,7 @@ When /^I delete expanded files$/ do
 end
 Then /^they should disappear$/ do
-  @dwca.files.should be_nil
+  @dwca.files.should be nil
 end
 When /^I create a new DarwinCore instance$/ do
@@ -37,7 +39,7 @@ When /^I create DarwinCore::ClassificationNormalizer instance$/ do
 end
 Then /^instance should have a valid archive$/ do
-  @dwc.archive.valid?.should be_true
+  @dwc.archive.valid?.should be true
 end
 Then /^instance should have a core$/ do
@@ -45,7 +47,7 @@ Then /^instance should have a core$/ do
 end
 Then /^I should see checksum$/ do
-  @dwc.checksum.should == '7d94fc28ffaf434b66fbc790aa5ef00d834057bf'
+  @dwc.checksum.should == "7d94fc28ffaf434b66fbc790aa5ef00d834057bf"
 end
 When /^I check core data$/ do
@@ -64,7 +66,8 @@ And /^core\.file_path$/ do
 end
 And /^core\.id$/ do
-  @core.id.should == {:index => 0, :term => 'http://rs.tdwg.org/dwc/terms/TaxonID'}
+  @core.id.should == {index: 0,
+                      term: "http://rs.tdwg.org/dwc/terms/TaxonID"}
 end
 And /^core\.fields$/ do
@@ -80,14 +83,21 @@ Then /^DarwinCore instance should have dwc\.metadata object$/ do
 end
 And /^I should find id, title, creators, metadata provider$/ do
-  @dwc.metadata.id.should == 'leptogastrinae:version:2.5'
-  @dwc.metadata.title.should == 'Leptogastrinae (Diptera: Asilidae) Classification'
+  @dwc.metadata.id.should == "leptogastrinae:version:2.5"
+  @dwc.metadata.title.should ==
+    "Leptogastrinae (Diptera: Asilidae) Classification"
   @dwc.metadata.authors.should == [
-      {:last_name=>"Bayless", :email=>"keith.bayless@gmail.com", :first_name=>"Keith"},
-      {:last_name=>"Dikow", :email=>"dshorthouse@eol.org", :first_name=>"Torsten"}]
-  @dwc.metadata.abstract.should == 'These are all the names in the Leptogastrinae classification.'
-  @dwc.metadata.citation.should == 'Dikow, Torsten. 2010. The Leptogastrinae classification.'
-  @dwc.metadata.url.should == 'http://leptogastrinae.lifedesks.org/files/leptogastrinae/classification_export/shared/leptogastrinae.tar.gz'
+    { last_name: "Bayless", email: "keith.bayless@gmail.com",
+      first_name: "Keith" },
+    { last_name: "Dikow", email: "dshorthouse@eol.org", first_name: "Torsten" }
+  ]
+  @dwc.metadata.abstract.should ==
+    "These are all the names in the Leptogastrinae classification."
+  @dwc.metadata.citation.should ==
+    "Dikow, Torsten. 2010. The Leptogastrinae classification."
+  @dwc.metadata.url.should ==
+    "http://leptogastrinae.lifedesks.org/files/leptogastrinae/"\
+    "classification_export/shared/leptogastrinae.tar.gz"
 end
 Then /^DarwinCore instance should have an extensions array$/ do
@@ -103,11 +113,18 @@ end
 Then /^extension should have properties, data, file_path, coreid, fields$/ do
   ext = @dwc.extensions[0]
-  ext.properties.should == {:ignoreHeaderLines=>1, :encoding=>"UTF-8", :rowType=>"http://rs.gbif.org/ipt/terms/1.0/VernacularName", :fieldsEnclosedBy=>"", :fieldsTerminatedBy=>"\\t", :linesTerminatedBy=>"\\n"}
+  ext.properties.should == {
+    ignoreHeaderLines: 1, encoding: "UTF-8",
+    rowType: "http://rs.gbif.org/ipt/terms/1.0/VernacularName",
+    fieldsEnclosedBy: "", fieldsTerminatedBy: "\\t", linesTerminatedBy: "\\n"
+  }
   ext.data.class.should == Hash
   ext.file_path.should match(/\/tmp\/dwc_[\d]+\/VernacularName.txt/)
-  ext.coreid.should == {:index=>0}
-  ext.fields.should == [{:term=>"http://rs.gbif.org/ecat/terms/vernacularName", :index=>1}, {:term=>"http://rs.gbif.org/thesaurus/languageCode", :index=>2}]
+  ext.coreid.should == { index: 0 }
+  ext.fields.should == [
+    { term: "http://rs.gbif.org/ecat/terms/vernacularName", index: 1 },
+    { term: "http://rs.gbif.org/thesaurus/languageCode", index: 2 }
+  ]
 end
 Given /^acces to DarwinCore gem$/ do
@@ -153,7 +170,7 @@ end
 Then /^I can read its core content using block$/ do
   res = []
-  @dwc.core.ignore_headers.should be_true
+  @dwc.core.ignore_headers.should be true
   read_result = @dwc.core.read(200) do |r, err|
     res << [r.size, err.size]
   end
@@ -164,7 +181,7 @@ end
 Then /^I can read extensions content using block$/ do
   res = []
   ext = @dwc.extensions[0]
-  ext.ignore_headers.should be_true
+  ext.ignore_headers.should be true
   ext.read(200) do |r, err|
     res << [r.size, err.size]
   end
@@ -196,22 +213,24 @@ Then /^there are paths, synonyms and vernacular names in normalized classificati
     if v.vernacular_names.size > 0
       @vernaculars_are_generated = true
       vn = v.vernacular_names[0]
-      (vn.respond_to?('locality') && vn.respond_to?('country_code') && vn.respond_to?('language')).should be_true
+      (vn.respond_to?("locality") && vn.respond_to?("country_code") &&
+       vn.respond_to?("language")).should be true
     end
     break if (@vernaculars_are_generated && @paths_are_generated && @synonyms_are_generated)
   end
-  @paths_are_generated.should be_true
-  @vernaculars_are_generated.should be_true
-  @synonyms_are_generated.should be_true
+  @paths_are_generated.should be true
+  @vernaculars_are_generated.should be true
+  @synonyms_are_generated.should be true
 end
 Then /^there are local_id and global_id methods in taxons and synonyms$/ do
   @normalized_classification.each do |k, v|
     if v.synonyms.size > 0
-      v.local_id.should == '2'
+      v.local_id.should == "2"
       v.global_id.should == "97498f29-2501-440d-9452-f3817da0d6c2"
-      v.synonyms.first.local_id.should == '1'
-      v.synonyms.first.global_id.should == "e017ed01-407d-4d09-82c5-8b3d9fa76e35"
+      v.synonyms.first.local_id.should == "1"
+      v.synonyms.first.global_id.should ==
+        "e017ed01-407d-4d09-82c5-8b3d9fa76e35"
       break
     end
   end
@@ -229,8 +248,8 @@ Then /^there are id paths, no canonical names paths in normalized classification
       id_paths_generated = true
     end
   end
-  id_paths_generated.should be_true
-  canonical_paths_generated.should be_false
+  id_paths_generated.should be true
+  canonical_paths_generated.should be false
 end
 Then /^names used in classification can be accessed by "([^"]*)" method$/ do |name_strings|

data/features/support/env.rb CHANGED

@@ -1,4 +1,4 @@
 $LOAD_PATH.unshift(File.dirname(__FILE__) + '/../../lib')
-require 'dwc-archive'
+require 'dwc_archive'
 require 'rspec/expectations'

data/lib/dwc_archive.rb ADDED

@@ -0,0 +1,124 @@
+# frozen_string_literal: true
+require "fileutils"
+require "ostruct"
+require "digest"
+require "csv"
+require "logger"
+require "nokogiri"
+require "biodiversity"
+require_relative "dwc_archive/xml_reader"
+require_relative "dwc_archive/ingester"
+require_relative "dwc_archive/errors"
+require_relative "dwc_archive/expander"
+require_relative "dwc_archive/archive"
+require_relative "dwc_archive/core"
+require_relative "dwc_archive/extension"
+require_relative "dwc_archive/metadata"
+require_relative "dwc_archive/generator"
+require_relative "dwc_archive/generator_meta_xml"
+require_relative "dwc_archive/generator_eml_xml"
+require_relative "dwc_archive/taxon_normalized"
+require_relative "dwc_archive/gnub_taxon"
+require_relative "dwc_archive/classification_normalizer"
+require_relative "dwc_archive/version"
+# main class for handling darwin core archives
+class DarwinCore
+  DEFAULT_TMP_DIR = "/tmp"
+  VernacularNormalized = Struct.new(:name, :language, :locality, :country_code)
+  SynonymNormalized = Struct.new(:id, :name, :canonical_name, :status, :source,
+                                 :local_id, :global_id)
+  class << self
+    attr_writer :logger
+    def clean(path)
+      FileUtils.rm_rf(path) if FileTest.exists?(path)
+    end
+    def files(path)
+      return nil unless path && FileTest.exists?(path)
+      Dir.entries(path).reject { |e| e.match(/[.]{1,2}$/) }.sort
+    end
+    def random_path(tmp_dir)
+      File.join(tmp_dir, "dwc_#{rand(10_000_000_000)}")
+    end
+  end
+  attr_reader :archive, :core, :metadata, :classification_normalizer
+  alias eml metadata
+  def self.nil_field?(field)
+    return true if [nil, "", "/N"].include?(field)
+    false
+  end
+  def self.clean_all(tmp_dir = DEFAULT_TMP_DIR)
+    Dir.entries(tmp_dir).each do |entry|
+      path = File.join(tmp_dir, entry)
+      FileUtils.rm_rf(path) if FileTest.directory?(path) && entry.match(/^dwc_\d+$/)
+    end
+  end
+  def self.logger
+    @logger ||= Logger.new(nil)
+  end
+  def self.logger_reset
+    self.logger = Logger.new(nil)
+  end
+  def self.logger_write(obj_id, message, method = :info)
+    logger.send(method, "|#{obj_id}|#{message}|")
+  end
+  def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
+    @dwc_path = dwc_path
+    @archive = DarwinCore::Archive.new(@dwc_path, tmp_dir)
+    @core = DarwinCore::Core.new(self)
+    @metadata = DarwinCore::Metadata.new(@archive)
+    extensions
+  end
+  def file_name
+    File.split(@dwc_path).last
+  end
+  def path
+    File.expand_path(@dwc_path)
+  end
+  # generates a hash from a classification data with path to each node,
+  # list of synonyms and vernacular names.
+  def normalize_classification
+    return nil unless parent_id?
+    @classification_normalizer ||=
+      DarwinCore::ClassificationNormalizer.new(self)
+    @classification_normalizer.normalize
+  end
+  def parent_id?
+    !@core.fields.join("|").
+      downcase.match(/highertaxonid|parentnameusageid/).nil?
+  end
+  def checksum
+    Digest::SHA1.hexdigest(File.read(@dwc_path))
+  end
+  def extensions
+    return @extensions if @extensions
+    root_key = @archive.meta.keys[0]
+    ext = @archive.meta[root_key][:extension]
+    return @extensions = [] unless ext
+    ext = [ext] if ext.class != Array
+    @extensions = ext.map { |e| DarwinCore::Extension.new(self, e) }
+  end
+end

data/lib/dwc_archive/archive.rb ADDED

@@ -0,0 +1,60 @@
+# frozen_string_literal: true
+class DarwinCore
+  # Deals with handling DarwinCoreArchive file, and provides meta information
+  # and files information about archive
+  class Archive
+    attr_reader :meta, :eml
+    def initialize(archive_path, tmp_dir)
+      @archive_path = archive_path
+      @tmp_dir = tmp_dir
+      @expander = DarwinCore::Expander.new(@archive_path, @tmp_dir)
+      @expander.unpack
+      prepare_metadata
+    end
+    def valid?
+      valid = true
+      valid = valid && @expander.path && FileTest.exists?(@expander.path)
+      valid && files && files.include?("meta.xml")
+    end
+    def files
+      @expander.files
+    end
+    def files_path
+      @expander.path
+    end
+    def clean
+      @expander.clean
+    end
+    private
+    def prepare_metadata
+      if valid?
+        prepare_meta_file
+        prepare_eml_file
+      else
+        clean
+        raise InvalidArchiveError
+      end
+    end
+    def prepare_meta_file
+      meta_file = File.open(File.join(@expander.path, "meta.xml"))
+      @meta = DarwinCore::XmlReader.from_xml(meta_file)
+    end
+    def prepare_eml_file
+      @eml = nil
+      return unless files.include?("eml.xml")
+      eml_file = File.open(File.join(@expander.path, "eml.xml"))
+      @eml = DarwinCore::XmlReader.from_xml(eml_file)
+    end
+  end
+end

data/lib/dwc_archive/classification_normalizer.rb ADDED

@@ -0,0 +1,382 @@
+# frozen_string_literal: true
+class DarwinCore
+  # Returns tree representation of Darwin Core file with vernacular and
+  # and synonyms attached to the taxon nodes
+  class ClassificationNormalizer
+    attr_reader :error_names, :tree, :normalized_data, :dwc
+    alias darwin_core dwc
+    def initialize(dwc_instance)
+      @dwc = dwc_instance
+      @core_fields = find_fields(@dwc.core)
+      @extensions = @dwc.extensions.map { |e| [e, find_fields(e)] }
+      @normalized_data = {}
+      @synonyms = {}
+      @name_strings = {}
+      @vernacular_name_strings = {}
+      @error_names = []
+      @tree = {}
+    end
+    def add_name_string(name_string)
+      @name_strings[name_string] = 1 unless @name_strings[name_string]
+    end
+    def add_vernacular_name_string(name_string)
+      return if @vernacular_name_strings[name_string]
+      @vernacular_name_strings[name_string] = 1
+    end
+    def name_strings(opts = {})
+      process_strings(@name_strings, opts)
+    end
+    def vernacular_name_strings(opts = {})
+      process_strings(@vernacular_name_strings, opts)
+    end
+    def normalize(opts = {})
+      opts = { with_canonical_names: true,
+               with_extensions: true }.merge(opts)
+      @with_canonical_names = opts[:with_canonical_names]
+      DarwinCore.logger_write(@dwc.object_id,
+                              "Started normalization of the classification")
+      ingest_core
+      DarwinCore.logger_write(
+        @dwc.object_id,
+        "Calculating the classification parent/child paths"
+      )
+      if parent_id?
+        calculate_classification_path
+      else
+        @normalized_data.keys.each { |id| @tree[id] = {} }
+      end
+      DarwinCore.logger_write(@dwc.object_id, "Ingesting data from extensions")
+      ingest_extensions if opts[:with_extensions]
+      @normalized_data
+    end
+    private
+    def process_strings(strings, opts)
+      opts = { with_hash: false }.merge(opts)
+      if opts[:with_hash]
+        strings
+      else
+        strings.keys
+      end
+    end
+    def get_canonical_name(a_scientific_name)
+      return nil unless @with_canonical_names
+      canonical_name = Biodiversity::Parser.parse(a_scientific_name).
+                       dig(:canonical, :simple)
+      canonical_name.to_s.empty? ? a_scientific_name : canonical_name
+    end
+    def find_fields(element)
+      data = element.fields.each_with_object({}) do |f, h|
+        field = f[:term].split("/")[-1]
+        field = field ? field.downcase.to_sym : ""
+        h[field] = f[:index].to_i
+      end
+      data[:id] = element.id[:index] if element.id
+      data
+    end
+    def status_synonym?(status)
+      status&.match(/^syn/)
+    end
+    def add_synonym_from_core(taxon_id, row)
+      cf = @core_fields
+      @synonyms[row[cf[:id]]] = taxon_id
+      @normalized_data[row[taxon_id]] = DarwinCore::TaxonNormalized.new unless @normalized_data[row[taxon_id]]
+      taxon = @normalized_data[row[taxon_id]]
+      synonym = SynonymNormalized.new(
+        row[cf[:id]],
+        row[cf[:scientificname]],
+        row[cf[:canonicalname]],
+        cf[:taxonomicstatus] ? row[cf[:taxonomicstatus]] : nil,
+        cf[:source] ? row[cf[:source]] : nil,
+        cf[:localid] ? row[cf[:localid]] : nil,
+        cf[:globalid] ? row[cf[:globalid]] : nil
+      )
+      taxon.synonyms << synonym
+      add_name_string(synonym.name)
+      add_name_string(synonym.canonical_name)
+    end
+    def set_scientific_name(row, fields)
+      row[fields[:scientificname]] = "N/A" unless row[fields[:scientificname]]
+      canonical_name = nil
+      scientific_name = row[fields[:scientificname]].strip
+      if separate_canonical_and_authorship?(row, fields)
+        canonical_name = row[fields[:scientificname]].strip if @with_canonical_names
+        scientific_name += " #{row[fields[:scientificnameauthorship]].strip}"
+      else
+        canonical_name = get_canonical_name(row[fields[:scientificname]]) if @with_canonical_names
+      end
+      fields[:canonicalname] = row.size
+      row << canonical_name
+      row[fields[:scientificname]] = scientific_name
+    end
+    def separate_canonical_and_authorship?(row, fields)
+      authorship = ""
+      authorship = row[fields[:scientificnameauthorship]].to_s.strip if fields[:scientificnameauthorship]
+      !(authorship.empty? || row[fields[:scientificname]].index(authorship))
+    end
+    def ingest_core
+      @normalized_data = {}
+      has_name_and_id = @core_fields[:id] && @core_fields[:scientificname]
+      unless has_name_and_id
+        raise(DarwinCore::CoreFileError,
+              "Darwin Core core fields must contain taxon id and scientific name")
+      end
+      @dwc.core.read do |rows|
+        rows[1].each do |error|
+          @error_names << { data: error,
+                            error: :reading_or_encoding_error }
+        end
+        rows[0].each do |r|
+          set_scientific_name(r, @core_fields)
+          # Core has AcceptedNameUsageId
+          if @core_fields[:acceptednameusageid] &&
+             r[@core_fields[:acceptednameusageid]] &&
+             r[@core_fields[:acceptednameusageid]] != r[@core_fields[:id]]
+            add_synonym_from_core(@core_fields[:acceptednameusageid], r)
+          elsif !@core_fields[:acceptednameusageid] &&
+                @core_fields[:taxonomicstatus] &&
+                status_synonym?(r[@core_fields[:taxonomicstatus]])
+            add_synonym_from_core(parent_id, r) if parent_id?
+          else
+            unless @normalized_data[r[@core_fields[:id]]]
+              new_taxon = if gnub_archive?
+                            DarwinCore::GnubTaxon.new
+                          else
+                            DarwinCore::TaxonNormalized.new
+                          end
+              @normalized_data[r[@core_fields[:id]]] = new_taxon
+            end
+            taxon = @normalized_data[r[@core_fields[:id]]]
+            if gnub_archive?
+              taxon.uuid = r[@core_fields[:originalnameusageid]]
+              taxon.uuid_path = r[@core_fields[:originalnameusageidpath]].
+                                split("|")
+            end
+            taxon.id = r[@core_fields[:id]]
+            taxon.current_name = r[@core_fields[:scientificname]]
+            taxon.current_name_canonical = r[@core_fields[:canonicalname]]
+            taxon.parent_id = parent_id? ? r[parent_id] : nil
+            taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
+            taxon.status = r[@core_fields[:taxonomicstatus]] if @core_fields[:taxonomicstatus]
+            taxon.source = r[@core_fields[:source]] if @core_fields[:source]
+            taxon.local_id = r[@core_fields[:localid]] if @core_fields[:localid]
+            taxon.global_id = r[@core_fields[:globalid]] if @core_fields[:globalid]
+            taxon.linnean_classification_path =
+              get_linnean_classification_path(r, taxon)
+            add_name_string(taxon.current_name)
+            has_canonical = taxon.current_name_canonical &&
+                            !taxon.current_name_canonical.empty?
+            add_name_string(taxon.current_name_canonical) if has_canonical
+          end
+        end
+      end
+    end
+    def parent_id?
+      @has_parent_id ||= @core_fields.key?(:highertaxonid) ||
+                         @core_fields.key?(:parentnameusageid)
+    end
+    def parent_id
+      @core_fields[:highertaxonid] || @core_fields[:parentnameusageid]
+    end
+    def calculate_classification_path
+      @paths_num = 0
+      @normalized_data.each do |_taxon_id, taxon|
+        next unless taxon.classification_path_id.empty?
+        res = get_classification_path(taxon)
+        next if res == "error"
+      end
+    end
+    def get_classification_path(taxon)
+      return unless taxon.classification_path_id.empty?
+      @paths_num += 1
+      if @paths_num % 10_000 == 0
+        DarwinCore.logger_write(@dwc.object_id,
+                                "Calculated #{@paths_num} paths")
+      end
+      current_node = { taxon.id => {} }
+      if DarwinCore.nil_field?(taxon.parent_id)
+        taxon.classification_path << taxon.current_name_canonical if @with_canonical_names
+        taxon.classification_path_id << taxon.id
+        @tree.merge!(current_node)
+      else
+        parent_cp = parent_cpid = nil
+        if @normalized_data[taxon.parent_id]
+          parent_cp = @normalized_data[taxon.parent_id].classification_path if @with_canonical_names
+          parent_cpid = @normalized_data[taxon.parent_id].
+                        classification_path_id
+        else
+          current_parent = @normalized_data[@synonyms[taxon.parent_id]]
+          if current_parent
+            @error_names << { data: taxon,
+                              error: :deprecated_parent,
+                              current_parent: current_parent }
+            parent_cp = current_parent.classification_path if @with_canonical_names
+            parent_cpid = current_parent.classification_path_id
+          else
+            @error_names << { data: taxon,
+                              error: :deprecated_parent,
+                              current_parent: nil }
+          end
+        end
+        return "error" unless parent_cpid
+        if parent_cpid.empty?
+          res = "error"
+          begin
+            res = get_classification_path(@normalized_data[taxon.parent_id])
+          rescue SystemStackError
+            @error_names << { data: taxon,
+                              error: :too_deep_hierarchy,
+                              current_parent: nil }
+          end
+          return res if res == "error"
+          if @with_canonical_names
+            taxon.classification_path += @normalized_data[taxon.parent_id].
+                                         classification_path +
+                                         [taxon.current_name_canonical]
+          end
+          taxon.classification_path_id += @normalized_data[taxon.parent_id].
+                                          classification_path_id + [taxon.id]
+          parent_node = @normalized_data[taxon.parent_id].
+                        classification_path_id.inject(@tree) do |node, id|
+                          node[id]
+                        end
+          parent_node.merge!(current_node)
+        else
+          if @with_canonical_names
+            taxon.classification_path += parent_cp +
+                                         [taxon.current_name_canonical]
+          end
+          taxon.classification_path_id += parent_cpid + [taxon.id]
+          parent_node = @normalized_data[taxon.parent_id].
+                        classification_path_id.inject(@tree) do |node, id|
+            node[id]
+          end
+          begin
+            parent_node.merge!(current_node)
+          rescue NoMethodError => e
+            DarwinCore.logger_write(@dwc.object_id,
+                                    "Error '#{e.message}' taxon #{taxon.id}")
+            "error"
+          end
+        end
+      end
+    end
+    def ingest_extensions
+      @extensions.each do |e|
+        _ext, fields = *e
+        ingest_synonyms(e) if File.split(e[0].file_path).
+                              last.match(/synonym/i) &&
+                              fields.keys.include?(:scientificname)
+        ingest_vernaculars(e) if fields.keys.include? :vernacularname
+      end
+    end
+    def ingest_synonyms(extension)
+      DarwinCore.logger_write(@dwc.object_id, "Ingesting synonyms extension")
+      ext, fields = *extension
+      ext.read do |rows|
+        rows[0].each do |r|
+          synonym = process_synonym(r, fields)
+          add_synonym(synonym, r, fields)
+        end
+      end
+    end
+    def add_synonym(synonym, record, fields)
+      if @normalized_data[record[fields[:id]]]
+        @normalized_data[record[fields[:id]]].synonyms << synonym
+        add_name_string(synonym.name)
+        add_name_string(synonym.canonical_name)
+      else
+        @error_names << { taxon: synonym,
+                          error: :synonym_of_unknown_taxa }
+      end
+    end
+    def process_synonym(record, fields)
+      set_scientific_name(record, fields)
+      SynonymNormalized.new(
+        nil,
+        record[fields[:scientificname]],
+        record[fields[:canonicalname]],
+        fields[:taxonomicstatus] ? record[fields[:taxonomicstatus]] : nil,
+        fields[:source] ? record[fields[:source]] : nil,
+        fields[:localid] ? record[fields[:localid]] : nil,
+        fields[:globalid] ? record[fields[:globalid]] : nil
+      )
+    end
+    def ingest_vernaculars(extension)
+      DarwinCore.logger_write(@dwc.object_id,
+                              "Ingesting vernacular names extension")
+      ext, fields = *extension
+      ext.read do |rows|
+        rows[0].each do |row|
+          extract_vernaculars_from_row(row, fields)
+        end
+      end
+    end
+    def extract_vernaculars_from_row(row, fields)
+      language = find_vernacular_language(row, fields)
+      locality = fields[:locality] ? row[fields[:locality]] : nil
+      country_code = fields[:countrycode] ? row[fields[:countrycode]] : nil
+      vernacular = VernacularNormalized.new(
+        row[fields[:vernacularname]], language, locality, country_code
+      )
+      if @normalized_data[row[fields[:id]]]
+        @normalized_data[row[fields[:id]]].vernacular_names << vernacular
+        add_vernacular_name_string(vernacular.name)
+      else
+        @error_names << { vernacular_name: vernacular,
+                          error: :vernacular_of_unknown_taxa }
+      end
+    end
+    def find_vernacular_language(row, fields)
+      (fields[:language] && row[fields[:language]]) ||
+        (fields[:languagecode] && row[fields[:languagecode]]) || nil
+    end
+    # Collect linnean classification path only on species level
+    def get_linnean_classification_path(row, _taxon)
+      %i[kingdom phylum class order family genus
+         subgenus].each_with_object([]) do |clade, res|
+        res << [row[@core_fields[clade]], clade] if @core_fields[clade]
+      end
+    end
+    def gnub_archive?
+      @core_fields[:originalnameusageidpath]
+    end
+  end
+end