RubyGems - dwca_hunter - Versions diffs - 0.5.1 → 0.7.0 - Mend

dwca_hunter 0.5.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36)

checksums.yaml +4 -4
data/.byebug_history +45 -0
data/.gitignore +5 -0
data/.rubocop.yml +3 -2
data/.ruby-version +1 -1
data/Gemfile.lock +61 -83
data/LICENSE.txt +1 -1
data/README.md +1 -1
data/dwca_hunter.gemspec +9 -9
data/exe/dwcahunter +1 -3
data/lib/dwca_hunter.rb +39 -8
data/lib/dwca_hunter/resource.rb +5 -0
data/lib/dwca_hunter/resources/aos-birds.rb +143 -0
data/lib/dwca_hunter/resources/arctos.rb +121 -145
data/lib/dwca_hunter/resources/clements.rb +151 -0
data/lib/dwca_hunter/resources/eol.rb +85 -0
data/lib/dwca_hunter/resources/freebase.rb +51 -49
data/lib/dwca_hunter/resources/how-moore-birds.rb +168 -0
data/lib/dwca_hunter/resources/ioc_word_bird.rb +200 -0
data/lib/dwca_hunter/resources/ipni.rb +111 -0
data/lib/dwca_hunter/resources/itis.rb +99 -99
data/lib/dwca_hunter/resources/mammal_divdb.rb +155 -0
data/lib/dwca_hunter/resources/mammal_species.rb +9 -6
data/lib/dwca_hunter/resources/mcz.rb +123 -0
data/lib/dwca_hunter/resources/ncbi.rb +22 -23
data/lib/dwca_hunter/resources/opentree.rb +5 -5
data/lib/dwca_hunter/resources/paleobiodb.rb +193 -0
data/lib/dwca_hunter/resources/paleodb_harvester.rb +140 -0
data/lib/dwca_hunter/resources/sherborn.rb +91 -0
data/lib/dwca_hunter/resources/wikispecies.rb +142 -129
data/lib/dwca_hunter/version.rb +1 -1
metadata +46 -40
data/files/birdlife_7.csv +0 -11862
data/files/fishbase_taxon_cache.tsv +0 -81000
data/files/reptile_checklist_2014_12.csv +0 -15158
data/files/species-black.txt +0 -251

data/lib/dwca_hunter/resources/paleodb_harvester.rb ADDED

@@ -0,0 +1,140 @@
+class PaleodbHarvester
+  def initialize(download_dir)
+    @dir = File.join(download_dir, "json")
+    FileUtils.mkdir_p(@dir)
+    @in_dir = download_dir
+    @taxa_csv = CSV.open(File.join(@in_dir, "taxa.csv"), headers: true)
+    @refs_csv = CSV.open(File.join(@in_dir, "refs.csv"), headers: true)
+    @taxa_refs_csv = CSV.open(File.join(@in_dir, "taxa_refs.csv"), headers: true)
+    @occurences_csv = CSV.open(File.join(@in_dir, "occurences.csv"), headers: true)
+  end
+  def taxa
+    # "orig_no","taxon_no","record_type","flags","taxon_rank",
+    # "taxon_name","difference","accepted_no","accepted_rank",
+    # "accepted_name","parent_no","reference_no","is_extant","n_occs"
+    taxa = {}
+    name2id = {}
+    @taxa_csv.each do |r|
+      r = strip(r)
+      taxa[r["taxon_no"]] = { t_id: r["orig_no"], id: r["taxon_no"],
+                              rank: r["taxon_rank"], name: r["taxon_name"],
+                              auth: r["taxon_attr"],
+                              extinct: extinct(r["is_extant"]),
+                              vernacular: r["common_name"],
+                              annot: r["difference"], acc_id: r["accepted_no"],
+                              acc_rank: r["accepted_rank"],
+                              acc_name: r["accepted_name"], ecol: ecol(r),
+                              parent_id: r["parent_no"], ref: r["reference_no"],
+                              occs_num: r["n_occs"], enterer: enterer(r) }
+      name2id[r["taxon_name"]] = { id: r["taxon_no"], acc_id: r["accepted_no"] }
+    end
+    f = open(File.join(@dir, "taxa.json"), "w:utf-8")
+    f.write(JSON.pretty_generate(taxa))
+    f.close
+    f = open(File.join(@dir, "name_id.json"), "w:utf-8")
+    f.write(JSON.pretty_generate(name2id))
+    f.close
+  end
+  def enterer(r)
+    res = [r["enterer"], r["modifier"]].map(&:to_s)
+      .map(&:strip).uniq.select { |e| e != "" }
+    res.empty? ? "" : res.join(", ")
+  end
+  def extinct(val)
+    val == "extinct" ? 1 : 0
+  end
+  def ecol(row)
+    row = strip row
+    "#{row['life_habit']} #{row['diet']}"
+  end
+  def refs
+    # "reference_no","record_type","ref_type","author1init","author1last",
+    # "author2init","author2last","otherauthors","pubyr","reftitle","pubtitle",
+    # "editors","pubvol","pubno","firstpage","lastpage","publication_type",
+    # "language","doi"
+    # {"id":31671,"orig":true,"author":"Hahn, C. W.",
+    #  "year":1834,"title":"Die wanzenartigen Insecten.",
+    #  "details":"C. H. Zeh, Nurnberg.  2: 33--120.",
+    #  "distribution":"Germany","comment":"n. sp."}
+    refs = {}
+    @refs_csv.each do |r|
+      r = strip r
+      authorship, author = authors(r)
+      refs[r["reference_no"]] = { id: r["reference_no"], author: author,
+                                  authorship: authorship,
+                                  year: r["pubyr"],  title: r["reftitle"],
+                                  details: details(r) }
+    end
+    f = open(File.join(@dir, "refs.json"), "w:utf-8")
+    f.write(JSON.pretty_generate(refs))
+    f.close
+  end
+  def authors(row)
+    row = strip row
+    au = ["#{row['author1init']} #{row['author1last']}".strip,
+          "#{row['author2init']} #{row['author2last']}".strip,
+          "#{row['otherauthors']}".strip]
+    au = au.select { |a| !a.empty? }.map { |a| a.gsub(/[\s]{2,}/, " ").strip }
+    [au[0..1].join(", "), au.join(", ")]
+  end
+  def details(row)
+    row = strip row
+    ref = "#{row['pubtitle']}"
+    ref << " #{row['pubno']}" unless row['pubno'].empty?
+    ref << ": #{row['firstpage']}" unless row['firstpage'].empty?
+    ref << "--#{row['lastpage']}" unless row['lastpage'].empty?
+    ref << " (#{row["doi"]})" unless row['doi'].empty?
+    ref.gsub(/[\s]{2,}/, " ").strip
+  end
+  def taxa_refs
+    tr = {}
+    @taxa_refs_csv.each do |r|
+      r = strip r
+      row = { acc_id: r["accepted_no"], name: r["accepted_name"],
+              ref_id: r["reference_no"] }
+      if tr.key? r["accepted_no"]
+        tr[r["accepted_no"]] << row
+      else
+        tr[r["accepted_no"]] = [row]
+      end
+    end
+    f = open(File.join(@dir, "taxa_refs.json"), "w:utf-8")
+    f.write(JSON.pretty_generate(tr))
+    f.close
+  end
+  def occurences
+    occ = {}
+    @occurences_csv.each_with_index do |r, i|
+      r = strip r
+      row = { id: r["accepted_no"], name: r["accepted_name"], country: r["cc"],
+              state: r["state"], age_min: r["min_ma"], age_max: r["max_ma"] }
+      if occ.key? r["accepted_no"]
+        occ[r["accepted_no"]] << row
+      else
+        occ[r["accepted_no"]] = [row]
+      end
+    end
+    f = open(File.join(@dir, "occurences.json"), "w:utf-8")
+    f.write(JSON.pretty_generate(occ))
+    f.close
+  end
+  def strip(row)
+    row.each_with_object({}) do |(k, v), h|
+      h[k] = v.nil? ? nil : v.strip
+    end
+  end
+end

data/lib/dwca_hunter/resources/sherborn.rb ADDED

@@ -0,0 +1,91 @@
+# frozen_string_literal: true
+module DwcaHunter
+  class ResourceSherborn < DwcaHunter::Resource
+    def initialize(opts = {})
+      @command = "sherborn"
+      @title = "Index Animalium"
+      @url = "https://uofi.box.com/shared/static/kj8a26a3bcrraa4kccoyz5jr5uqrqoe6.csv"
+      @UUID = "05ad6ca2-fc37-47f4-983a-72e535420e28"
+      @download_path = File.join(Dir.tmpdir,
+                                 "dwca_hunter",
+                                 "sherborn",
+                                 "data.csv")
+      @synonyms = []
+      @names = []
+      @vernaculars = []
+      @extensions = []
+      @synonyms_hash = {}
+      @vernaculars_hash = {}
+      super(opts)
+    end
+    def download
+      puts "Downloading."
+      `curl -s -L #{@url} -o #{@download_path}`
+    end
+    def unpack; end
+    def make_dwca
+      DwcaHunter.logger_write(object_id, "Extracting data")
+      get_names
+      generate_dwca
+    end
+    private
+    def get_names
+      Dir.chdir(@download_dir)
+      collect_names
+    end
+    def collect_names
+      dupes = {}
+      @names_index = {}
+      file = CSV.open(File.join(@download_dir, "data.csv"),
+                      headers: false, col_sep: "\t")
+      file.each_with_index do |row, i|
+        next if dupes.key?(row[1])
+        dupes[row[1]] = true
+        taxon_id = row[0]
+        name_string = row[1]
+        @names << { taxon_id: taxon_id,
+                    name_string: name_string }
+        puts "Processed %s names" % i if i % 10_000 == 0
+      end
+    end
+    def generate_dwca
+      DwcaHunter.logger_write(object_id,
+                              "Creating DarwinCore Archive file")
+      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
+                "http://rs.tdwg.org/dwc/terms/scientificName",
+                "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
+      @names.each do |n|
+        @core << [n[:taxon_id], n[:name_string], "ICZN"]
+      end
+      @eml = {
+        id: @uuid,
+        title: @title,
+        authors: [
+          { first_name: "Charles Davies",
+            last_name: "Sherborn" }
+        ],
+        metadata_providers: [
+          { first_name: "Dmitry",
+            last_name: "Mozzherin",
+            email: "dmozzherin@gmail.com" }
+        ],
+        abstract: "Index Animalium is a monumental work that covers " \
+                  "400 000 zoological names registered by science " \
+                  "between 1758 and 1850",
+        url: @url
+      }
+      super
+    end
+  end
+end

data/lib/dwca_hunter/resources/wikispecies.rb CHANGED

@@ -1,18 +1,17 @@
-# encoding: utf-8
+# frozen_string_literal: true
 module DwcaHunter
   class ResourceWikispecies < DwcaHunter::Resource
     def initialize(opts = {})
-      @problems_file = open('problems.txt', 'w:utf-8')
+      @wikisp_path = File.join(Dir.tmpdir, "dwca_hunter", "wikispecies")
+      @problems_file = open(File.join(Dir.tmpdir, "problems.txt"), "w:utf-8")
       @command = "wikispecies"
-      @title = 'Wikispecies'
-      @url = 'http://dumps.wikimedia.org/specieswiki/latest/' \
-             'specieswiki-latest-pages-articles.xml.bz2'
+      @title = "Wikispecies"
+      @url = "http://dumps.wikimedia.org/specieswiki/latest/" \
+             "specieswiki-latest-pages-articles.xml.bz2"
       @url = opts[:url] if opts[:url]
-      @uuid = '68923690-0727-473c-b7c5-2ae9e601e3fd'
-      @download_path = File.join(Dir.tmpdir,
-                                 'dwca_hunter',
-                                 'wikispecies',
-                                 'data.xml.bz2')
+      @uuid = "68923690-0727-473c-b7c5-2ae9e601e3fd"
+      @download_path = File.join(@wikisp_path, "data.xml.bz2")
       @data = []
       @templates = {}
       @taxon_ids = {}
@@ -21,7 +20,7 @@ module DwcaHunter
       @extensions = []
       @re = {
         page_start: /^\s*\<page\>\s*$/,
-        page_end: /^\s*\<\/page\>\s*$/,
+        page_end: %r{^\s*\</page\>\s*$},
         template: /Template:/i,
         template_link: /\{\{([^\}]*)\}\}/,
         vernacular_names: /\{\{\s*VN\s*\|([^\}]+)\}\}/i
@@ -29,6 +28,11 @@ module DwcaHunter
       super(opts)
     end
+    def download
+      puts "Downloading from the source"
+      `curl -L #{@url} -o #{@download_path}`
+    end
     def unpack
       unpack_bz2
     end
@@ -39,22 +43,22 @@ module DwcaHunter
       generate_dwca
     end
-  private
+    private
     def enrich_data
-      DwcaHunter::logger_write(self.object_id,
-                               'Extracting data from xml file...')
+      DwcaHunter.logger_write(object_id,
+                              "Extracting data from xml file...")
       Dir.chdir(@download_dir)
-      f = open('data.xml', 'r:utf-8')
+      f = open("data.xml", "r:utf-8")
       page_on = false
-      page = ''
+      page = ""
       page_num = 0
       f.each do |l|
         if l.match(@re[:page_start])
-          page << l
+          page += l
           page_on = true
         elsif page_on
-          page << l
+          page += l
           if l.match(@re[:page_end])
             page_on = false
             page_xml = Nokogiri::XML.parse(page)
@@ -63,22 +67,22 @@ module DwcaHunter
               process_species(page_xml)
             page_num += 1
             if page_num % BATCH_SIZE == 0
-              DwcaHunter::logger_write(self.object_id,
-                                       "Traversed %s pages" % page_num)
+              DwcaHunter.logger_write(object_id,
+                                      "Traversed %s pages" % page_num)
             end
-            page = ''
+            page = ""
             @page_title = nil
             @page_id = nil
           end
         end
       end
-      DwcaHunter::logger_write(self.object_id,
-                               'Extracted total %s pages' % page_num)
+      DwcaHunter.logger_write(object_id,
+                              "Extracted total %s pages" % page_num)
       f.close
     end
     def extend_classification
-      DwcaHunter::logger_write(self.object_id, 'Extending classifications')
+      DwcaHunter.logger_write(object_id, "Extending classifications")
       @data.each_with_index do |d, i|
         unless d[:classificationPath].empty?
           n = 50
@@ -100,19 +104,21 @@ module DwcaHunter
         # d[:classificationPath] = d[:classificationPath].join("|").
         # gsub("Main Page", "Life")
         if i % BATCH_SIZE == 0 && i > 0
-          DwcaHunter::logger_write(self.object_id,
-                                   "Extended %s classifications" % i)
+          DwcaHunter.logger_write(object_id,
+                                  "Extended %s classifications" % i)
         end
       end
     end
     def update_tree(path)
       path = path.dup
-      return if @paths.has_key?(path.join('|'))
+      return if @paths.key?(path.join("|"))
       (0...path.size).each do |i|
         subpath = path[0..i]
-        subpath_string = subpath.join('|')
-        next if @paths.has_key?(subpath_string)
+        subpath_string = subpath.join("|")
+        next if @paths.key?(subpath_string)
         name = subpath.pop
         tree_element = subpath.inject(@tree) { |res, n| res[n] }
         tree_element[name] = {}
@@ -121,27 +127,29 @@ module DwcaHunter
     end
     def process_template(x)
-      name = page_title(x).gsub!(@re[:template], '').strip
-      text = x.xpath('//text').text.strip
+      name = page_title(x).gsub!(@re[:template], "").strip
+      text = x.xpath("//text").text.strip
       parent_name = text.match(@re[:template_link])
       if parent_name
         return if parent_name[1].match(/\#if/)
         list = parent_name[1].split("|")
-        if list.size == 1
-          parent_name = list[0]
-        elsif list[0].match /Taxonav/i
-          parent_name = list[1]
-        else
-          parent_name = list[0]
-        end
+        parent_name = if list.size == 1
+                        list[0]
+                      elsif list[0].match(/Taxonav/i)
+                        list[1]
+                      else
+                        list[0]
+                      end
       end
-      name.gsub!(/_/, ' ')
-      parent_name.gsub!(/_/, ' ') if parent_name
+      name.gsub!(/_/, " ")
+      parent_name&.gsub!(/_/, " ")
       @templates[name] = { parentName: parent_name, id: page_id(x) }
     end
     def process_species(x)
       return if page_title(x).match(/Wikispecies/i)
       items = find_species_components(x)
       if items
         @data << {
@@ -149,7 +157,8 @@ module DwcaHunter
           canonicalForm: page_title(x),
           scientificName: page_title(x),
           classificationPath: [],
-          vernacularNames: [] }
+          vernacularNames: []
+        }
         get_full_scientific_name(items)
         get_vernacular_names(items)
         init_classification_path(items)
@@ -157,8 +166,8 @@ module DwcaHunter
     end
     def get_full_scientific_name(items)
-      if items['name']
-        if name = items['name'][0]
+      if items["name"]
+        if name = items["name"][0]
           @data[-1][:scientificName] = parse_name(name, @data[-1])
         else
           @problems_file.write("%s\n" % @data[-1][:canonicalForm])
@@ -167,19 +176,20 @@ module DwcaHunter
     end
     def get_vernacular_names(items)
-      if items['vernacular names'] && items['vernacular names'].size > 0
-        vn_string = items['vernacular names'].join("")
+      if items["vernacular names"] && !items["vernacular names"].empty?
+        vn_string = items["vernacular names"].join("")
         vn = vn_string.match(@re[:vernacular_names])
         if vn
           vn_list = vn[1].strip.split("|")
           vnames = []
           vn_list.each do |item|
-            language, name = item.split("=").map { |x| x.strip }
-            if language && name && language.size < 4 && name.valid_encoding?
-              vnames << {
-                name: name,
-                language: language }
-            end
+            language, name = item.split("=").map(&:strip)
+            next unless language && name && language.size < 4 && name.valid_encoding?
+            vnames << {
+              name: name,
+              language: language
+            }
           end
           @data[-1][:vernacularNames] = vnames
@@ -188,26 +198,26 @@ module DwcaHunter
     end
     def init_classification_path(items)
-      if items['taxonavigation']
-        items['taxonavigation'].each do |line|
-          line.gsub!(/\[\[.*\]\]/, '') # ignore non-template links
-          if template_link = line.match(@re[:template_link])
-            template_link = template_link[1].
-              strip.gsub(/Template:/, '').gsub(/_/, ' ')
-            if !template_link.match(/\|/)
-              @data[-1][:classificationPath] << template_link
-              break
-            end
-          end
+      # ignore non-template links
+      items["taxonavigation"]&.each do |line|
+        line.gsub!(/\[\[.*\]\]/, "") # ignore non-template links
+        next unless template_link = line.match(@re[:template_link])
+        template_link = template_link[1].
+                        strip.gsub(/Template:/, "").gsub(/_/, " ")
+        unless template_link.match(/\|/)
+          @data[-1][:classificationPath] << template_link
+          break
         end
       end
     end
     def find_species_components(x)
-      items = get_items(x.xpath('//text').text)
-      is_taxon_item = items.has_key?('name') ||
-                      items.has_key?('taxonavigation')
+      items = get_items(x.xpath("//text").text)
+      is_taxon_item = items.key?("name") ||
+                      items.key?("taxonavigation")
       return nil unless is_taxon_item
       items
     end
@@ -216,7 +226,7 @@ module DwcaHunter
       items = {}
       current_item = nil
       txt.split("\n").each do |l|
-        item =  l.match(/[\=]+([^\=]+)[\=]+/)
+        item = l.match(/[\=]+([^\=]+)[\=]+/)
         if item
           current_item = item[1].strip.downcase
           items[current_item] = []
@@ -228,11 +238,11 @@ module DwcaHunter
     end
     def page_title(x)
-      @page_title ||= x.xpath('//title').first.text
+      @page_title ||= x.xpath("//title").first.text
     end
     def page_id(x)
-      @page_id ||= x.xpath('//id').first.text
+      @page_id ||= x.xpath("//id").first.text
     end
     def template?(page_xml)
@@ -240,110 +250,113 @@ module DwcaHunter
     end
     def parse_name(name_string, taxa)
-      name_string.gsub!('BASEPAGENAME', taxa[:canonicalForm])
+      name_string.gsub!("BASEPAGENAME", taxa[:canonicalForm])
       name_string = name_string.strip
       old_l = name_string.dup
-      name_string.gsub! /^\*\s*/, ''
+      name_string.gsub!(/^\*\s*/, "")
       name_string.gsub!(/\[\[([^\]]+\|)?([^\]]*)\]\]/, '\2')
       name_string.gsub!(/\{\{([^\}]+\|)?([^\}]*)\}\}/, '\2')
-      name_string.gsub!(/[']{2,}/, ' ')
-      name_string.gsub!(/["]{2,}/, ' ')
-      name_string.gsub!(/\:\s*\d.*$/, '')
-      name_string.gsub!(/,\s*\[RSD\]/i, '')
-      name_string.gsub!(/^\s*†\s*/, '')
-      name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/, '')
+      name_string.gsub!(/[']{2,}/, " ")
+      name_string.gsub!(/["]{2,}/, " ")
+      name_string.gsub!(/\:\s*\d.*$/, "")
+      name_string.gsub!(/,\s*\[RSD\]/i, "")
+      name_string.gsub!(/^\s*†\s*/, "")
+      name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/, "")
       # name_string = DwcaHunter::XML.unescape(name_string)
-      name_string.gsub!(/\<nowiki\>.*$/, '')
-      name_string.gsub!(/\<br\s*[\/]?\s*\>/, '')
-      name_string.gsub!(/^\s*\&dagger;\s*/, '')
-      name_string.gsub!(/&nbsp;/, ' ')
-      name_string.gsub!(/\s+/, ' ')
+      name_string.gsub!(/\<nowiki\>.*$/, "")
+      name_string.gsub!(%r{\<br\s*[/]?\s*\>}, "")
+      name_string.gsub!(/^\s*\&dagger;\s*/, "")
+      name_string.gsub!(/&nbsp;/, " ")
+      name_string.gsub!(/\s+/, " ")
       name_string = name_string.strip
       # puts "%s---%s" % [name_string, old_l]
-      return name_string
+      name_string
     end
     def generate_dwca
-      DwcaHunter::logger_write(self.object_id,
-                               'Creating DarwinCore Archive file')
+      DwcaHunter.logger_write(object_id,
+                              "Creating DarwinCore Archive file")
       @core = [
-        ['http://rs.tdwg.org/dwc/terms/taxonID',
-         'http://rs.tdwg.org/dwc/terms/scientificName',
-         'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
-         'http://globalnames.org/terms/canonicalForm',
-         'http://rs.tdwg.org/dwc/terms/higherClassification',
-         'http://purl.org/dc/terms/source']
+        ["http://rs.tdwg.org/dwc/terms/taxonID",
+         "http://rs.tdwg.org/dwc/terms/scientificName",
+         "http://globalnames.org/terms/canonicalForm",
+         "http://purl.org/dc/terms/source"]
       ]
-      DwcaHunter::logger_write(self.object_id, 'Assembling Core Data')
+      DwcaHunter.logger_write(object_id, "Assembling Core Data")
       count = 0
       @data.map do |d|
         count += 1
         if count % BATCH_SIZE == 0
-          DwcaHunter::logger_write(self.object_id,
-                                   "Traversing %s core data record" % count)
+          DwcaHunter.logger_write(object_id,
+                                  "Traversing %s core data record" % count)
         end
-        taxon_id = (d[:classificationPath].empty? ?
-                    d[:taxonId] :
-                    @templates[d[:classificationPath].
-                      last][:id]) rescue d[:taxonId]
+        taxon_id = begin
+                     (d[:classificationPath].empty? ?
+                                         d[:taxonId] :
+                                         @templates[d[:classificationPath].
+                                           last][:id])
+                   rescue StandardError
+                     d[:taxonId]
+                   end
         @taxon_ids[d[:taxonId]] = taxon_id
-        parentNameUsageId = (d[:classificationPath].size > 1 ?
-                             @templates[d[:classificationPath][-2]][:id] :
-                             nil) rescue nil
-        url = 'http://species.wikimedia.org/wiki/' +
-          URI.encode(d[:canonicalForm].gsub(' ', '_'))
+        parentNameUsageId = begin
+                              (d[:classificationPath].size > 1 ?
+                                                           @templates[d[:classificationPath][-2]][:id] :
+                                                           nil)
+                            rescue StandardError
+                              nil
+                            end
+        url = "http://species.wikimedia.org/wiki/" +
+              URI.encode(d[:canonicalForm].gsub(" ", "_"))
         path = d[:classificationPath]
         path.pop if path[-1] == d[:canonicalForm]
-        canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/, '').strip
-        scientific_name = (d[:scientificName] == d[:canonicalForm]) ?
+        canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/, "").strip
+        scientific_name = d[:scientificName] == d[:canonicalForm] ?
                            canonical_form :
                            d[:scientificName]
         @core << [taxon_id,
                   scientific_name,
-                  parentNameUsageId,
                   canonical_form,
-                  path.join('|'),
                   url]
       end
       @extensions << { data: [[
-        'http://rs.tdwg.org/dwc/terms/TaxonID',
-        'http://rs.tdwg.org/dwc/terms/vernacularName',
-        'http://purl.org/dc/terms/language'
-      ]], file_name: 'vernacular_names.txt' }
-      DwcaHunter::logger_write(self.object_id,
-              'Creating verncaular name extension for DarwinCore Archive file')
+        "http://rs.tdwg.org/dwc/terms/TaxonID",
+        "http://rs.tdwg.org/dwc/terms/vernacularName",
+        "http://purl.org/dc/terms/language"
+      ]], file_name: "vernacular_names.txt" }
+      DwcaHunter.logger_write(object_id,
+                              "Creating verncaular name extension for DarwinCore Archive file")
       count = 0
       @data.each do |d|
         count += 1
         if count % BATCH_SIZE == 0
-          DwcaHunter::logger_write(self.object_id,
-                                 "Traversing %s extension data record" % count)
+          DwcaHunter.logger_write(object_id,
+                                  "Traversing %s extension data record" % count)
         end
         d[:vernacularNames].each do |vn|
-          taxon_id = @taxon_ids[d[:taxonId]] ? @taxon_ids[d[:taxonId]] : nil
-          if taxon_id
-            @extensions[-1][:data] << [taxon_id, vn[:name], vn[:language]]
-          end
+          taxon_id = @taxon_ids[d[:taxonId]] || nil
+          @extensions[-1][:data] << [taxon_id, vn[:name], vn[:language]] if taxon_id
         end
       end
       @eml = {
         id: @uuid,
         title: @title,
-        license: 'http://creativecommons.org/licenses/by-sa/3.0/',
+        license: "http://creativecommons.org/licenses/by-sa/3.0/",
         authors: [
-          { first_name: 'Stephen',
-            last_name: 'Thorpe',
-            email: 'stephen_thorpe@yahoo.co.nz',
-            url: 'http://species.wikimedia.org/wiki/Main_Page' }],
-        abstract: 'The free species directory that anyone can edit.',
+          { first_name: "Stephen",
+            last_name: "Thorpe",
+            email: "stephen_thorpe@yahoo.co.nz",
+            url: "http://species.wikimedia.org/wiki/Main_Page" }
+        ],
+        abstract: "The free species directory that anyone can edit.",
         metadata_providers: [
-          { first_name: 'Dmitry',
-            last_name: 'Mozzherin',
-            email: 'dmozzherin@mbl.edu' }],
-        url: 'http://species.wikimedia.org/wiki/Main_Page'
+          { first_name: "Dmitry",
+            last_name: "Mozzherin",
+            email: "dmozzherin@mbl.edu" }
+        ],
+        url: "http://species.wikimedia.org/wiki/Main_Page"
       }
       super
     end
   end
 end