RubyGems - dwca_hunter - Versions diffs - 0.5.1 → 0.7.0 - Mend

dwca_hunter 0.5.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

checksums.yaml +4 -4
data/.byebug_history +45 -0
data/.gitignore +5 -0
data/.rubocop.yml +3 -2
data/.ruby-version +1 -1
data/Gemfile.lock +61 -83
data/LICENSE.txt +1 -1
data/README.md +1 -1
data/dwca_hunter.gemspec +9 -9
data/exe/dwcahunter +1 -3
data/lib/dwca_hunter.rb +39 -8
data/lib/dwca_hunter/resource.rb +5 -0
data/lib/dwca_hunter/resources/aos-birds.rb +143 -0
data/lib/dwca_hunter/resources/arctos.rb +121 -145
data/lib/dwca_hunter/resources/clements.rb +151 -0
data/lib/dwca_hunter/resources/eol.rb +85 -0
data/lib/dwca_hunter/resources/freebase.rb +51 -49
data/lib/dwca_hunter/resources/how-moore-birds.rb +168 -0
data/lib/dwca_hunter/resources/ioc_word_bird.rb +200 -0
data/lib/dwca_hunter/resources/ipni.rb +111 -0
data/lib/dwca_hunter/resources/itis.rb +99 -99
data/lib/dwca_hunter/resources/mammal_divdb.rb +155 -0
data/lib/dwca_hunter/resources/mammal_species.rb +9 -6
data/lib/dwca_hunter/resources/mcz.rb +123 -0
data/lib/dwca_hunter/resources/ncbi.rb +22 -23
data/lib/dwca_hunter/resources/opentree.rb +5 -5
data/lib/dwca_hunter/resources/paleobiodb.rb +193 -0
data/lib/dwca_hunter/resources/paleodb_harvester.rb +140 -0
data/lib/dwca_hunter/resources/sherborn.rb +91 -0
data/lib/dwca_hunter/resources/wikispecies.rb +142 -129
data/lib/dwca_hunter/version.rb +1 -1
metadata +46 -40
data/files/birdlife_7.csv +0 -11862
data/files/fishbase_taxon_cache.tsv +0 -81000
data/files/reptile_checklist_2014_12.csv +0 -15158
data/files/species-black.txt +0 -251

data/lib/dwca_hunter/resources/eol.rb ADDED

@@ -0,0 +1,85 @@
+require "xz"
+module DwcaHunter
+  # Resource for the Encyclopedia of Life (EOL): reads the provider-ids CSV
+  # fetched from @url and turns it into a DarwinCore Archive.
+  class ResourceEOL < DwcaHunter::Resource
+    attr_reader :title, :abbr
+    # Sets the EOL command name, title, source URL, UUID and download
+    # locations, then hands +opts+ unchanged to DwcaHunter::Resource
+    # (presumably flags such as download:/unpack: -- confirm against the
+    # parent class).
+    def initialize(opts = {})
+      @command = "eol"
+      @title = "Encyclopedia of Life"
+      @abbr = "EOL"
+      @url = "https://eol.org/data/provider_ids.csv.gz"
+      @uuid = "dba5f880-a40d-479b-a1ad-a646835edde4"
+      @download_dir = File.join(Dir.tmpdir, "dwca_hunter", "eol")
+      @download_path = File.join(@download_dir, "eol.csv.gz")
+      @extensions = []
+      super
+    end
+    # Unpacks the downloaded gzip file via the inherited unpack_gzip helper.
+    def unpack
+      unpack_gzip
+    end
+    # Reads the unpacked CSV and writes the DarwinCore Archive.
+    def make_dwca
+      organize_data
+      generate_dwca
+    end
+    private
+    # Loads the unpacked CSV into @data as an array of
+    # { taxon_id:, local_id:, scientific_name: } hashes and returns it.
+    def organize_data
+      DwcaHunter.logger_write(object_id, "Organizing data")
+      # @download_path ends in ".gz"; dropping the last three characters
+      # gives the path the CSV is read from.
+      csv_path = @download_path[0...-3]
+      rows = []
+      # The block form of CSV.foreach closes the file when iteration ends;
+      # the previous CSV.open call left the handle open.
+      CSV.foreach(csv_path, col_sep: ",", headers: true) do |row|
+        id = row["page_id"].strip
+        name = row["preferred_canonical_for_page"].strip
+        rows << { taxon_id: id,
+                  local_id: id,
+                  scientific_name: name }
+      end
+      @data = rows
+    end
+    # Builds the core table and EML metadata from @data, then lets the
+    # parent class finish the archive.
+    def generate_dwca
+      DwcaHunter.logger_write(object_id,
+                              "Creating DarwinCore Archive file")
+      core_init
+      eml_init
+      DwcaHunter.logger_write(object_id, "Assembling Core Data")
+      @data.each.with_index(1) do |d, count|
+        # Progress message every 10,000 rows.
+        if count % 10_000 == 0
+          DwcaHunter.logger_write(object_id, "Core row #{count}")
+        end
+        @core << [d[:taxon_id], d[:local_id], d[:scientific_name]]
+      end
+      super
+    end
+    # Fills @eml with the metadata describing this resource.
+    def eml_init
+      @eml = {
+        id: @uuid,
+        title: @title,
+        authors: [],
+        metadata_providers: [
+          { first_name: "Dmitry",
+            last_name: "Mozzherin" }
+        ],
+        abstract: "Global access to knowledge about life on Earth",
+        url: "http://www.eol.org"
+      }
+    end
+    # Starts @core with the header row of term URIs; data rows appended in
+    # generate_dwca follow the same column order.
+    def core_init
+      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
+                "http://globalnames.org/terms/localID",
+                "http://rs.tdwg.org/dwc/terms/scientificName"]]
+    end
+  end
+end

data/lib/dwca_hunter/resources/freebase.rb CHANGED

@@ -1,15 +1,15 @@
-# encoding: utf-8
+# frozen_string_literal: true
 module DwcaHunter
   class ResourceFreebase < DwcaHunter::Resource
     def initialize(opts = {})
       @command = "freebase"
-      @title = 'Freebase'
-      @uuid = 'bacd21f0-44e0-43e2-914c-70929916f257'
+      @title = "Freebase"
+      @uuid = "bacd21f0-44e0-43e2-914c-70929916f257"
       @download_path = File.join(Dir.tmpdir,
-                                 'dwca_hunter',
-                                 'freebase',
-                                 'data.json')
+                                 "dwca_hunter",
+                                 "freebase",
+                                 "data.json")
       @data = []
       @all_taxa = {}
       @cleaned_taxa = {}
@@ -27,11 +27,11 @@ module DwcaHunter
     end
     def download
-      DwcaHunter::logger_write(self.object_id,
-                               'Querying freebase for species information...')
+      DwcaHunter.logger_write(object_id,
+                              "Querying freebase for species information...")
       q = {
         query: [{
-          type: '/biology/organism_classification',
+          type: "/biology/organism_classification",
           id: nil,
           guid: nil,
           name: nil,
@@ -41,16 +41,16 @@ module DwcaHunter
             id: nil,
             guid: nil,
             scientific_name: nil,
-            optional: true,
-          },
+            optional: true
+          }
         }],
-        cursor: true,
+        cursor: true
       }
       run_query(q)
       data = JSON.pretty_generate @data
-      f = open(@download_path, 'w:utf-8')
+      f = open(@download_path, "w:utf-8")
       f.write(data)
       f.close
     end
@@ -60,31 +60,32 @@ module DwcaHunter
     def run_query(q)
       count = 0
       requests_num = 0
-      while true
+      loop do
         freebase_url = "http://api.freebase.com/api/service/mqlread?query=%s" %
-          URI.encode(q.to_json)
+                       URI.encode(q.to_json)
         res = JSON.load RestClient.get(freebase_url)
         requests_num += 1
-        break if res['result'] == nil || res['result'].empty?
+        break if res["result"].nil? || res["result"].empty?
         if requests_num % 10 == 0
-          DwcaHunter::logger_write(self.object_id,
-                                   "Received %s names" % count)
+          DwcaHunter.logger_write(object_id,
+                                  "Received %s names" % count)
         end
-        count += res['result'].size
-        res['result'].each { |d| @data << d }
-        q[:cursor] = res['cursor']
+        count += res["result"].size
+        res["result"].each { |d| @data << d }
+        q[:cursor] = res["cursor"]
       end
     end
     def organize_data
-      @data = JSON.load(open(@download_path, 'r:utf-8').read)
+      @data = JSON.load(open(@download_path, "r:utf-8").read)
       @data.each do |d|
-        scientific_name = d['scientific_name'].to_s
+        scientific_name = d["scientific_name"].to_s
         id = d["id"]
-        parent_id = d['higher_classification'] ?
-                    d['higher_classification']["id"] :
+        parent_id = d["higher_classification"] ?
+                    d["higher_classification"]["id"] :
                     nil
-        synonyms = d['synonym_scientific_name']
+        synonyms = d["synonym_scientific_name"]
         @all_taxa[id] = { id: id,
                           parent_id: parent_id,
                           scientific_name: scientific_name,
@@ -93,6 +94,7 @@ module DwcaHunter
       @all_taxa.each do |k, v|
         next unless v[:scientific_name] && v[:scientific_name].strip != ""
         parent_id = v[:parent_id]
         until (@all_taxa[parent_id] &&
                 @all_taxa[parent_id][:scientific_name]) || parent_id.nil?
@@ -103,29 +105,28 @@ module DwcaHunter
         v[:parent_id] = parent_id
         @cleaned_taxa[k] = v
       end
     end
     def generate_dwca
-      DwcaHunter::logger_write(self.object_id,
-                               'Creating DarwinCore Archive file')
-      @core = [['http://rs.tdwg.org/dwc/terms/taxonID',
-                'http://rs.tdwg.org/dwc/terms/scientificName',
-                'http://rs.tdwg.org/dwc/terms/parentNameUsageID']]
+      DwcaHunter.logger_write(object_id,
+                              "Creating DarwinCore Archive file")
+      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
+                "http://rs.tdwg.org/dwc/terms/scientificName",
+                "http://rs.tdwg.org/dwc/terms/parentNameUsageID"]]
       @extensions << { data: [[
-        'http://rs.tdwg.org/dwc/terms/TaxonID',
-        'http://rs.tdwg.org/dwc/terms/scientificName',
-      ]], file_name: 'synonyms.txt' }
-      DwcaHunter::logger_write(self.object_id,
-                    'Creating synonyms extension for DarwinCore Archive file')
+        "http://rs.tdwg.org/dwc/terms/TaxonID",
+        "http://rs.tdwg.org/dwc/terms/scientificName"
+      ]], file_name: "synonyms.txt" }
+      DwcaHunter.logger_write(object_id,
+                              "Creating synonyms extension for DarwinCore Archive file")
       count = 0
-      @cleaned_taxa.each do |key, taxon|
+      @cleaned_taxa.each do |_key, taxon|
         count += 1
         @core << [taxon[:id], taxon[:scientific_name], taxon[:parent_id]]
         if count % BATCH_SIZE == 0
-          DwcaHunter::logger_write(self.object_id,
-                                 "Traversing %s extension data record" % count)
+          DwcaHunter.logger_write(object_id,
+                                  "Traversing %s extension data record" % count)
         end
         taxon[:synonyms].each do |name|
           @extensions[-1][:data] << [taxon[:id], name]
@@ -134,19 +135,20 @@ module DwcaHunter
       @eml = {
         id: @uuid,
         title: @title,
-        license: 'http://creativecommons.org/licenses/by-sa/3.0/',
+        license: "http://creativecommons.org/licenses/by-sa/3.0/",
         authors: [
-          { url: 'http://www.freebase.com/home' }],
-        abstract: 'An entity graph of people, places and things, ' +
-                  'built by a community that loves open data.',
+          { url: "http://www.freebase.com/home" }
+        ],
+        abstract: "An entity graph of people, places and things, " \
+                  "built by a community that loves open data.",
         metadata_providers: [
-          { first_name: 'Dmitry',
-            last_name: 'Mozzherin',
-            email: 'dmozzherin@mbl.edu' }],
-        url: 'http://www.freebase.com/home'
+          { first_name: "Dmitry",
+            last_name: "Mozzherin",
+            email: "dmozzherin@mbl.edu" }
+        ],
+        url: "http://www.freebase.com/home"
       }
       super
     end
   end
 end

data/lib/dwca_hunter/resources/how-moore-birds.rb ADDED

@@ -0,0 +1,168 @@
+# frozen_string_literal: true
+module DwcaHunter
+  # Resource for the Howard and Moore Complete Checklist of the Birds of the
+  # World: downloads a cached CSV copy of the checklist and converts it to a
+  # DarwinCore Archive with a vernacular-names extension.
+  class ResourceHowardMoore < DwcaHunter::Resource
+    # Sets the command name, title, source URL, UUID and download path,
+    # initialises the collections filled while parsing, and passes +opts+
+    # to DwcaHunter::Resource.
+    def initialize(opts = {})
+      @command = "how-moore-birds"
+      @title = "Howard and Moore Complete Checklist of the Birds of the World"
+      @url = "https://uofi.box.com/shared/static/m71m541dr5unc41xzg4y51d92b7wiy2k.csv"
+      # Must be @uuid (lower case): generate_dwca reads @uuid for the EML id,
+      # so the former @UUID spelling left that id nil.
+      @uuid = "85023fe5-bf2a-486b-bdae-3e61cefd41fd"
+      @download_path = File.join(Dir.tmpdir,
+                                 "dwca_hunter",
+                                 "how-moore-birds",
+                                 "data.csv")
+      @synonyms = []
+      @names = []
+      @vernaculars = []
+      @extensions = []
+      @synonyms_hash = {}
+      @vernaculars_hash = {}
+      super(opts)
+    end
+    # Fetches the cached CSV from @url into @download_path with curl and
+    # reminds the operator where to look for a newer edition.
+    def download
+      puts "Downloading cached version of the file."
+      puts "Check https://www.howardandmoore.org/howard-and-moore-database/"
+      puts "If there is a more recent edition"
+      `curl -s -L #{@url} -o #{@download_path}`
+    end
+    # Nothing to unpack: the download is already a plain CSV file.
+    def unpack; end
+    # Parses the downloaded CSV and writes the DarwinCore Archive.
+    def make_dwca
+      DwcaHunter.logger_write(object_id, "Extracting data")
+      get_names
+      generate_dwca
+    end
+    private
+    # Changes the working directory to @download_dir and parses the CSV.
+    def get_names
+      Dir.chdir(@download_dir)
+      collect_names
+    end
+    # Reads data.csv row by row, appending one entry to @names per row and
+    # one entry to @vernaculars per non-empty English name column.
+    def collect_names
+      # The block form of CSV.open closes the file when the block returns.
+      CSV.open(File.join(@download_dir, "data.csv"), headers: true) do |file|
+        file.each_with_index do |row, i|
+          # Taxon ids are generated from the 1-based row position.
+          taxon_id = "gn_#{i + 1}"
+          family = row["FAMILY_NAME"].capitalize
+          genus = row["GENERA_NAME"].capitalize
+          species = row["SPECIES_NAME"]
+          species_au =
+            "#{row['species_author']} #{row['species_rec_year']}".strip
+          subspecies = row["SUB_SPECIES_NAME"]
+          subspecies_au =
+            "#{row['subspecies_author']} #{row['subspecies_rec_year']}".strip
+          # When the subspecies cell is blank or its text already occurs in
+          # SPECIES_NAME, use the species string with the species authorship;
+          # otherwise append the subspecies and its authorship.
+          name_string = if subspecies.to_s == "" ||
+                           species.include?(subspecies)
+                          "#{species} #{species_au}".strip
+                        else
+                          "#{species} #{subspecies} #{subspecies_au}".strip
+                        end
+          @names << { taxon_id: taxon_id,
+                      name_string: name_string,
+                      kingdom: "Animalia",
+                      phylum: "Chordata",
+                      klass: "Aves",
+                      family: family,
+                      genus: genus,
+                      code: "ICZN" }
+          %w[species_english_name species_english_name2].each do |column|
+            next if row[column].to_s == ""
+            @vernaculars << { taxon_id: taxon_id,
+                              vern: row[column],
+                              lang: "en" }
+          end
+          puts "Processed %s names" % i if i % 10_000 == 0
+        end
+      end
+    end
+    # Copies the vernaculars stored under +canonical+ in @vernaculars_hash to
+    # @vernaculars for +taxon_id+. Not called anywhere in this class, and
+    # nothing here populates @vernaculars_hash.
+    def update_vernacular(taxon_id, canonical)
+      return unless @vernaculars_hash.key?(canonical)
+      @vernaculars_hash[canonical].each do |vern|
+        @vernaculars << { taxon_id: taxon_id, vern: vern }
+      end
+    end
+    # Copies the synonyms stored under +canonical+ in @synonyms_hash to
+    # @synonyms for +taxon_id+. Not called anywhere in this class, and
+    # nothing here populates @synonyms_hash.
+    def update_synonym(taxon_id, canonical)
+      return unless @synonyms_hash.key?(canonical)
+      @synonyms_hash[canonical].each do |syn|
+        @synonyms << { taxon_id: taxon_id, name_string: syn[:name_string],
+                       status: syn[:status] }
+      end
+    end
+    # Builds the core table, the vernacular-names extension and the EML
+    # metadata, then lets the parent class finish the archive.
+    def generate_dwca
+      DwcaHunter.logger_write(object_id,
+                              "Creating DarwinCore Archive file")
+      # Header row of term URIs; data rows below use the same column order.
+      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
+                "http://rs.tdwg.org/dwc/terms/scientificName",
+                "http://rs.tdwg.org/dwc/terms/kingdom",
+                "http://rs.tdwg.org/dwc/terms/phylum",
+                "http://rs.tdwg.org/dwc/terms/class",
+                "http://rs.tdwg.org/dwc/terms/family",
+                "http://rs.tdwg.org/dwc/terms/genus",
+                "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
+      @names.each do |n|
+        @core << [n[:taxon_id], n[:name_string],
+                  n[:kingdom], n[:phylum], n[:klass], n[:family],
+                  n[:genus], n[:code]]
+      end
+      vernacular_ext = {
+        data: [[
+          "http://rs.tdwg.org/dwc/terms/taxonID",
+          "http://rs.tdwg.org/dwc/terms/vernacularName",
+          "http://purl.org/dc/terms/language"
+        ]],
+        file_name: "vernacular_names.txt",
+        row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
+      }
+      @extensions << vernacular_ext
+      @vernaculars.each do |v|
+        vernacular_ext[:data] << [v[:taxon_id], v[:vern], v[:lang]]
+      end
+      @eml = {
+        id: @uuid,
+        title: @title,
+        authors: [
+          { last_name: "Christidis" }
+        ],
+        metadata_providers: [
+          { first_name: "Dmitry",
+            last_name: "Mozzherin",
+            email: "dmozzherin@gmail.com" }
+        ],
+        abstract: "Christidis et al. 2018. The Howard and Moore Complete " \
+                  "Checklist of the Birds of the World, version 4.1 " \
+                  "(Downloadable checklist). " \
+                  "Accessed from https://www.howardandmoore.org.",
+        url: @url
+      }
+      super
+    end
+  end
+end

data/lib/dwca_hunter/resources/ioc_word_bird.rb ADDED

@@ -0,0 +1,200 @@
+# frozen_string_literal: true
+module DwcaHunter
+  # Resource for the IOC World Bird List: downloads a cached CSV conversion
+  # of the master list and turns it into a DarwinCore Archive with a
+  # vernacular-names extension.
+  class ResourceIOCWorldBird < DwcaHunter::Resource
+    # Sets the command name, title, source URL, UUID and download path,
+    # initialises the collections filled while parsing, and passes +opts+
+    # to DwcaHunter::Resource.
+    def initialize(opts = {})
+      @command = "ioc-world-bird"
+      @title = "IOC World Bird List"
+      @url = "https://uofi.box.com/shared/static/fbpuk5ghh9083nbzjdeyqtoqsvdnro01.csv"
+      # Must be @uuid (lower case): generate_dwca reads @uuid for the EML id,
+      # so the former @UUID spelling left that id nil.
+      @uuid = "6421ffec-38e3-40fb-a6d9-af27238a47a1"
+      @download_path = File.join(Dir.tmpdir,
+                                 "dwca_hunter",
+                                 "ioc-bird",
+                                 "data.csv")
+      @synonyms = []
+      @names = []
+      @vernaculars = []
+      @extensions = []
+      @synonyms_hash = {}
+      @vernaculars_hash = {}
+      super(opts)
+    end
+    # Fetches the cached CSV from @url into @download_path with curl and
+    # reminds the operator where to look for a newer list.
+    def download
+      puts "Downloading cached and converted to csv version."
+      puts "CHECK FOR NEW VERSION at"
+      puts "https://www.worldbirdnames.org/ioc-lists/master-list-2/"
+      puts "Use libreoffice to convert to csv."
+      `curl -s -L #{@url} -o #{@download_path}`
+    end
+    # Nothing to unpack: the download is already a plain CSV file.
+    def unpack; end
+    # Parses the downloaded CSV and writes the DarwinCore Archive.
+    def make_dwca
+      DwcaHunter.logger_write(object_id, "Extracting data")
+      get_names
+      generate_dwca
+    end
+    private
+    # Changes the working directory to @download_dir and parses the CSV.
+    def get_names
+      Dir.chdir(@download_dir)
+      collect_names
+    end
+    # Reads data.csv row by row, appending one entry to @names for every row
+    # that names a species or subspecies, and one entry to @vernaculars for
+    # every species row with an English name.
+    def collect_names
+      @names_index = {}
+      order = ""
+      family = ""
+      genus = ""
+      species = ""
+      count = 0
+      # The block form of CSV.open closes the file when the block returns.
+      CSV.open(File.join(@download_dir, "data.csv"), headers: true) do |file|
+        file.each do |row|
+          # Order, family and genus keep the value of the last row in which
+          # the corresponding cell was non-empty.
+          order1 = row["Order"]
+          order = order1.capitalize if order1.to_s != ""
+          family1 = row["Family (Scientific)"]
+          family = family1.capitalize if family1.to_s != ""
+          genus1 = row["Genus"]
+          genus = genus1.capitalize if genus1.to_s != ""
+          species1 = row["Species (Scientific)"]
+          species = species1 if species1.to_s != ""
+          subspecies = row["Subspecies"]
+          next if species.to_s == ""
+          # Taxon ids number only the rows that produce a name.
+          count += 1
+          taxon_id = "gn_#{count}"
+          name = {
+            taxon_id: taxon_id,
+            kingdom: "Animalia",
+            phylum: "Chordata",
+            klass: "Aves",
+            order: order,
+            family: family,
+            genus: genus,
+            code: "ICZN"
+          }
+          if subspecies.to_s == ""
+            auth = row["Authority"].to_s
+            auth = DwcaHunter.normalize_authors(auth) if auth != ""
+            name[:name_string] = clean("#{genus} #{species} #{auth}".strip)
+            vernacular = row["Species (English)"]
+            if vernacular.to_s != ""
+              @vernaculars << { taxon_id: taxon_id,
+                                vern: vernacular,
+                                lang: "en" }
+            end
+          else
+            # NOTE(review): unlike the species branch, the authority is not
+            # passed through DwcaHunter.normalize_authors here -- confirm
+            # whether that is intended.
+            name[:name_string] = clean(
+              "#{genus} #{species} #{subspecies} #{row['Authority']}".strip
+            )
+          end
+          @names << name
+          # Species is reset after every emitted row, so rows whose species
+          # cell is blank are skipped by the guard above.
+          species = ""
+        end
+      end
+    end
+    # Removes dagger marks and collapses every whitespace run to one space.
+    def clean(n)
+      n.delete("†").gsub(/\s+/, " ")
+    end
+    # Builds the core table, the vernacular-names extension and the EML
+    # metadata, then lets the parent class finish the archive.
+    def generate_dwca
+      DwcaHunter.logger_write(object_id,
+                              "Creating DarwinCore Archive file")
+      # Header row of term URIs; data rows below use the same column order.
+      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
+                "http://rs.tdwg.org/dwc/terms/scientificName",
+                "http://rs.tdwg.org/dwc/terms/kingdom",
+                "http://rs.tdwg.org/dwc/terms/phylum",
+                "http://rs.tdwg.org/dwc/terms/class",
+                "http://rs.tdwg.org/dwc/terms/order",
+                "http://rs.tdwg.org/dwc/terms/family",
+                "http://rs.tdwg.org/dwc/terms/genus",
+                "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
+      @names.each do |n|
+        @core << [n[:taxon_id], n[:name_string],
+                  n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
+                  n[:genus], n[:code]]
+      end
+      vernacular_ext = {
+        data: [[
+          "http://rs.tdwg.org/dwc/terms/taxonID",
+          "http://rs.tdwg.org/dwc/terms/vernacularName",
+          "http://purl.org/dc/terms/language"
+        ]],
+        file_name: "vernacular_names.txt",
+        row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
+      }
+      @extensions << vernacular_ext
+      @vernaculars.each do |v|
+        vernacular_ext[:data] << [v[:taxon_id], v[:vern], v[:lang]]
+      end
+      @eml = {
+        id: @uuid,
+        title: @title,
+        authors: [
+          { first_name: "Per", last_name: "Alstrom" },
+          { first_name: "Mike", last_name: "Blair" },
+          { first_name: "Rauri", last_name: "Bowie" },
+          { first_name: "Nigel", last_name: "Redman" },
+          { first_name: "Jon", last_name: "Fjeldsa" },
+          { first_name: "Phil", last_name: "Gregory" },
+          { first_name: "Leo", last_name: "Joseph" },
+          { first_name: "Peter", last_name: "Kovalik" },
+          { first_name: "Adolfo", last_name: "Navarro-Siguenza" },
+          { first_name: "David", last_name: "Parkin" },
+          { first_name: "Alan", last_name: "Peterson" },
+          { first_name: "Douglas", last_name: "Pratt" },
+          { first_name: "Pam", last_name: "Rasmussen" },
+          { first_name: "Frank", last_name: "Rheindt" },
+          { first_name: "Robert", last_name: "Ridgely" },
+          { first_name: "Peter", last_name: "Ryan" },
+          { first_name: "George", last_name: "Sangster" },
+          { first_name: "Dick", last_name: "Schodde" },
+          { first_name: "Minturn", last_name: "Wright" }
+        ],
+        metadata_providers: [
+          { first_name: "Dmitry",
+            last_name: "Mozzherin",
+            email: "dmozzherin@gmail.com" }
+        ],
+        abstract: "The IOC World Bird List is an open access resource of " \
+                  "the international community of ornithologists.",
+        url: "https://www.worldbirdnames.org"
+      }
+      super
+    end
+  end
+end