RubyGems - dwca_hunter - Versions diffs - 0.5.2 → 0.7.1 - Mend

dwca_hunter 0.5.2 → 0.7.1

Files changed (40) hide show

checksums.yaml +4 -4
data/.byebug_history +37 -0
data/.gitignore +5 -0
data/.rubocop.yml +3 -2
data/.ruby-version +1 -1
data/Gemfile.lock +59 -135
data/LICENSE.txt +1 -1
data/README.md +1 -1
data/dwca_hunter.gemspec +7 -8
data/exe/dwcahunter +1 -3
data/lib/dwca_hunter.rb +39 -8
data/lib/dwca_hunter/resource.rb +5 -0
data/lib/dwca_hunter/resources/aos-birds.rb +143 -0
data/lib/dwca_hunter/resources/arctos.rb +121 -145
data/lib/dwca_hunter/resources/clements.rb +151 -0
data/lib/dwca_hunter/resources/eol.rb +85 -0
data/lib/dwca_hunter/resources/freebase.rb +51 -49
data/lib/dwca_hunter/resources/how-moore-birds.rb +168 -0
data/lib/dwca_hunter/resources/index-fungorum.rb +131 -0
data/lib/dwca_hunter/resources/ioc_word_bird.rb +200 -0
data/lib/dwca_hunter/resources/ion.rb +98 -0
data/lib/dwca_hunter/resources/ipni.rb +3 -2
data/lib/dwca_hunter/resources/itis.rb +99 -99
data/lib/dwca_hunter/resources/mammal_divdb.rb +155 -0
data/lib/dwca_hunter/resources/mammal_species.rb +9 -6
data/lib/dwca_hunter/resources/mcz.rb +123 -0
data/lib/dwca_hunter/resources/ncbi.rb +22 -23
data/lib/dwca_hunter/resources/opentree.rb +5 -5
data/lib/dwca_hunter/resources/paleobiodb.rb +193 -0
data/lib/dwca_hunter/resources/paleodb_harvester.rb +140 -0
data/lib/dwca_hunter/resources/sherborn.rb +91 -0
data/lib/dwca_hunter/resources/wikispecies.rb +142 -129
data/lib/dwca_hunter/version.rb +1 -1
metadata +31 -40
data/files/birdlife_7.csv +0 -11862
data/files/fishbase_taxon_cache.tsv +0 -81000
data/files/reptile_checklist_2014_12.csv +0 -15158
data/files/species-black.txt +0 -251
data/ipni.csv.gz +0 -0
data/ipniWebName.csv.xz?dl=1 +0 -0

data/lib/dwca_hunter/resources/index-fungorum.rb ADDED

@@ -0,0 +1,131 @@
+# frozen_string_literal: true
+module DwcaHunter
+  class ResourceAOS < DwcaHunter::Resource
+    def initialize(opts = {})
+      @command = "index-fungorum"
+      @title = "Index Fungorum (Species Fungorum)"
+      @url = "https://uofi.box.com/shared/static/54l3b7h4q4pwqq4fgqvx42h3d328fl1c.csv"
+      @UUID = "af06816a-0b28-4a09-8219-bd1d63289858"
+      @download_path = File.join(Dir.tmpdir,
+                                 "dwca_hunter",
+                                 "index-fungorum",
+                                 "data.csv")
+      @synonyms = []
+      @names = []
+      @extensions = []
+      @synonyms_hash = {}
+      super(opts)
+    end
+    def download
+      puts "Downloading csv from remote"
+      `curl -s -L #{@url} -o #{@download_path}`
+    end
+    def unpack; end
+    def make_dwca
+      DwcaHunter.logger_write(object_id, "Extracting data")
+      get_names
+      generate_dwca
+    end
+    private
+    def get_names
+      Dir.chdir(@download_dir)
+      collect_names
+    end
+    def collect_names
+      @names_index = {}
+      file = CSV.open(File.join(@download_dir, "data.csv"),
+                      headers: true)
+      file.each_with_index do |row, _i|
+        taxon_id = row["RECORD NUMBER"]
+        current_id = row["CURRENT NAME RECORD NUMBER"]
+        name_string = row["NAME OF FUNGUS"]
+        authors = row["AUTHORS"]
+        year = row["YEAR OF PUBLICATION"]
+        kingdom = row["Kingdom name"]
+        phylum = row["Phylum name"]
+        sub_phylum = row["Subphylum name"]
+        klass = row["Class name"]
+        subklass = row["Subclass name"]
+        order = row["Order name"]
+        family = row["Family name"]
+        code = "ICN"
+        @names << {
+          taxon_id: taxon_id,
+          name_string: "#{name_string} #{authors} #{year}",
+          current_id: current_id,
+          kingdom: kingdom,
+          phylum: phylum,
+          klass: klass,
+          order: order,
+          family: family,
+          code: code
+        }
+      end
+    end
+    def generate_dwca
+      DwcaHunter.logger_write(object_id,
+                              "Creating DarwinCore Archive file")
+      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
+                "http://rs.tdwg.org/dwc/terms/scientificName",
+                "http://rs.tdwg.org/dwc/terms/acceptedNameUsageID",
+                "http://rs.tdwg.org/dwc/terms/kingdom",
+                "http://rs.tdwg.org/dwc/terms/phylum",
+                "http://rs.tdwg.org/dwc/terms/class",
+                "http://rs.tdwg.org/dwc/terms/order",
+                "http://rs.tdwg.org/dwc/terms/family",
+                "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
+      @names.each do |n|
+        @core << [n[:taxon_id], n[:name_string], n[:current_id],
+                  n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
+                  n[:code]]
+      end
+      @eml = {
+        id: @uuid,
+        title: @title,
+        authors: [
+          { first_name: "Paul",
+            last_name: "Kirk" }
+        ],
+        metadata_providers: [
+          { first_name: "Dmitry",
+            last_name: "Mozzherin",
+            email: "dmozzherin@gmail.com" }
+        ],
+        abstract: "The Index Fungorum, the global fungal nomenclator " \
+          "coordinated and supported by the Index Fungorum Partnership, " \
+          "contains names of fungi (including yeasts, lichens, chromistan " \
+          "fungal analogues, protozoan fungal analogues and fossil forms) " \
+          "at all ranks.\n\n" \
+          "As a result of changes to the ICN (previously ICBN) relating to " \
+          "registration of names and following the lead taken by MycoBank, " \
+          "Index Fungorum now provides a mechanism to register names of " \
+          "new taxa, new names, new combinations and new typifications — no " \
+          "login is required. Names registered at Index Fungorum can be " \
+          "published immediately through the Index Fungorum e-Publication " \
+          "facility — an authorized login is required for this.\n\n" \
+          "Species Fungorum is currently an RBG Kew coordinated initiative " \
+          "to compile a global checklist of the fungi. You may search " \
+          "systematically defined and taxonomically complete datasets - " \
+          "global species databases - or the entire Species Fungorum. " \
+          "Species Fungorum contributes the fungal component to the Species " \
+          "2000 project and, in partnership with ITIS, to the Catalogue " \
+          "of Life (currently used in the GBIF and EoL portal); for more " \
+          "information regarding these global initiative visit their " \
+          "websites. Please contact Paul Kirk if you you would like to " \
+          "contribute to Species Fungorum.",
+        url: @url
+      }
+      super
+    end
+  end
+end

data/lib/dwca_hunter/resources/ioc_word_bird.rb ADDED

@@ -0,0 +1,200 @@
+# frozen_string_literal: true
+module DwcaHunter
+  class ResourceIOCWorldBird < DwcaHunter::Resource
+    def initialize(opts = {})
+      @command = "ioc-world-bird"
+      @title = "IOC World Bird List"
+      @url = "https://uofi.box.com/shared/static/fbpuk5ghh9083nbzjdeyqtoqsvdnro01.csv"
+      @UUID = "6421ffec-38e3-40fb-a6d9-af27238a47a1"
+      @download_path = File.join(Dir.tmpdir,
+                                 "dwca_hunter",
+                                 "ioc-bird",
+                                 "data.csv")
+      @synonyms = []
+      @names = []
+      @vernaculars = []
+      @extensions = []
+      @synonyms_hash = {}
+      @vernaculars_hash = {}
+      super(opts)
+    end
+    def download
+      puts "Downloading cached and converted to csv version."
+      puts "CHECK FOR NEW VERSION at"
+      puts "https://www.worldbirdnames.org/ioc-lists/master-list-2/"
+      puts "Use libreoffice to convert to csv."
+      `curl -s -L #{@url} -o #{@download_path}`
+    end
+    def unpack; end
+    def make_dwca
+      DwcaHunter.logger_write(object_id, "Extracting data")
+      get_names
+      generate_dwca
+    end
+    private
+    def get_names
+      Dir.chdir(@download_dir)
+      collect_names
+    end
+    def collect_names
+      @names_index = {}
+      file = CSV.open(File.join(@download_dir, "data.csv"),
+                      headers: true)
+      order = ""
+      family = ""
+      genus = ""
+      species = ""
+      count = 0
+      file.each do |row|
+        order1 = row["Order"]
+        order = order1.capitalize if order1.to_s != ""
+        family1 = row["Family (Scientific)"]
+        family = family1.capitalize if family1.to_s != ""
+        genus1 = row["Genus"]
+        genus = genus1.capitalize if genus1.to_s != ""
+        species1 = row["Species (Scientific)"]
+        species = species1 if species1.to_s != ""
+        subspecies = row["Subspecies"]
+        next if species.to_s == ""
+        count += 1
+        taxon_id = "gn_#{count}"
+        name = {
+          taxon_id: taxon_id,
+          kingdom: "Animalia",
+          phylum: "Chordata",
+          klass: "Aves",
+          order: order,
+          family: family,
+          genus: genus,
+          code: "ICZN"
+        }
+        if subspecies.to_s == ""
+          auth = row["Authority"].to_s
+          auth = DwcaHunter.normalize_authors(auth) if auth != ""
+          name[:name_string] = clean(
+            "#{genus} #{species} #{auth}".
+            strip
+          )
+          @names << name
+          vernacular = row["Species (English)"]
+          if vernacular.to_s != ""
+            vernaclar = { taxon_id: taxon_id, vern: vernacular, lang: "en" }
+            @vernaculars << vernaclar
+          end
+          species = ""
+        else
+          name[:name_string] = clean(
+            "#{genus} #{species} #{subspecies} #{row['Authority']}".
+            strip
+          )
+          @names << name
+          species = ""
+          subspecies = ""
+        end
+      end
+    end
+    def clean(n)
+      n = n.gsub(/†/, "")
+      n.gsub(/\s+/, " ")
+    end
+    def generate_dwca
+      DwcaHunter.logger_write(object_id,
+                              "Creating DarwinCore Archive file")
+      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
+                "http://rs.tdwg.org/dwc/terms/scientificName",
+                "http://rs.tdwg.org/dwc/terms/kingdom",
+                "http://rs.tdwg.org/dwc/terms/phylum",
+                "http://rs.tdwg.org/dwc/terms/class",
+                "http://rs.tdwg.org/dwc/terms/order",
+                "http://rs.tdwg.org/dwc/terms/family",
+                "http://rs.tdwg.org/dwc/terms/genus",
+                "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
+      @names.each do |n|
+        @core << [n[:taxon_id], n[:name_string],
+                  n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
+                  n[:genus], n[:code]]
+      end
+      @extensions << {
+        data: [[
+          "http://rs.tdwg.org/dwc/terms/taxonID",
+          "http://rs.tdwg.org/dwc/terms/vernacularName",
+          "http://purl.org/dc/terms/language"
+        ]],
+        file_name: "vernacular_names.txt",
+        row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
+      }
+      @vernaculars.each do |v|
+        @extensions[-1][:data] << [v[:taxon_id], v[:vern], v[:lang]]
+      end
+      @eml = {
+        id: @uuid,
+        title: @title,
+        authors: [
+          { first_name: "Per",
+            last_name: "Alstrom" },
+          { first_name: "Mike",
+            last_name: "Blair" },
+          { first_name: "Rauri",
+            last_name: "Bowie" },
+          { first_name: "Nigel",
+            last_name: "Redman" },
+          { first_name: "Jon",
+            last_name: "Fjeldsa" },
+          { first_name: "Phil",
+            last_name: "Gregory" },
+          { first_name: "Leo",
+            last_name: "Joseph" },
+          { first_name: "Peter",
+            last_name: "Kovalik" },
+          { first_name: "Adolfo",
+            last_name: "Navarro-Siguenza" },
+          { first_name: "David",
+            last_name: "Parkin" },
+          { first_name: "Alan",
+            last_name: "Peterson" },
+          { first_name: "Douglas",
+            last_name: "Pratt" },
+          { first_name: "Pam",
+            last_name: "Rasmussen" },
+          { first_name: "Frank",
+            last_name: "Rheindt" },
+          { first_name: "Robert",
+            last_name: "Ridgely" },
+          { first_name: "Peter",
+            last_name: "Ryan" },
+          { first_name: "George",
+            last_name: "Sangster" },
+          { first_name: "Dick",
+            last_name: "Schodde" },
+          { first_name: "Minturn",
+            last_name: "Wright" }
+        ],
+        metadata_providers: [
+          { first_name: "Dmitry",
+            last_name: "Mozzherin",
+            email: "dmozzherin@gmail.com" }
+        ],
+        abstract: "The IOC World Bird List is an open access resource of " \
+                  "the international community of ornithologists.",
+        url: "https://www.worldbirdnames.org"
+      }
+      super
+    end
+  end
+end

data/lib/dwca_hunter/resources/ion.rb ADDED

@@ -0,0 +1,98 @@
+# frozen_string_literal: true
+module DwcaHunter
+  class ResourceION < DwcaHunter::Resource
+    def initialize(opts = {})
+      @command = "ion"
+      @title = "Index to Organism Names"
+      @url = "https://uofi.box.com/shared/static/tklh8i6q2kb33g6ki33k6s3is06lo9np.gz"
+      @UUID = "1137dfa3-5b8c-487d-b497-dc0938605864"
+      @download_path = File.join(Dir.tmpdir,
+                                 "dwca_hunter",
+                                 "ion",
+                                 "data.tar.gz")
+      @names = []
+      @extensions = []
+      super(opts)
+    end
+    def download
+      puts "Downloading cached verion of the file. Ask Rod Page to make new."
+      `curl -s -L #{@url} -o #{@download_path}`
+    end
+    def unpack
+      unpack_tar
+    end
+    def make_dwca
+      DwcaHunter.logger_write(object_id, "Extracting data")
+      get_names
+      generate_dwca
+    end
+    private
+    def get_names
+      Dir.chdir(@download_dir)
+      collect_names
+    end
+    def collect_names
+      file = CSV.open(File.join(@download_dir, "ion.tsv"),
+                      headers: true, col_sep: "\t", quote_char: "щ")
+      file.each_with_index do |row, i|
+        id = row["id"]
+        name_string = row["nameComplete"]
+        auth = row["taxonAuthor"]
+        @names << { taxon_id: id,
+                    name_string: name_string,
+                    auth: auth }
+        puts "Processed %s names" % i if i % 10_000 == 0
+      end
+    end
+    def generate_dwca
+      DwcaHunter.logger_write(object_id,
+                              "Creating DarwinCore Archive file")
+      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
+                "http://rs.tdwg.org/dwc/terms/scientificName",
+                "http://rs.tdwg.org/dwc/terms/scientificNameAuthorship"]]
+      @names.each do |n|
+        @core << [n[:taxon_id], n[:name_string], n[:auth]]
+      end
+      @eml = {
+        id: @uuid,
+        title: @title,
+        authors: [
+          { first_name: "Nigel",
+            last_name: "Robinson",
+            email: "nigel.robinson@thomsonreuters.com" }
+        ],
+        metadata_providers: [
+          { first_name: "Dmitry",
+            last_name: "Mozzherin",
+            email: "dmozzherin@gmail.com" }
+        ],
+        abstract: "ION contains millions of animal names, both fossil and " \
+          "recent, at all taxonomic ranks, reported from the scientific " \
+          "literature. (Bacteria, plant and virus names will be added soon)." \
+          "\n\n" \
+          "These names are derived from premier Clarivate databases: " \
+          "Zoological Record®, BIOSIS Previews®, and Biological Abstracts®. " \
+          "All names are tied to at least one published article. Together, " \
+          "these resources cover every aspect of the life sciences - " \
+          "providing names from over 30 million scientific records, " \
+          "including approximately ,000 international journals, patents, " \
+          "books, and conference proceedings. They provide a powerful " \
+          "foundation for the most complete collection of organism names " \
+          "available today.",
+        url: @url
+      }
+      super
+    end
+  end
+end

data/lib/dwca_hunter/resources/ipni.rb CHANGED

@@ -8,7 +8,7 @@ module DwcaHunter
       @command = "ipni"
       @title = "The International Plant Names Index"
       @abbr = "IPNI"
-      @url = "https://www.dropbox.com/s/1n0sn80vkdir5nu/ipniWebName.csv.xz"
+      @url = "https://uofi.box.com/shared/static/s0x4xjonxt54pi89n543gdmttrdqd6iv.xz"
       @uuid = "6b3905ce-5025-49f3-9697-ddd5bdfb4ff0"
       @download_path = File.join(Dir.tmpdir, "dwca_hunter", "ipni",
                                  "ipni.csv.xz")
@@ -22,8 +22,9 @@ module DwcaHunter
     end
     def download
-      puts "Downloading cached verion of the file. Get daily updated one from"
+      puts "Download by hand from"
       puts "https://storage.cloud.google.com/ipni-data/ipniWebName.csv.xz"
+      puts "and copy to given url"
         `curl -s -L #{@url} -o #{@download_path}`
     end