RubyGems - dwca_hunter - Versions diffs - 0.5.5 → 0.7.0 - Mend

dwca_hunter 0.5.5 → 0.7.0

Files changed (32) hide show

checksums.yaml +4 -4
data/.byebug_history +37 -0
data/.gitignore +5 -0
data/.rubocop.yml +3 -2
data/.ruby-version +1 -1
data/Gemfile.lock +50 -77
data/LICENSE.txt +1 -1
data/README.md +1 -1
data/dwca_hunter.gemspec +7 -8
data/exe/dwcahunter +1 -3
data/lib/dwca_hunter.rb +31 -0
data/lib/dwca_hunter/resources/aos-birds.rb +143 -0
data/lib/dwca_hunter/resources/arctos.rb +93 -91
data/lib/dwca_hunter/resources/clements.rb +151 -0
data/lib/dwca_hunter/resources/freebase.rb +51 -49
data/lib/dwca_hunter/resources/how-moore-birds.rb +168 -0
data/lib/dwca_hunter/resources/ioc_word_bird.rb +200 -0
data/lib/dwca_hunter/resources/ipni.rb +3 -2
data/lib/dwca_hunter/resources/itis.rb +99 -99
data/lib/dwca_hunter/resources/mammal_divdb.rb +155 -0
data/lib/dwca_hunter/resources/mammal_species.rb +3 -3
data/lib/dwca_hunter/resources/mcz.rb +123 -0
data/lib/dwca_hunter/resources/ncbi.rb +22 -23
data/lib/dwca_hunter/resources/opentree.rb +5 -5
data/lib/dwca_hunter/resources/paleobiodb.rb +193 -0
data/lib/dwca_hunter/resources/paleodb_harvester.rb +140 -0
data/lib/dwca_hunter/resources/sherborn.rb +91 -0
data/lib/dwca_hunter/resources/wikispecies.rb +142 -127
data/lib/dwca_hunter/version.rb +1 -1
metadata +27 -34
data/ipni.csv.gz +0 -0
data/ipniWebName.csv.xz?dl=1 +0 -0

data/lib/dwca_hunter/resources/mammal_divdb.rb ADDED

@@ -0,0 +1,155 @@
+# frozen_string_literal: true
+module DwcaHunter
+  class ResourceMammalDiversityDb < DwcaHunter::Resource
+    def initialize(opts = {})
+      @command = "mammal-div-db"
+      @title = "ASM Mammal Diversity Database"
+      @url = "https://mammaldiversity.org/species-account/api.php?q=*"
+      @UUID = "94270cdd-5424-4bb1-8324-46ccc5386dc7"
+      @download_path = File.join(Dir.tmpdir,
+                                 "dwca_hunter",
+                                 "mammal-div-db",
+                                 "data.json")
+      @synonyms = []
+      @names = []
+      @vernaculars = []
+      @extensions = []
+      @synonyms_hash = {}
+      @vernaculars_hash = {}
+      super(opts)
+    end
+    def download
+      DwcaHunter.logger_write(object_id, "Downloading")
+      `curl '#{@url}' -H 'User-Agent:' -o #{@download_path}`
+    end
+    def unpack; end
+    def make_dwca
+      DwcaHunter.logger_write(object_id, "Extracting data")
+      get_names
+      generate_dwca
+    end
+    private
+    def get_names
+      Dir.chdir(@download_dir)
+      collect_names
+    end
+    def collect_names
+      @names_index = {}
+      decoder = HTMLEntities.new
+      data = File.read(File.join(@download_dir, "data.json"))
+      data = JSON.parse(data, symbolize_names: true)
+      data[:result].each_with_index do |e, _i|
+        e = e[1]
+        order = e[:dwc][:order].capitalize
+        order = nil if order.match(/incertae/)
+        family = e[:dwc][:family].capitalize
+        family = nil if family.match(/incertae/)
+        genus = e[:dwc][:genus].capitalize
+        genus = nil if genus.match(/incertae/)
+        name = {
+          taxon_id: e[:internal_id],
+          kingdom: "Animalia",
+          phylum: "Chordata",
+          klass: "Mammalia",
+          order: order,
+          family: family,
+          genus: genus,
+          name_string: "#{e[:dwc][:scientificName]} " \
+          "#{e[:dwc][:scientificNameAuthorship][:species]}".strip,
+          rank: e[:dwc][:taxonRank],
+          status: e[:dwc][:taxonRank],
+          code: "ICZN"
+        }
+        if e[:dwc][:taxonomicStatus] == "accepted"
+          @names << name
+        else
+          @synonyms << name
+        end
+        vern = e[:dwc][:vernacularName]
+        next unless vern.to_s != ""
+        vern = decoder.decode(vern)
+        vernacular = {
+          taxon_id: e[:id],
+          vern: vern,
+          lang: "en"
+        }
+        @vernaculars << vernacular
+      end
+      puts data[:result].size
+    end
+    def generate_dwca
+      DwcaHunter.logger_write(object_id,
+                              "Creating DarwinCore Archive file")
+      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
+                "http://rs.tdwg.org/dwc/terms/scientificName",
+                "http://rs.tdwg.org/dwc/terms/kingdom",
+                "http://rs.tdwg.org/dwc/terms/phylum",
+                "http://rs.tdwg.org/dwc/terms/class",
+                "http://rs.tdwg.org/dwc/terms/order",
+                "http://rs.tdwg.org/dwc/terms/family",
+                "http://rs.tdwg.org/dwc/terms/genus",
+                "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
+      @names.each do |n|
+        @core << [n[:taxon_id], n[:name_string],
+                  n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
+                  n[:genus], n[:code]]
+      end
+      @extensions << {
+        data: [[
+          "http://rs.tdwg.org/dwc/terms/taxonID",
+          "http://rs.tdwg.org/dwc/terms/vernacularName",
+          "http://purl.org/dc/terms/language"
+        ]],
+        file_name: "vernacular_names.txt",
+        row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
+      }
+      @vernaculars.each do |v|
+        @extensions[-1][:data] << [v[:taxon_id], v[:vern], v[:lang]]
+      end
+      @extensions << {
+        data: [[
+          "http://rs.tdwg.org/dwc/terms/taxonID",
+          "http://rs.tdwg.org/dwc/terms/scientificName",
+          "http://rs.tdwg.org/dwc/terms/taxonomicStatus"
+        ]],
+        file_name: "synonyms.txt"
+      }
+      @synonyms.each do |s|
+        @extensions[-1][:data] << [s[:taxon_id], s[:name_string], s[:status]]
+      end
+      @eml = {
+        id: @uuid,
+        title: @title,
+        authors: [
+          { first_name: "C. J.",
+            last_name: "Burgin" },
+          { first_name: "J. P.",
+            last_name: "Colella" },
+          { first_name: "P. L.",
+            last_name: "Kahn" },
+          { first_name: "N. S.",
+            last_name: "Upham" }
+        ],
+        metadata_providers: [
+          { first_name: "Dmitry",
+            last_name: "Mozzherin",
+            email: "dmozzherin@gmail.com" }
+        ],
+        abstract: "Mammal Diversity Database. 2020. www.mammaldiversity.org. " \
+        "American Society of Mammalogists. Accessed 2020-05-24 .",
+        url: @url
+      }
+      super
+    end
+  end
+end

data/lib/dwca_hunter/resources/mammal_species.rb CHANGED

@@ -5,7 +5,7 @@ module DwcaHunter
   # to DarwinCore Archive file
   class ResourceMammalSpecies < DwcaHunter::Resource
     def initialize(opts = {})
-      @parser = ScientificNameParser.new
+      @parser = Biodiversity::Parser
       @black_sp = black_species
       @command = "mammal-species"
       @title = "The Mammal Species of The World"
@@ -99,9 +99,9 @@ module DwcaHunter
     # rubocop:enable Metrics/AbcSize
     def real_name?(str)
-      parsed = @parser.parse(str)[:scientificName]
+      parsed = @parser.parse(str)
       return false unless parsed[:parsed]
-      epithets = parsed[:canonical].split(" ")[1..-1]
+      epithets = parsed[:canonicalName][:simple].split(" ")[1..-1]
       return false if epithets.nil? || epithets.empty?
       epithets.each do |e|
         return false if @black_sp[e]

data/lib/dwca_hunter/resources/mcz.rb ADDED

@@ -0,0 +1,123 @@
+# frozen_string_literal: true
+module DwcaHunter
+  class ResourceMCZ < DwcaHunter::Resource
+    def initialize(opts = {})
+      @command = "mcz"
+      @title = "MCZbase"
+      @url = "https://uofi.box.com/shared/static/x1dp86l48hyjkwfl106ejj25ormkzwip.gz"
+      @UUID = "c79d055b-211b-40de-8e27-618011656265"
+      @download_path = File.join(Dir.tmpdir,
+                                 "dwca_hunter",
+                                 "mcz",
+                                 "data.tar.gz")
+      @synonyms = []
+      @names = []
+      @vernaculars = []
+      @extensions = []
+      @synonyms_hash = {}
+      @vernaculars_hash = {}
+      super(opts)
+    end
+    def download
+      puts "Downloading cached verion of the file. Ask MCZ for update."
+      `curl -s -L #{@url} -o #{@download_path}`
+    end
+    def unpack
+      unpack_tar
+    end
+    def make_dwca
+      DwcaHunter.logger_write(object_id, "Extracting data")
+      get_names
+      generate_dwca
+    end
+    private
+    def get_names
+      Dir.chdir(@download_dir)
+      collect_names
+    end
+    def collect_names
+      @names_index = {}
+      file = CSV.open(File.join(@download_dir, "taxonomy_export_2020May26.csv"),
+                      headers: true)
+      file.each_with_index do |row, i|
+        canonical = row["SCIENTIFIC_NAME"]
+        authors = row["AUTHOR_TEXT"]
+        kingdom = row["KINGDOM"]
+        phylum = row["PHYLUM"]
+        klass = row["PHYLCLASS"]
+        order = row["PHYLORDER"]
+        family = row["FAMILY"]
+        genus = row["GENUS"]
+        code = row["NOMENCLATURAL_CODE"]
+        taxon_id = "gn_#{i + 1}"
+        name_string = "#{canonical} #{authors}".strip
+        @names << { taxon_id: taxon_id,
+                    name_string: name_string,
+                    kingdom: kingdom,
+                    phylum: phylum,
+                    klass: klass,
+                    order: order,
+                    family: family,
+                    genus: genus,
+                    code: code }
+        puts "Processed %s names" % i if i % 10_000 == 0
+      end
+    end
+    def generate_dwca
+      DwcaHunter.logger_write(object_id,
+                              "Creating DarwinCore Archive file")
+      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
+                "http://rs.tdwg.org/dwc/terms/scientificName",
+                "http://rs.tdwg.org/dwc/terms/kingdom",
+                "http://rs.tdwg.org/dwc/terms/phylum",
+                "http://rs.tdwg.org/dwc/terms/class",
+                "http://rs.tdwg.org/dwc/terms/order",
+                "http://rs.tdwg.org/dwc/terms/family",
+                "http://rs.tdwg.org/dwc/terms/genus",
+                "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
+      @names.each do |n|
+        @core << [n[:taxon_id], n[:name_string],
+                  n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
+                  n[:genus], n[:code]]
+      end
+      @eml = {
+        id: @uuid,
+        title: @title,
+        authors: [
+          { first_name: "MCZ",
+            last_name: "Harvard University" }
+        ],
+        metadata_providers: [
+          { first_name: "Paul",
+            last_name: "Morris" }
+        ],
+        abstract: "The Museum of Comparative Zoology was founded in 1859 on " \
+        "the concept that collections are an integral and fundamental " \
+        "component of zoological research and teaching. This more than " \
+        "150-year-old commitment remains a strong and proud tradition for " \
+        "the MCZ. The present-day MCZ contains over 21-million specimens in " \
+        "ten research collections which comprise one of the world's richest " \
+        "and most varied resources for studying the diversity of life. The " \
+        "museum serves as the primary repository for zoological specimens " \
+        "collected by past and present Harvard faculty-curators, staff and " \
+        "associates conducting research around the world. As a premier " \
+        "university museum and research institution, the specimens and " \
+        "their related data are available to researchers of the scientific " \
+        "and museum community. doi:10.5281/zenodo.891420",
+        url: @url
+      }
+      super
+    end
+  end
+end

data/lib/dwca_hunter/resources/ncbi.rb CHANGED

@@ -1,19 +1,19 @@
-# encoding: utf-8
+# frozen_string_literal: true
 module DwcaHunter
   class ResourceNCBI < DwcaHunter::Resource
     def initialize(opts = {})
-      @command = 'ncbi'
-      @title = 'NCBI'
-      @url = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'
-      @uuid =  '97d7633b-5f79-4307-a397-3c29402d9311'
+      @command = "ncbi"
+      @title = "NCBI"
+      @url = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"
+      @uuid = "97d7633b-5f79-4307-a397-3c29402d9311"
       @download_path = File.join(Dir.tmpdir,
-                                 'dwca_hunter',
-                                 'ncbi',
-                                 'data.tar.gz')
+                                 "dwca_hunter",
+                                 "ncbi",
+                                 "data.tar.gz")
       @names = {}
       @data = []
-      @collected_names = ['genbank common name', 'common name', 'valid']
+      @collected_names = ["genbank common name", "common name", "valid"]
       @core = []
       @extensions = []
       super
@@ -33,25 +33,24 @@ module DwcaHunter
     private
     def set_vars
-      @names_file = File.join(@download_dir, 'names.dmp')
-      @nodes_file = File.join(@download_dir, 'nodes.dmp')
+      @names_file = File.join(@download_dir, "names.dmp")
+      @nodes_file = File.join(@download_dir, "nodes.dmp")
     end
     def get_names
-      DwcaHunter::logger_write(object_id, 'Collecting names...')
+      DwcaHunter.logger_write(object_id, "Collecting names...")
       open(@names_file).each_with_index do |line, i|
-        if i > 0 && i % BATCH_SIZE == 0
-          DwcaHunter::logger_write(object_id, 'Collected %s names...' % i)
-        end
-        line = line.split("|").map {|l| cleanup(l)}
+        DwcaHunter.logger_write(object_id, "Collected %s names..." % i) if i > 0 && i % BATCH_SIZE == 0
+        line = line.split("|").map { |l| cleanup(l) }
         id = line[0]
         next if id == 1
         name = line[1]
         name_type = line[3]
-        name_type = 'valid' if name_type == 'scientific name'
+        name_type = "valid" if name_type == "scientific name"
         begin
           name = name.gsub(/(^|\s)('|")(.*?)\2(\s|-|$)/, '\1\3\5').
-                      gsub(/\s+/, ' ')
+                 gsub(/\s+/, " ")
         rescue NoMethodError
           puts "wrong name: %s" % name
           next
@@ -66,12 +65,11 @@ module DwcaHunter
     def get_classification
       DwcaHunter.logger_write(object_id, "Building classification...")
       open(@nodes_file, "r:utf-8").each_with_index do |line, i|
-        if i > 0 && i % BATCH_SIZE == 0
-          DwcaHunter.logger_write(object_id, "Collected %s nodes..." % i)
-        end
-        line = line.split('|').map {|l| cleanup(l)}
+        DwcaHunter.logger_write(object_id, "Collected %s nodes..." % i) if i > 0 && i % BATCH_SIZE == 0
+        line = line.split("|").map { |l| cleanup(l) }
         id = line[0]
         next if id == 1
         parent_tax_id = line[1]
         rank = line[2]
         hidden_flag = line[10]
@@ -80,6 +78,7 @@ module DwcaHunter
         rank = "" if rank == "no rank"
         parent_tax_id = nil if parent_tax_id == 1
         next unless @names[id] && @names[id]["valid"]
         vernacular_names = []
         synonyms = []
         @names[id].keys.each do |k|

data/lib/dwca_hunter/resources/opentree.rb CHANGED

@@ -34,7 +34,7 @@ module DwcaHunter
         ],
         url: @url
       }
-      @url = "http://opendata.globalnames.org/id-crossmap/ott3.0.tgz"
+      @url = "http://files.opentreeoflife.org/ott/ott3.2/ott3.2.tgz"
       @download_path = File.join(Dir.tmpdir, "dwca_hunter",
                                  "opentree", "data.tar.gz")
       super
@@ -51,11 +51,11 @@ module DwcaHunter
     end
     def download
+      puts "Downloading cached data, update it at oot website!!"
       return unless @needs_download
       DwcaHunter.logger_write(object_id, "Downloading file -- "\
                                "it will take some time...")
-      dlr = DwcaHunter::Downloader.new(url, @download_path)
-      dlr.download
+      `curl -L #{url} -o #{@download_path}`
     end
     private
@@ -66,8 +66,8 @@ module DwcaHunter
     end
     def set_vars
-      @taxonomy = File.join(@download_dir, "ott", "taxonomy.tsv")
-      @synonyms = File.join(@download_dir, "ott", "synonyms.tsv")
+      @taxonomy = File.join(@download_dir, "ott3.2", "taxonomy.tsv")
+      @synonyms = File.join(@download_dir, "ott3.2", "synonyms.tsv")
     end
     def classification

data/lib/dwca_hunter/resources/paleobiodb.rb ADDED

@@ -0,0 +1,193 @@
+# frozen_string_literal: true
+module DwcaHunter
+  class ResourcePaleoBioDb < DwcaHunter::Resource
+    OCCURANCE_URL = "http://paleobiodb.org/data1.2/occs/list.txt?" \
+                    "datainfo&rowcount&base_name=Life&taxon_reso=species&" \
+                    "idqual=certain&show=ecospace,loc,paleoloc,acconly"
+    TAXA_URL = "http://paleobiodb.org/data1.2/taxa/list.txt?datainfo&" \
+              "rowcount&base_name=Life&variant=all&" \
+              "show=attr,common,app,parent,ecospace,ref,refattr,entname"
+    REFS_URL = "http://paleobiodb.org/data1.2/taxa/refs.txt?datainfo&" \
+               "rowcount&base_name=Life&select=taxonomy"
+    TAXA_REFS_URL = "http://paleobiodb.org/data1.2/taxa/byref.txt?datainfo&" \
+                    "rowcount&base_name=Life&select=taxonomy"
+    URLS = {
+      occurences: OCCURANCE_URL,
+      taxa: TAXA_URL,
+      refs: REFS_URL,
+      taxa_refs: TAXA_REFS_URL
+    }.freeze
+    def initialize(opts = {})
+      # opts = {download: false}
+      @command = "paleodb"
+      @title = "The Paleobiology Database"
+      @UUID =  "fad9970e-c358-4e1b-8cc3-f9ad2582751f"
+      @download_path = File.join(Dir.tmpdir,
+                                 "dwca_hunter",
+                                 "paleobiodb", "fake.csv")
+      @synonyms = []
+      @names = []
+      @vernaculars = []
+      @extensions = []
+      @synonyms_hash = {}
+      @vernaculars_hash = {}
+      super(opts)
+    end
+    def download
+      puts "Downloading from original."
+      URLS.each do |k, v|
+        file_name = k.to_s + ".txt"
+        f = File.open(File.join(@download_dir, file_name), "w:utf-8")
+        puts "Getting #{k}"
+        data = RestClient::Request.execute(method: :get, url: v, timeout: 600)
+        f.write(data)
+        f.close
+      end
+      remove_header_text
+    end
+    def unpack; end
+    def make_dwca
+      DwcaHunter.logger_write(object_id, "Extracting data")
+      harvester = PaleodbHarvester.new(@download_dir)
+      harvester.taxa
+      harvester.refs
+      harvester.taxa_refs
+      harvester.occurences
+      @taxa_json = JSON.parse(File.read(
+                                File.join(@download_dir, "json", "taxa.json")
+                              ), symbolize_names: true)
+      @name_id_json = JSON.parse(File.read(
+                                   File.join(@download_dir, "json", "name_id.json")
+                                 ), symbolize_names: true)
+      get_names
+      generate_dwca
+    end
+    private
+    def remove_header_text
+      URLS.each do |k, _v|
+        file_name = k.to_s + ".csv"
+        fout = File.open(File.join(@download_dir, file_name),
+                         "w:utf-8")
+        csv_started = false
+        File.open(File.join(@download_dir, k.to_s + ".txt")).each do |l|
+          unless csv_started
+            csv_started = true if l =~ /"Records:"/
+            next
+          end
+          fout.write(l)
+        end
+      end
+    end
+    def get_names
+      sp, syn = species
+      sp.each_with_index do |r, i|
+        puts format("Processing %s species", i) if (i % 5000).zero?
+        append_accepted_species(r)
+      end
+      syn.each_with_index do |r, i|
+        puts format("Processing %s synonyms", i) if (i % 5000).zero?
+        append_synonyms(r)
+      end
+    end
+    def append_accepted_species(row)
+      c = classification({}, row)
+      name = {
+        id: row[:id],
+        acc_id: row[:id],
+        klass: c[:class],
+        order: c[:order],
+        family: c[:family],
+        genus: c[:genus],
+        name: row[:name],
+        auth: row[:auth]
+      }
+      @names << name
+    end
+    def append_synonyms(row)
+      id, acc_id = synonymId(row)
+      syn = {
+        id: id,
+        name: row[:name],
+        auth: row[:auth],
+        acc_id: acc_id
+      }
+      @names << syn
+    end
+    def synonymId(row)
+      acc_id = row[:acc_id]
+      id = row[:id]
+      acc_id = @name_id_json[row[:acc_name].to_sym][:id] if id == acc_id
+      [id, acc_id]
+    rescue StandardError
+      puts "Unable to get synonymId"
+    end
+    def classification(data, row)
+      data = {}
+      stack = [[data, row]]
+      until stack.empty?
+        data, row = stack.delete_at(0)
+        next unless @taxa_json[row[:parent_id].to_sym] && row[:parent_id] != row[:id]
+        row = @taxa_json[row[:parent_id].to_sym]
+        data[row[:rank].to_sym] = row[:name] unless data[row[:rank].to_sym]
+        stack << [data, row]
+      end
+      data
+    end
+    def species
+      @taxa_json.values.select { |v| (v[:rank] == "species") }.
+        partition do |v|
+        (v[:name] == v[:acc_name]) || v[:acc_id].nil?
+      end
+    end
+    def generate_dwca
+      DwcaHunter.logger_write(object_id,
+                              "Creating DarwinCore Archive file")
+      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
+                "http://rs.tdwg.org/dwc/terms/scientificName",
+                "http://rs.tdwg.org/dwc/terms/acceptedNameUsageID",
+                "http://rs.tdwg.org/dwc/terms/class",
+                "http://rs.tdwg.org/dwc/terms/order",
+                "http://rs.tdwg.org/dwc/terms/family",
+                "http://rs.tdwg.org/dwc/terms/genus",
+                "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
+      @names.each do |n|
+        name_string = "#{n[:name]} #{n[:auth]}".strip
+        @core << [n[:id], name_string, n[:acc_id],
+                  n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
+                  n[:genus], n[:code]]
+      end
+      @eml = {
+        id: @uuid,
+        title: @title,
+        authors: [
+          { email: "admin@paleobiodb.org" }
+        ],
+        metadata_providers: [
+          { first_name: "Dmitry",
+            last_name: "Mozzherin",
+            email: "dmozzherin@gmail.com" }
+        ],
+        abstract: "The Paleobiology Database (PBDB) is a non-governmental, non-profit public resource for paleontological data. It has been organized and operated by a multi-disciplinary, multi-institutional, international group of paleobiological researchers. Its purpose is to provide global, collection-based occurrence and taxonomic data for organisms of all geological ages, as well data services to allow easy access to data for independent development of analytical tools, visualization software, and applications of all types. The Database’s broader goal is to encourage and enable data-driven collaborative efforts that address large-scale paleobiological questions.",
+        url: @url
+      }
+      super
+    end
+  end
+end