RubyGems - dwca_hunter - Versions diffs - 0.5.0 - Mend

dwca_hunter 0.5.0

Files changed (38) hide show

checksums.yaml +7 -0
data/.byebug_history +31 -0
data/.document +5 -0
data/.gitignore +58 -0
data/.rspec +3 -0
data/.rubocop.yml +33 -0
data/.ruby-version +1 -0
data/CHANGELOG.md +15 -0
data/Gemfile +3 -0
data/Gemfile.lock +133 -0
data/LICENSE.txt +20 -0
data/README.md +39 -0
data/Rakefile +11 -0
data/dwca_hunter.gemspec +42 -0
data/exe/dwcahunter +77 -0
data/files/birdlife_7.csv +11862 -0
data/files/fishbase_taxon_cache.tsv +81000 -0
data/files/reptile_checklist_2014_12.csv +15158 -0
data/lib/dwca_hunter/downloader.rb +60 -0
data/lib/dwca_hunter/encoding.rb +17 -0
data/lib/dwca_hunter/resource.rb +101 -0
data/lib/dwca_hunter/resources/arctos.rb +222 -0
data/lib/dwca_hunter/resources/birdlife.rb +160 -0
data/lib/dwca_hunter/resources/fishbase.rb +99 -0
data/lib/dwca_hunter/resources/freebase.rb +152 -0
data/lib/dwca_hunter/resources/gnub.rb +101 -0
data/lib/dwca_hunter/resources/itis.rb +271 -0
data/lib/dwca_hunter/resources/mammal_species.rb +179 -0
data/lib/dwca_hunter/resources/ncbi.rb +174 -0
data/lib/dwca_hunter/resources/opentree.rb +121 -0
data/lib/dwca_hunter/resources/reptiles_checklist.rb +139 -0
data/lib/dwca_hunter/resources/wikispecies.rb +350 -0
data/lib/dwca_hunter/resources/worms.rb +176 -0
data/lib/dwca_hunter/url.rb +33 -0
data/lib/dwca_hunter/version.rb +7 -0
data/lib/dwca_hunter/xml.rb +33 -0
data/lib/dwca_hunter.rb +53 -0
metadata +250 -0

data/lib/dwca_hunter/resources/ncbi.rb ADDED Viewed

@@ -0,0 +1,174 @@
+# encoding: utf-8
+module DwcaHunter
+  class ResourceNCBI < DwcaHunter::Resource
+    def initialize(opts = {})
+      @command = 'ncbi'
+      @title = 'NCBI'
+      @url = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'
+      @uuid =  '97d7633b-5f79-4307-a397-3c29402d9311'
+      @download_path = File.join(Dir.tmpdir,
+                                 'dwca_hunter',
+                                 'ncbi',
+                                 'data.tar.gz')
+      @names = {}
+      @data = []
+      @collected_names = ['genbank common name', 'common name', 'valid']
+      @core = []
+      @extensions = []
+      super
+    end
+    def unpack
+      unpack_tar
+    end
+    def make_dwca
+      set_vars
+      get_names
+      get_classification
+      generate_dwca
+    end
+    private
+    def set_vars
+      @names_file = File.join(@download_dir, 'names.dmp')
+      @nodes_file = File.join(@download_dir, 'nodes.dmp')
+    end
+    def get_names
+      DwcaHunter::logger_write(object_id, 'Collecting names...')
+      open(@names_file).each_with_index do |line, i|
+        if i > 0 && i % BATCH_SIZE == 0
+          DwcaHunter::logger_write(object_id, 'Collected %s names...' % i)
+        end
+        line = line.split("|").map {|l| cleanup(l)}
+        id = line[0]
+        next if id == 1
+        name = line[1]
+        name_type = line[3]
+        name_type = 'valid' if name_type == 'scientific name'
+        begin
+          name = name.gsub(/(^|\s)('|")(.*?)\2(\s|-|$)/, '\1\3\5').
+                      gsub(/\s+/, ' ')
+        rescue NoMethodError
+          puts "wrong name: %s" % name
+          next
+        end
+        @names[id] = {} unless @names[id]
+        @names[id][name_type] ?
+          (@names[id][name_type] << name) :
+          (@names[id][name_type] = [name])
+      end
+    end
+    def get_classification
+      DwcaHunter.logger_write(object_id, "Building classification...")
+      open(@nodes_file, "r:utf-8").each_with_index do |line, i|
+        if i > 0 && i % BATCH_SIZE == 0
+          DwcaHunter.logger_write(object_id, "Collected %s nodes..." % i)
+        end
+        line = line.split('|').map {|l| cleanup(l)}
+        id = line[0]
+        next if id == 1
+        parent_tax_id = line[1]
+        rank = line[2]
+        hidden_flag = line[10]
+        comments = line[12]
+        rank = "" if rank == "no rank"
+        parent_tax_id = nil if parent_tax_id == 1
+        next unless @names[id] && @names[id]["valid"]
+        vernacular_names = []
+        synonyms = []
+        @names[id].keys.each do |k|
+          if @collected_names.include? k
+            vernacular_names += @names[id][k] if k != "valid"
+          else
+            synonyms << { scientificName: @names[id][k],
+                          taxonomicStatus: k }
+          end
+        end
+        @data << {
+          id: id,
+          scientificName: @names[id]["valid"][0],
+          parentNameUsageId: parent_tax_id,
+          taxonRank: rank,
+          taxonomicStatus: "valid",
+          vernacularNames: vernacular_names,
+          synonyms: []
+        }
+        @names[id].keys.each do |k|
+        end
+      end
+    end
+    def generate_dwca
+      DwcaHunter.logger_write(object_id, "Creating DarwinCore Archive file")
+      @core = [["http://rs.tdwg.org/dwc/terms/taxonId",
+                "http://purl.org/dc/terms/scientificName",
+                "http://purl.org/dc/terms/parentNameUsageId",
+                "http://purl.org/dc/terms/taxonRank"]]
+      DwcaHunter.logger_write(object_id, "Assembling Core Data")
+      count = 0
+      @data.map do |d|
+        count += 1
+        if (count % BATCH_SIZE).zero?
+          DwcaHunter.logger_write(object_id, "Traversing #{count} core " \
+                                  "data record" % count)
+        end
+        @core << [d[:id],
+                  d[:scientificName],
+                  d[:parentNameUsageId],
+                  d[:taxonRank]]
+      end
+      @extensions << {
+        data: [["http://rs.tdwg.org/dwc/terms/TaxonID",
+                "http://rs.tdwg.org/dwc/terms/vernacularName"]],
+        file_name: "vernacular_names.txt"
+      }
+      @extensions << { data: [[
+        "http://rs.tdwg.org/dwc/terms/taxonId",
+        "http://rs.tdwg.org/dwc/terms/scientificName",
+        "http://rs.tdwg.org/dwc/terms/taxonomicStatus"
+      ]],
+                       file_name: "synonyms.txt" }
+      DwcaHunter.logger_write(object_id, "Creating verncaular name " \
+                              "extension for DarwinCore Archive file")
+      count = 0
+      @data.each do |d|
+        count += 1
+        if (count % BATCH_SIZE).zero?
+          DwcaHunter.logger_write(object_id,
+                                  "Traversing #{count} extension data record")
+        end
+        d[:vernacularNames].each do |vn|
+          @extensions[0][:data] << [d[:id], vn]
+        end
+        d[:synonyms].each do |synonym|
+          @extensions[1][:data] << [d[:id],
+                                    synonym[:scientificName],
+                                    synonym[:taxonomicStatus]]
+        end
+      end
+      @eml = {
+        id: @uuid,
+        title: @title,
+        authors: [{ url: "http://www.ncbi.org" }],
+        abstract: "The National Center for Biotechnology Information " \
+                  "advances science and health by providing access to " \
+                  "biomedical and genomic information.",
+        metadata_providers: [
+          { first_name: "mitry",
+            last_name: "Mozzherin",
+            email: "dmozzherin@mbl.edu" }
+        ],
+        url: @url
+      }
+      super
+    end
+  end
+end

data/lib/dwca_hunter/resources/opentree.rb ADDED Viewed

@@ -0,0 +1,121 @@
+# frozen_string_literal: true
+module DwcaHunter
+  # Harvesting resource for Open Tree of Life
+  class ResourceOpenTree < DwcaHunter::Resource
+    def initialize(opts = {})
+      @command = "open-tree"
+      @title = "Open Tree of Life Reference Taxonomy"
+      @uuid = "e10865e2-cdd9-4f97-912f-08f3d5ef49f7"
+      @data = []
+      @extensions = []
+      @count = 1
+      @clades = {}
+      @core = [["http://rs.tdwg.org/dwc/terms/taxonId",
+                "http://globalnames.org/terms/localID",
+                "http://purl.org/dc/terms/scientificName",
+                "http://purl.org/dc/terms/parentNameUsageId",
+                "http://purl.org/dc/terms/taxonRank",
+                "http://globalnames.org/ottCrossMaps",
+                "http://globalnames.org/ottNotes"]]
+      @eml = {
+        id: @uuid,
+        title: @title,
+        authors: [{ url: "https://tree.opentreeoflife.org" }],
+        abstract: "Open Tree of Life aims to construct a comprehensive, " \
+                  "dynamic and digitally-available tree of life by " \
+                  "synthesizing published phylogenetic trees along with" \
+                  "taxonomic data. The project is a collaborative effort" \
+                  "between 11 PIs across 10 institutions.",
+        metadata_providers: [
+          { first_name: "Dmitry",
+            last_name: "Mozzherin",
+            email: "dmozzherin@gmail.com" }
+        ],
+        url: @url
+      }
+      @url = "http://opendata.globalnames.org/id-crossmap/ott3.0.tgz"
+      @download_path = File.join(Dir.tmpdir, "dwca_hunter",
+                                 "opentree", "data.tar.gz")
+      super
+    end
+    def unpack
+      unpack_tar if @needs_unpack
+    end
+    def make_dwca
+      DwcaHunter.logger_write(object_id, "Extracting data")
+      collect_data
+      generate_dwca
+    end
+    def download
+      return unless @needs_download
+      DwcaHunter.logger_write(object_id, "Downloading file -- "\
+                               "it will take some time...")
+      dlr = DwcaHunter::Downloader.new(url, @download_path)
+      dlr.download
+    end
+    private
+    def collect_data
+      set_vars
+      classification
+    end
+    def set_vars
+      @taxonomy = File.join(@download_dir, "ott", "taxonomy.tsv")
+      @synonyms = File.join(@download_dir, "ott", "synonyms.tsv")
+    end
+    def classification
+      @classification = []
+      @names = {}
+      DwcaHunter.logger_write(object_id, "Building classification")
+      open(@taxonomy).each_with_index do |line, i|
+        if ((i + 1) % BATCH_SIZE).zero?
+          DwcaHunter.logger_write(object_id,
+                                  "Traversed #{i + 1} taxonomy lines")
+        end
+        @classification << line.split("|").map(&:strip)
+      end
+    end
+    def generate_dwca
+      DwcaHunter.logger_write(object_id, "Creating DarwinCore Archive file")
+      DwcaHunter.logger_write(object_id, "Assembling Core Data")
+      generate_core
+      generate_synonyms
+      super
+    end
+    def generate_core
+      @classification.each do |d|
+        if (@count % BATCH_SIZE).zero?
+          DwcaHunter.logger_write(object_id, "Traversing #{@count} core " \
+                                  "data record")
+        end
+        @core << [d[0], d[0], d[2], d[1], d[3], d[4], d[5]]
+      end
+    end
+    def synonyms
+      []
+    end
+    def generate_synonyms
+      @extensions <<
+        { data: [["http://rs.tdwg.org/dwc/terms/taxonId",
+                  "http://rs.tdwg.org/dwc/terms/scientificName",
+                  "http://rs.tdwg.org/dwc/terms/taxonomicStatus"]],
+          file_name: "synonyms.txt" }
+      synonyms.each do |synonym|
+        @extensions.first[:data] << [d[:id], synonym[:scientificName],
+                                     synonym[:taxonomicStatus]]
+      end
+    end
+  end
+end

data/lib/dwca_hunter/resources/reptiles_checklist.rb ADDED Viewed

@@ -0,0 +1,139 @@
+# encoding: utf-8
+require 'biodiversity'
+require 'csv'
+module DwcaHunter
+  class ResourceReptilesChecklist < DwcaHunter::Resource
+    def initialize(opts = {})
+      @command = "reptile-database"
+      @title = "The Reptile Database"
+      @uuid = "c24e0905-4980-4e1d-aff2-ee0ef54ea1f8"
+      @data = []
+      @extensions = []
+      @download_path = File.join(Dir.tmpdir, 'dwca_hunter',
+                                 'reptilesdb', 'fake.tar.gz')
+      super
+    end
+    def needs_unpack?
+      false
+    end
+    def download
+    end
+    def make_dwca
+      organize_data
+      generate_dwca
+    end
+    private
+    def organize_data
+      DwcaHunter::logger_write(self.object_id,
+                               "Organizing data")
+      path = File.join(__dir__, "..",
+                       "..", "files", "reptile_checklist_2014_12.csv")
+      snp = ScientificNameParser.new
+      @data = CSV.open(path).each_with_object([]) do |row, data|
+        res = {}
+        name = row[0..1].join(" ")
+        res[:species] = snp.parse(name)[:scientificName][:normalized]
+        res[:subspecies] = []
+        if row[2]
+          row[2].split("\n").each do |ssp|
+            res[:subspecies] << snp.parse(ssp)[:scientificName][:normalized]
+          end
+        end
+        res[:vernaculars] = []
+        if row[3]
+          row[3].split("\n").each do |v|
+            lang = "en"
+            v.gsub!(/^E: /, '')
+            v.gsub!(/^G: /) do |m|
+              lang = "de" if m
+              ""
+            end
+            v.split(",").each do |name|
+              res[:vernaculars] << { name: name.strip, lang: lang }
+            end
+          end
+        end
+        if row[4]
+          res[:family] = row[4].match(/^[A-Za-z]+/)[0]
+        end
+        data << res
+      end
+    end
+    def generate_dwca
+      DwcaHunter::logger_write(self.object_id,
+                               "Creating DarwinCore Archive file")
+      @core = [['http://rs.tdwg.org/dwc/terms/taxonID',
+                'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
+                'http://rs.tdwg.org/dwc/terms/scientificName',
+                'http://rs.tdwg.org/dwc/terms/taxonRank']]
+      @extensions << { data: [['http://rs.tdwg.org/dwc/terms/taxonID',
+                               'http://rs.tdwg.org/dwc/terms/vernacularName',
+                               'http://purl.org/dc/terms/language']],
+                               file_name: 'vernacular_names.txt',
+                               row_type: 'http://rs.gbif.org/terms/1.0/VernacularName'
+      }
+      families = {}
+      count = 1
+      class_id = count
+      @core << [count, nil, "Reptilia", "class"]
+      @data.each_with_index do |record|
+        count += 1
+        family_id = families[record[:family]]
+        unless family_id
+          count += 1
+          family_id = count
+          families[record[:family]] = family_id
+          @core << [family_id, class_id, record[:family], "family"]
+        end
+        count += 1
+        species_id = count
+        @core << [species_id, family_id, record[:species], "species"]
+        record[:vernaculars].each do |v|
+          @extensions[0][:data] << [species_id, v[:name], v[:lang]]
+        end
+        record[:subspecies].each do |ssp|
+          count += 1
+          row = [count, species_id, ssp, "subspecies"]
+          @core << row
+        end
+      end
+      @eml = {
+        id: @uuid,
+        title: @title,
+        authors: [
+          {
+            first_name: "Peter",
+            last_name: "Uetz",
+            email: "info@reptile-database_org"
+          },
+          {
+            first_name: "Jiri",
+            last_name: "Hosek",
+            email: "jiri.hosek@reptarium.cz"
+          }
+        ],
+        metadata_providers: [
+          { first_name: 'Dmitry',
+            last_name: 'Mozzherin',
+            email: 'dmozzherin@gmail.com' }
+        ],
+        abstract: "This database provides a catalogue of all living reptile "\
+        "species and their classification. The database covers "\
+        "all living snakes, lizards, turtles, amphisbaenians, "\
+        "tuataras, and crocodiles. Currently there are about "\
+        "9,500 species including another 2,800 subspecies "\
+        "(statistics). The database focuses on taxonomic data, "\
+        "i.e. names and synonyms, distribution and type data "\
+        "and literature references.",
+        url: "http://www.reptile-database.org"
+      }
+      super
+    end
+  end
+end