RubyGems - dwca_hunter - Versions diffs - 0.5.0 - Mend

dwca_hunter 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

checksums.yaml +7 -0
data/.byebug_history +31 -0
data/.document +5 -0
data/.gitignore +58 -0
data/.rspec +3 -0
data/.rubocop.yml +33 -0
data/.ruby-version +1 -0
data/CHANGELOG.md +15 -0
data/Gemfile +3 -0
data/Gemfile.lock +133 -0
data/LICENSE.txt +20 -0
data/README.md +39 -0
data/Rakefile +11 -0
data/dwca_hunter.gemspec +42 -0
data/exe/dwcahunter +77 -0
data/files/birdlife_7.csv +11862 -0
data/files/fishbase_taxon_cache.tsv +81000 -0
data/files/reptile_checklist_2014_12.csv +15158 -0
data/lib/dwca_hunter/downloader.rb +60 -0
data/lib/dwca_hunter/encoding.rb +17 -0
data/lib/dwca_hunter/resource.rb +101 -0
data/lib/dwca_hunter/resources/arctos.rb +222 -0
data/lib/dwca_hunter/resources/birdlife.rb +160 -0
data/lib/dwca_hunter/resources/fishbase.rb +99 -0
data/lib/dwca_hunter/resources/freebase.rb +152 -0
data/lib/dwca_hunter/resources/gnub.rb +101 -0
data/lib/dwca_hunter/resources/itis.rb +271 -0
data/lib/dwca_hunter/resources/mammal_species.rb +179 -0
data/lib/dwca_hunter/resources/ncbi.rb +174 -0
data/lib/dwca_hunter/resources/opentree.rb +121 -0
data/lib/dwca_hunter/resources/reptiles_checklist.rb +139 -0
data/lib/dwca_hunter/resources/wikispecies.rb +350 -0
data/lib/dwca_hunter/resources/worms.rb +176 -0
data/lib/dwca_hunter/url.rb +33 -0
data/lib/dwca_hunter/version.rb +7 -0
data/lib/dwca_hunter/xml.rb +33 -0
data/lib/dwca_hunter.rb +53 -0
metadata +250 -0

data/lib/dwca_hunter/downloader.rb ADDED Viewed

@@ -0,0 +1,60 @@
+# encoding: utf-8
+module DwcaHunter
+  class Downloader
+    attr_reader :url
+    def initialize(source_url, file_path)
+      @source_url = source_url
+      @file_path = file_path
+      @url = Url.new(source_url)
+      @download_length = 0
+      @filename = nil
+    end
+    # downloads a given file into a specified filename.
+    # If block is given returns download progress
+    def download
+      raise "#{@source_url} is not accessible" unless @url.valid?
+      f = open(@file_path,'wb')
+      count = 0
+      @url.net_http.request_get(@url.path) do |r|
+        r.read_body do |s|
+          @download_length += s.length
+          f.write s
+          if block_given?
+            count += 1
+            if count % 100 == 0
+              yield @download_length
+            end
+          end
+        end
+      end
+      f.close
+      downloaded = @download_length
+      @download_length = 0
+      downloaded
+    end
+    def download_with_percentage
+      start_time = Time.now
+      download do |r|
+        percentage = r.to_f/@url.header.content_length * 100
+        elapsed_time = Time.now - start_time
+        eta = calculate_eta(percentage, elapsed_time)
+        res = { percentage: percentage,
+                elapsed_time: elapsed_time,
+                eta: eta }
+        yield res
+      end
+    end
+    protected
+    def calculate_eta(percentage, elapsed_time)
+      eta = elapsed_time/percentage * 100 - elapsed_time
+      eta = 1.0 if eta <= 0
+      eta
+    end
+  end
+end

data/lib/dwca_hunter/encoding.rb ADDED Viewed

@@ -0,0 +1,17 @@
+module DwcaHunter
+  module Encoding
+    def self.latin1_to_utf8(file_path)
+      new_file = file_path + '.utf_8'
+      puts "Creating %s" % new_file
+      r = open(file_path)
+      w = open(new_file, 'w:utf-8')
+      r.each do |l|
+        l.encode!('UTF-8', 'ISO-8859-1', invalid: :replace, replace: '?')
+        w.write l
+      end
+      r.close
+      w.close
+      new_file
+    end
+  end
+end

data/lib/dwca_hunter/resource.rb ADDED Viewed

@@ -0,0 +1,101 @@
+module DwcaHunter
+  class Resource
+    attr_reader :url, :uuid, :download_path, :title, :abbr, :command
+    def self.unzip(file, dir = nil)
+      Dir.chdir(dir) if dir
+      `unzip -qq -u #{file} > /dev/null 2>&1`
+    end
+    def initialize(opts)
+      @needs_download = !(opts[:download] == false)
+      @needs_unpack = !(opts[:unpack] == false)
+      @download_dir, @download_file = File.split(@download_path)
+      prepare_path if needs_download?
+    end
+    def needs_download?
+      @needs_download
+    end
+    def needs_unpack?
+      @needs_unpack
+    end
+    def download
+      DwcaHunter::logger_write(self.object_id,
+                               "Starting download of '%s'" % @url)
+      percentage = 0
+      if url.match(/^\s*http:\/\//)
+        dlr = DwcaHunter::Downloader.new(url, @download_path)
+        downloaded_length = dlr.download_with_percentage do |r|
+          if r[:percentage].to_i != percentage
+            percentage = r[:percentage].to_i
+            msg = "Downloaded %.0f%% in %.0f seconds ETA is %.0f seconds" %
+                          [percentage, r[:elapsed_time], r[:eta]]
+            DwcaHunter::logger_write(self.object_id, msg)
+          end
+        end
+        DwcaHunter::logger_write(self.object_id,
+                                 "Download finished, Size: %s" %
+                                  downloaded_length)
+      else
+        `curl -s #{url} > #{download_path}`
+      end
+    end
+    private
+    def cleanup(str)
+      str.strip!
+      str.to_i.to_s == str ? str.to_i : str
+    end
+    def prepare_path
+      FileUtils.rm_rf(@download_dir)
+      FileUtils.mkdir_p(@download_dir)
+    end
+    def unpack_bz2
+      DwcaHunter::logger_write(self.object_id,
+                               'Unpacking a bz2 file, it might take a while...')
+      Dir.chdir(@download_dir)
+      `bunzip2 #{@download_file}`
+    end
+    def unpack_zip
+      DwcaHunter::logger_write(self.object_id,
+                               'Unpacking a zip file, it might take a while...')
+      self.class.unzip(@download_file, @download_dir)
+    end
+    def unpack_gzip
+      DwcaHunter::logger_write(self.object_id,
+                               'Unpacking gzip file, it might take a while...')
+      self.class.gunzip(@download_file, @download_dir)
+    end
+    def unpack_tar
+      DwcaHunter::logger_write(self.object_id,
+                               'Unpacking a tar file, it might take a while...')
+      Dir.chdir(@download_dir)
+      `tar zxvf #{@download_file}`
+    end
+    def generate_dwca
+      gen = DarwinCore::Generator.new(File.join(@download_dir, 'dwca.tar.gz'))
+      gen.add_core(@core, 'taxa.txt')
+      @extensions.each_with_index do |extension, i|
+        gen.add_extension(extension[:data],
+                          extension[:file_name],
+                          true,
+                          extension[:row_type])
+      end
+      gen.add_meta_xml
+      gen.add_eml_xml(@eml)
+      gen.pack
+      DwcaHunter::logger_write(self.object_id,
+                               'DarwinCore Archive file is created')
+    end
+  end
+end

data/lib/dwca_hunter/resources/arctos.rb ADDED Viewed

@@ -0,0 +1,222 @@
+# encoding: utf-8
+module DwcaHunter
+  class ResourceArctos < DwcaHunter::Resource
+    def initialize(opts = {})
+      @command = 'arctos'
+      @title = 'Arctos'
+      @url = 'http://arctos.database.museum/download/gncombined.zip'
+      @UUID =  'eea8315d-a244-4625-859a-226675622312'
+      @download_path = File.join(Dir.tmpdir,
+                                 'dwca_hunter',
+                                 'arctos',
+                                 'data.tar.gz')
+      @synonyms = []
+      @names = []
+      @vernaculars = []
+      @extensions = []
+      super(opts)
+      @gnub_dir = File.join(@download_dir, 'gnub')
+    end
+    def unpack
+      unpack_zip
+    end
+    def make_dwca
+      DwcaHunter::logger_write(self.object_id, 'Extracting data')
+      get_names
+      generate_dwca
+    end
+    private
+    def get_names
+      Dir.chdir(@download_dir)
+      Dir.entries(@download_dir).grep(/zip$/).each do |file|
+        self.class.unzip(file) unless File.exists?(file.gsub(/zip$/,'csv'))
+      end
+      collect_names
+      collect_synonyms
+      collect_vernaculars
+    end
+    def collect_vernaculars
+      file = open(File.join(@download_dir, 'common_name.csv'))
+      fields = {}
+      file.each_with_index do |row, i|
+        if i == 0
+          fields = get_fields(row)
+          next
+        end
+        row = split_row(row)
+        taxon_id = row[fields[:taxon_name_id]]
+        vernacular_name_string = row[fields[:common_name]]
+        @vernaculars << {
+          taxon_id: taxon_id,
+          vernacular_name_string: vernacular_name_string
+        }
+        puts "Processed %s vernaculars" % i if i % 10000 == 0
+      end
+    end
+    def collect_synonyms
+      file = open(File.join(@download_dir, 'taxon_relations.csv'))
+      fields = {}
+      file.each_with_index do |row, i|
+        if i == 0
+          fields = get_fields(row)
+          next
+        end
+        row = split_row(row)
+        taxon_id = row[fields[:taxon_name_id]]
+        @synonyms << {
+          taxon_id: row[fields[:related_taxon_name_id]],
+          local_id: taxon_id,
+          name_string: @names_index[taxon_id],
+          #synonym_authority:      row[fields[:relation_authority]],
+          taxonomic_status:       row[fields[:taxon_relationship]],
+        }
+        puts "Processed %s synonyms" % i if i % 10000 == 0
+      end
+    end
+    def collect_names
+      @names_index = {}
+      file = open(File.join(@download_dir, 'taxonomy.csv'))
+      fields = {}
+      file.each_with_index do |row, i|
+        if i == 0
+          fields = get_fields(row)
+          next
+        end
+        next unless  row[fields[:display_name]]
+        row = split_row(row)
+        taxon_id = row[fields[:taxon_name_id]]
+        name_string = row[fields[:display_name]].gsub(/<\/?i>/,'')
+        kingdom = row[fields[:kingdom]]
+        phylum = row[fields[:phylum]]
+        klass = row[fields[:phylclass]]
+        subclass = row[fields[:subclass]]
+        order = row[fields[:phylorder]]
+        suborder = row[fields[:suborder]]
+        superfamily = row[fields[:superfamily]]
+        family = row[fields[:family]]
+        subfamily = row[fields[:subfamily]]
+        tribe = row[fields[:tribe]]
+        genus = row[fields[:genus]]
+        subgenus = row[fields[:subgenus]]
+        species = row[fields[:species]]
+        subspecies = row[fields[:subspecies]]
+        code = row[fields[:nomenclatural_code]]
+        @names << { taxon_id: taxon_id,
+          local_id: taxon_id,
+          name_string: name_string,
+          kingdom: kingdom,
+          phylum: phylum,
+          klass: klass,
+          order: order,
+          family: family,
+          genus: genus,
+          code: code,
+        }
+        @names_index[taxon_id] = name_string
+        puts "Processed %s names" % i if i % 10000 == 0
+      end
+    end
+    def split_row(row)
+      row = row.strip.gsub(/^"/, '').gsub(/"$/, '')
+      row.split('","')
+    end
+    def get_fields(row)
+      row = row.split(",")
+      encoding_options = {
+        :invalid           => :replace,
+        :undef             => :replace,
+        :replace           => '',
+        :universal_newline => true
+      }
+      num_ary = (0...row.size).to_a
+      row = row.map do |f|
+        f = f.strip.downcase
+        f = f.encode ::Encoding.find('ASCII'), encoding_options
+        f.to_sym
+      end
+      Hash[row.zip(num_ary)]
+    end
+    def generate_dwca
+      DwcaHunter::logger_write(self.object_id,
+                               'Creating DarwinCore Archive file')
+      @core = [['http://rs.tdwg.org/dwc/terms/taxonID',
+        'http://globalnames.org/terms/localID',
+        'http://rs.tdwg.org/dwc/terms/scientificName',
+        'http://rs.tdwg.org/dwc/terms/kingdom',
+        'http://rs.tdwg.org/dwc/terms/phylum',
+        'http://rs.tdwg.org/dwc/terms/class',
+        'http://rs.tdwg.org/dwc/terms/order',
+        'http://rs.tdwg.org/dwc/terms/family',
+        'http://rs.tdwg.org/dwc/terms/genus',
+        'http://rs.tdwg.org/dwc/terms/nomenclaturalCode',
+        ]]
+      @names.each do |n|
+        @core << [n[:taxon_id], n[:taxon_id], n[:name_string],
+          n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
+          n[:genus], n[:code]]
+      end
+      @extensions << {
+        data: [[
+          'http://rs.tdwg.org/dwc/terms/taxonID',
+          'http://rs.tdwg.org/dwc/terms/vernacularName']],
+        file_name: 'vernacular_names.txt',
+        row_type: 'http://rs.gbif.org/terms/1.0/VernacularName' }
+      @vernaculars.each do |v|
+        @extensions[-1][:data] << [v[:taxon_id], v[:vernacular_name_string]]
+      end
+      @extensions << {
+        data: [[
+          'http://rs.tdwg.org/dwc/terms/taxonID',
+          'http://globalnames.org/terms/localID',
+          'http://rs.tdwg.org/dwc/terms/scientificName',
+          'http://rs.tdwg.org/dwc/terms/taxonomicStatus',
+          ]],
+        file_name: 'synonyms.txt',
+        }
+      @synonyms.each do |s|
+        @extensions[-1][:data] << [
+          s[:taxon_id], s[:local_id],
+          s[:name_string], s[:taxonomic_status]]
+      end
+      @eml = {
+        id: @uuid,
+        title: @title,
+        authors: [
+          {email: 'dustymc at gmail dot com'}
+      ],
+        metadata_providers: [
+          { first_name: 'Dmitry',
+            last_name: 'Mozzherin',
+            email: 'dmozzherin@gmail.com' }
+      ],
+        abstract: 'Arctos is an ongoing effort to integrate access to specimen data, collection-management tools, and external resources on the internet.',
+        url: @url
+      }
+      super
+    end
+  end
+end

data/lib/dwca_hunter/resources/birdlife.rb ADDED Viewed

@@ -0,0 +1,160 @@
+module DwcaHunter
+  class ResourceBirdLife < DwcaHunter::Resource
+    def initialize(opts = {})
+      @command = "bird-life"
+      @title = "BirdLife International"
+      @uuid = "b1d8de7a-ab96-455f-acd8-f3fff2d7d169"
+      @data = []
+      @extensions = []
+      @url = "http://www.birdlife.org/datazone/userfiles"\
+             "/file/Species/Taxonomy/BirdLife_Checklist_Version_70.zip"
+      @download_path = File.join(Dir.tmpdir, "dwca_hunter", "birdlife",
+                                 "fake.zip")
+      @clades = {}
+      super
+    end
+    def needs_unpack?
+      false
+    end
+    def download
+    end
+    def make_dwca
+      organize_data
+      generate_dwca
+    end
+    private
+    def generate_dwca
+      DwcaHunter::logger_write(self.object_id,
+                               'Creating DarwinCore Archive file')
+      core_init
+      extensions_init
+      eml_init
+      @data.each do |rec|
+        process(rec)
+      end
+      super
+    end
+    def core_init
+      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
+                "http://globalnames.org/terms/localID",
+                "http://rs.tdwg.org/dwc/terms/parentNameUsageID",
+                "http://rs.tdwg.org/dwc/terms/acceptedNameUsageID",
+                "http://rs.tdwg.org/dwc/terms/scientificName",
+                "http://rs.tdwg.org/dwc/terms/taxonomicStatus",
+                "http://rs.tdwg.org/dwc/terms/taxonRank"]]
+      @count = 1
+      @core << [@count, nil, nil, @count, "Aves", nil, "class"]
+    end
+    def process(rec)
+      parent_id = 1
+      [:order, :family].each do |rank|
+        clade_id = nil
+        unless @clades[rec[rank]]
+          @count += 1
+          @clades[rec[rank]] = { id: @count }
+        end
+        clade_id = @clades[rec[rank]][:id]
+        @core << [clade_id, nil, parent_id, clade_id, rec[rank], nil, rank.to_s]
+        parent_id = clade_id
+      end
+      @count += 1
+      @core << [@count, rec[:local_id], parent_id, @count,
+                rec[:scientific_name], nil, rec[:rank]]
+      taxon = @core.last
+      process_synonyms(rec, taxon)
+      process_vernaculars(rec, taxon)
+    end
+    def process_synonyms(rec, taxon)
+      rec[:synonyms].each do |syn|
+        @count += 1
+        @core << [@count, nil, taxon[2], taxon[0], syn, "synonym", taxon[-1]]
+      end
+    end
+    def process_vernaculars(rec, taxon)
+      rec[:vernaculars].each do |v|
+        taxon_id = taxon[0]
+        lang = "en"
+        name = v
+        @extensions[0][:data] << [taxon_id, name, lang]
+      end
+    end
+    def extensions_init
+      @extensions << { data: [["http://rs.tdwg.org/dwc/terms/taxonID",
+                               "http://rs.tdwg.org/dwc/terms/vernacularName",
+                               "http://purl.org/dc/terms/language"]],
+                       file_name: "vernacular_names.txt",
+                       row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
+                     }
+    end
+    def organize_data
+      DwcaHunter::logger_write(self.object_id,
+                               "Organizing data")
+      path = File.join(__dir__, "..",
+                       "..", "files", "birdlife_7.csv")
+      opts = { headers: true, header_converters: :symbol }
+      collect_data(path, opts)
+    end
+    def collect_data(path, opts)
+      @data = CSV.open(path, opts).each_with_object([]) do |row, data|
+        order = row[:order]
+        order = order.capitalize if order.match(/^[A-Z]+$/)
+        family = row[:familyname]
+        scientific_name = [row[:scientificname], row[:authority]].join(" ").
+          strip.gsub(/[\s]+/, " ")
+        rank = row[:taxonomictreatment] == "R" ? "species" : "not recognized"
+        local_id = row[:sisrecid]
+        vernaculars = collect_vernaculars(row)
+        synonyms = collect_synonyms(row)
+        data << { order: order, family: family, rank: rank,
+                  scientific_name: scientific_name, synonyms: synonyms,
+                  local_id: local_id, vernaculars: vernaculars }
+      end
+    end
+    def collect_synonyms(row)
+      synonyms = row[:synonyms]
+      synonyms ? synonyms.split(";").map(&:strip) : []
+    end
+    def collect_vernaculars(row)
+      name1 = row[:commonname]
+      names = name1 ? [name1] : []
+      other = row[:alternativecommonnames]
+      if other
+        names += other.split(";").map(&:strip)
+      end
+      names
+    end
+    def eml_init
+      @eml = {
+        id: @uuid,
+        title: @title,
+        authors: [],
+        metadata_providers: [
+          { first_name: "Dmitry",
+            last_name: "Mozzherin",
+            email: "dmozzherin@gmail.com" }
+      ],
+        abstract: "BirdLife is widely recognised as the world leader in bird "\
+                  "conservation. Rigorous science informed by practical "\
+                  "feedback from projects on the ground in important sites "\
+                  "and habitats enables us to implement successful "\
+                  "conservation programmes for birds and all nature.",
+        url: "http://www.birdlife.org/"
+      }
+    end
+  end
+end

data/lib/dwca_hunter/resources/fishbase.rb ADDED Viewed

@@ -0,0 +1,99 @@
+module DwcaHunter
+  # Resource for FishBase
+  class ResourceFishbase < DwcaHunter::Resource
+    attr_reader :title, :abbr
+    def initialize(opts = {})
+      @command = "fishbase"
+      @title = "FishBase Cache"
+      @abbr = "FishBase Cache"
+      @uuid = "bacd21f0-44e0-43e2-914c-70929916f257"
+      @download_path = File.join(Dir.tmpdir, "dwca_hunter", "fishbase",
+                                 "fishbase.tsv")
+      @extensions = []
+      super
+    end
+    def download
+      FileUtils.cp(File.join(__dir__, "..", "..", "files",
+                             "fishbase_taxon_cache.tsv"), @download_path)
+    end
+    def unpack
+    end
+    def make_dwca
+      organize_data
+      generate_dwca
+    end
+    private
+    def organize_data
+      ranks = %i(class order family sub_family genus species)
+      DwcaHunter::logger_write(self.object_id,
+                               "Organizing data")
+      # snp = ScientificNameParser.new
+      @data = CSV.open(@download_path, col_sep: "\t")
+        .each_with_object([]) do |row, data|
+        cl = Hash[ranks.zip(row[4].split("|"))]
+        data << { taxon_id: row[0],
+                  local_id: row[0],
+                  scientific_name: row[1],
+                  rank: row[2],
+                  source: row[7]
+                }.merge(cl)
+      end
+    end
+    def generate_dwca
+      DwcaHunter::logger_write(self.object_id,
+                               'Creating DarwinCore Archive file')
+      core_init
+      eml_init
+      DwcaHunter::logger_write(self.object_id, 'Assembling Core Data')
+      count = 0
+      @data.each do |d|
+        count += 1
+        if count % 10000 == 0
+          DwcaHunter::logger_write(self.object_id, "Core row #{count}")
+        end
+        @core << [d[:taxon_id], d[:taxon_id], d[:taxon_id],
+                  d[:scientific_name], d[:rank],
+                  d[:class], d[:order], d[:family], d[:genus],
+                  d[:source]]
+      end
+      super
+    end
+    def eml_init
+      @eml = {
+        id: @uuid,
+        title: @title,
+        authors: [],
+        metadata_providers: [
+          { first_name: "Jorrit",
+            last_name: "Poelen",
+          }
+      ],
+        abstract: "FishBase is a global species database of fish species" \
+                  "(specifically finfish). It is the largest and the most" \
+                  "extensively accessed online database of finfish",
+        url: "http://www.fishbase.org"
+      }
+    end
+    def core_init
+      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
+                "http://globalnames.org/terms/localID",
+                "http://rs.tdwg.org/dwc/terms/acceptedNameUsageID",
+                "http://rs.tdwg.org/dwc/terms/scientificName",
+                "http://rs.tdwg.org/dwc/terms/taxonRank",
+                "http://rs.tdwg.org/dwc/terms/class",
+                "http://rs.tdwg.org/dwc/terms/order",
+                "http://rs.tdwg.org/dwc/terms/family",
+                "http://rs.tdwg.org/dwc/terms/genus",
+                "http://purl.org/dc/terms/source"]]
+    end
+  end
+end