RubyGems - dwca_hunter - Versions diffs - 0.5.0 - Mend

dwca_hunter 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

checksums.yaml +7 -0
data/.byebug_history +31 -0
data/.document +5 -0
data/.gitignore +58 -0
data/.rspec +3 -0
data/.rubocop.yml +33 -0
data/.ruby-version +1 -0
data/CHANGELOG.md +15 -0
data/Gemfile +3 -0
data/Gemfile.lock +133 -0
data/LICENSE.txt +20 -0
data/README.md +39 -0
data/Rakefile +11 -0
data/dwca_hunter.gemspec +42 -0
data/exe/dwcahunter +77 -0
data/files/birdlife_7.csv +11862 -0
data/files/fishbase_taxon_cache.tsv +81000 -0
data/files/reptile_checklist_2014_12.csv +15158 -0
data/lib/dwca_hunter/downloader.rb +60 -0
data/lib/dwca_hunter/encoding.rb +17 -0
data/lib/dwca_hunter/resource.rb +101 -0
data/lib/dwca_hunter/resources/arctos.rb +222 -0
data/lib/dwca_hunter/resources/birdlife.rb +160 -0
data/lib/dwca_hunter/resources/fishbase.rb +99 -0
data/lib/dwca_hunter/resources/freebase.rb +152 -0
data/lib/dwca_hunter/resources/gnub.rb +101 -0
data/lib/dwca_hunter/resources/itis.rb +271 -0
data/lib/dwca_hunter/resources/mammal_species.rb +179 -0
data/lib/dwca_hunter/resources/ncbi.rb +174 -0
data/lib/dwca_hunter/resources/opentree.rb +121 -0
data/lib/dwca_hunter/resources/reptiles_checklist.rb +139 -0
data/lib/dwca_hunter/resources/wikispecies.rb +350 -0
data/lib/dwca_hunter/resources/worms.rb +176 -0
data/lib/dwca_hunter/url.rb +33 -0
data/lib/dwca_hunter/version.rb +7 -0
data/lib/dwca_hunter/xml.rb +33 -0
data/lib/dwca_hunter.rb +53 -0
metadata +250 -0

data/lib/dwca_hunter/resources/wikispecies.rb ADDED Viewed

@@ -0,0 +1,350 @@
+# encoding: utf-8
+module DwcaHunter
+  class ResourceWikispecies < DwcaHunter::Resource
+    def initialize(opts = {})
+      @problems_file = open('problems.txt', 'w:utf-8')
+      @command = "wikispecies"
+      @title = 'Wikispecies'
+      @url = 'http://dumps.wikimedia.org/specieswiki/latest/' +
+             'specieswiki-latest-pages-articles.xml.bz2'
+      @url = opts[:url] if opts[:url]
+      @uuid = '68923690-0727-473c-b7c5-2ae9e601e3fd'
+      @download_path = File.join(Dir.tmpdir,
+                                 'dwca_hunter',
+                                 'wikispecies',
+                                 'data.xml.bz2')
+      @data = []
+      @templates = {}
+      @taxon_ids = {}
+      @tree = {}
+      @paths = {}
+      @extensions = []
+      @re = {
+        page_start: /^\s*\<page\>\s*$/,
+        page_end: /^\s*\<\/page\>\s*$/,
+        template: /Template:/i,
+        template_link: /\{\{([^\}]*)\}\}/,
+        vernacular_names: /\{\{\s*VN\s*\|([^\}]+)\}\}/i
+      }
+      super(opts)
+    end
+    # Unpacks the downloaded dump by delegating to unpack_bz2, which is not
+    # defined in this class (presumably inherited from DwcaHunter::Resource).
+    def unpack
+      unpack_bz2
+    end
+    def make_dwca
+      enrich_data
+      extend_classification
+      generate_dwca
+    end
+  private
+    def enrich_data
+      DwcaHunter::logger_write(self.object_id,
+                               'Extracting data from xml file...')
+      Dir.chdir(@download_dir)
+      f = open('data.xml', 'r:utf-8')
+      page_on = false
+      page = ''
+      page_num = 0
+      f.each do |l|
+        if l.match(@re[:page_start])
+          page << l
+          page_on = true
+        elsif page_on
+          page << l
+          if l.match(@re[:page_end])
+            page_on = false
+            page_xml = Nokogiri::XML.parse(page)
+            template?(page_xml) ?
+              process_template(page_xml) :
+              process_species(page_xml)
+            page_num += 1
+            if page_num % BATCH_SIZE == 0
+              DwcaHunter::logger_write(self.object_id,
+                                       "Traversed %s pages" % page_num)
+            end
+            page = ''
+            @page_title = nil
+            @page_id = nil
+          end
+        end
+      end
+      DwcaHunter::logger_write(self.object_id,
+                               'Extracted total %s pages' % page_num)
+      f.close
+    end
+    def extend_classification
+      DwcaHunter::logger_write(self.object_id, 'Extending classifications')
+      @data.each_with_index do |d, i|
+        unless d[:classificationPath].empty?
+          n = 50
+          while n > 0
+            n -= 1
+            if n == 0
+              d[:classificationPath] = []
+              break
+            end
+            parent = @templates[d[:classificationPath].first]
+            if parent
+              d[:classificationPath].unshift(parent[:parentName])
+            else
+              update_tree(d[:classificationPath])
+              break
+            end
+          end
+        end
+        # d[:classificationPath] = d[:classificationPath].join("|").
+        # gsub("Main Page", "Life")
+        if i % BATCH_SIZE == 0 && i > 0
+          DwcaHunter::logger_write(self.object_id,
+                                   "Extended %s classifications" % i)
+        end
+      end
+    end
+    def update_tree(path)
+      path = path.dup
+      return if @paths.has_key?(path.join('|'))
+      (0...path.size).each do |i|
+        subpath = path[0..i]
+        subpath_string = subpath.join('|')
+        next if @paths.has_key?(subpath_string)
+        name = subpath.pop
+        tree_element = subpath.inject(@tree) { |res, n| res[n] }
+        tree_element[name] = {}
+        @paths[subpath_string] = 1
+      end
+    end
+    def process_template(x)
+      name = page_title(x).gsub!(@re[:template], '').strip
+      text = x.xpath('//text').text.strip
+      parent_name = text.match(@re[:template_link])
+      if parent_name
+        return if parent_name[1].match(/\#if/)
+        list = parent_name[1].split("|")
+        if list.size == 1
+          parent_name = list[0]
+        elsif list[0].match /Taxonav/i
+          parent_name = list[1]
+        else
+          parent_name = list[0]
+        end
+      end
+      name.gsub!(/_/, ' ')
+      parent_name.gsub!(/_/, ' ') if parent_name
+      @templates[name] = { parentName: parent_name, id: page_id(x) }
+    end
+    def process_species(x)
+      return if page_title(x).match(/Wikispecies/i)
+      items = find_species_components(x)
+      if items
+        @data << {
+          taxonId: page_id(x),
+          canonicalForm: page_title(x),
+          scientificName: page_title(x),
+          classificationPath: [],
+          vernacularNames: [] }
+        get_full_scientific_name(items)
+        get_vernacular_names(items)
+        init_classification_path(items)
+      end
+    end
+    def get_full_scientific_name(items)
+      if items['name']
+        if name = items['name'][0]
+          @data[-1][:scientificName] = parse_name(name, @data[-1])
+        else
+          @problems_file.write("%s\n" % @data[-1][:canonicalForm])
+        end
+      end
+    end
+    def get_vernacular_names(items)
+      if items['vernacular names'] && items['vernacular names'].size > 0
+        vn_string = items['vernacular names'].join("")
+        vn = vn_string.match(@re[:vernacular_names])
+        if vn
+          vn_list = vn[1].strip.split("|")
+          vnames = []
+          vn_list.each do |item|
+            language, name = item.split("=").map { |x| x.strip }
+            if language && name && language.size < 4 && name.valid_encoding?
+              vnames << {
+                name: name,
+                language: language }
+            end
+          end
+          @data[-1][:vernacularNames] = vnames
+        end
+      end
+    end
+    def init_classification_path(items)
+      if items['taxonavigation']
+        items['taxonavigation'].each do |line|
+          line.gsub!(/\[\[.*\]\]/, '') # ignore non-template links
+          if template_link = line.match(@re[:template_link])
+            template_link = template_link[1].
+              strip.gsub(/Template:/, '').gsub(/_/, ' ')
+            if !template_link.match(/\|/)
+              @data[-1][:classificationPath] << template_link
+              break
+            end
+          end
+        end
+      end
+    end
+    def find_species_components(x)
+      items = get_items(x.xpath('//text').text)
+      is_taxon_item = items.has_key?('name') ||
+                      items.has_key?('taxonavigation')
+      return nil unless is_taxon_item
+      items
+    end
+    def get_items(txt)
+      item_on = false
+      items = {}
+      current_item = nil
+      txt.split("\n").each do |l|
+        item =  l.match(/[\=]+([^\=]+)[\=]+/)
+        if item
+          current_item = item[1].strip.downcase
+          items[current_item] = []
+        elsif current_item && !l.empty?
+          items[current_item] << l
+        end
+      end
+      items
+    end
+    # Returns the text of the first node matched by the '//title' XPath
+    # query. The value is memoized in @page_title, so x is ignored on later
+    # calls until enrich_data resets @page_title to nil.
+    def page_title(x)
+      @page_title ||= x.xpath('//title').first.text
+    end
+    # Returns the text of the first node matched by the '//id' XPath query.
+    # Memoized in @page_id in the same way as page_title.
+    def page_id(x)
+      @page_id ||= x.xpath('//id').first.text
+    end
+    # True when the page title matches the template pattern (/Template:/i),
+    # false otherwise.
+    def template?(page_xml)
+      !!page_title(page_xml).match(@re[:template])
+    end
+    def parse_name(name_string, taxa)
+      name_string.gsub!('BASEPAGENAME', taxa[:canonicalForm])
+      name_string = name_string.strip
+      old_l = name_string.dup
+      name_string.gsub! /^\*\s*/, ''
+      name_string.gsub!(/\[\[([^\]]+\|)?([^\]]*)\]\]/, '\2')
+      name_string.gsub!(/\{\{([^\}]+\|)?([^\}]*)\}\}/, '\2')
+      name_string.gsub!(/[']{2,}/, ' ')
+      name_string.gsub!(/["]{2,}/, ' ')
+      name_string.gsub!(/\:\s*\d.*$/, '')
+      name_string.gsub!(/,\s*\[RSD\]/i, '')
+      name_string.gsub!(/^\s*†\s*/, '')
+      name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/, '')
+      # name_string = DwcaHunter::XML.unescape(name_string)
+      name_string.gsub!(/\<nowiki\>.*$/, '')
+      name_string.gsub!(/\<br\s*[\/]?\s*\>/, '')
+      name_string.gsub!(/^\s*\&dagger;\s*/, '')
+      name_string.gsub!(/&nbsp;/, ' ')
+      name_string.gsub!(/\s+/, ' ')
+      name_string = name_string.strip
+      # puts "%s---%s" % [name_string, old_l]
+      return name_string
+    end
+    def generate_dwca
+      DwcaHunter::logger_write(self.object_id,
+                               'Creating DarwinCore Archive file')
+      @core = [
+        ['http://rs.tdwg.org/dwc/terms/taxonID',
+         'http://rs.tdwg.org/dwc/terms/scientificName',
+         'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
+         'http://globalnames.org/terms/canonicalForm',
+         'http://rs.tdwg.org/dwc/terms/higherClassification',
+         'http://purl.org/dc/terms/source']
+      ]
+      DwcaHunter::logger_write(self.object_id, 'Assembling Core Data')
+      count = 0
+      @data.map do |d|
+        count += 1
+        if count % BATCH_SIZE == 0
+          DwcaHunter::logger_write(self.object_id,
+                                   "Traversing %s core data record" % count)
+        end
+        taxon_id = (d[:classificationPath].empty? ?
+                    d[:taxonId] :
+                    @templates[d[:classificationPath].
+                      last][:id]) rescue d[:taxonId]
+        @taxon_ids[d[:taxonId]] = taxon_id
+        parentNameUsageId = (d[:classificationPath].size > 1 ?
+                             @templates[d[:classificationPath][-2]][:id] :
+                             nil) rescue nil
+        url = 'http://species.wikimedia.org/wiki/' +
+          URI.encode(d[:canonicalForm].gsub(' ', '_'))
+        path = d[:classificationPath]
+        path.pop if path[-1] == d[:canonicalForm]
+        canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/, '').strip
+        scientific_name = (d[:scientificName] == d[:canonicalForm]) ?
+                           canonical_form :
+                           d[:scientificName]
+        @core << [taxon_id,
+                  scientific_name,
+                  parentNameUsageId,
+                  canonical_form,
+                  path.join('|'),
+                  url]
+      end
+      @extensions << { data: [[
+        'http://rs.tdwg.org/dwc/terms/TaxonID',
+        'http://rs.tdwg.org/dwc/terms/vernacularName',
+        'http://purl.org/dc/terms/language'
+      ]], file_name: 'vernacular_names.txt' }
+      DwcaHunter::logger_write(self.object_id,
+              'Creating verncaular name extension for DarwinCore Archive file')
+      count = 0
+      @data.each do |d|
+        count += 1
+        if count % BATCH_SIZE == 0
+          DwcaHunter::logger_write(self.object_id,
+                                 "Traversing %s extension data record" % count)
+        end
+        d[:vernacularNames].each do |vn|
+          taxon_id = @taxon_ids[d[:taxonId]] ? @taxon_ids[d[:taxonId]] : nil
+          if taxon_id
+            @extensions[-1][:data] << [taxon_id, vn[:name], vn[:language]]
+          end
+        end
+      end
+      @eml = {
+        id: @uuid,
+        title: @title,
+        license: 'http://creativecommons.org/licenses/by-sa/3.0/',
+        authors: [
+          { first_name: 'Stephen',
+            last_name: 'Thorpe',
+            email: 'stephen_thorpe@yahoo.co.nz',
+            url: 'http://species.wikimedia.org/wiki/Main_Page' }],
+        abstract: 'The free species directory that anyone can edit.',
+        metadata_providers: [
+          { first_name: 'Dmitry',
+            last_name: 'Mozzherin',
+            email: 'dmozzherin@mbl.edu' }],
+        url: 'http://species.wikimedia.org/wiki/Main_Page'
+      }
+      super
+    end
+  end
+end

data/lib/dwca_hunter/resources/worms.rb ADDED Viewed

@@ -0,0 +1,176 @@
+# encoding: utf-8
+module DwcaHunter
+  class ResourceWoRMS < DwcaHunter::Resource
+    def initialize(opts = {})
+      @command = 'worms'
+      @title = 'WoRMS'
+      @url = 'http://content60.eol.org/resources/26.tar.gz'
+      @uuid =  '9d27a7ad-2e6a-4597-a79b-23fb3b2f8284'
+      @download_path = File.join(Dir.tmpdir,
+                                 'dwca_hunter',
+                                 'worms',
+                                 'data.tar.gz')
+      @fields = ['dc:identifier',
+                 'dc:source',
+                 'dwc:Kingdom',
+                 'dwc:Phylum',
+                 'dwc:Class',
+                 'dwc:Order',
+                 'dwc:Family',
+                 'dwc:Genus',
+                 'dwc:ScientificName']
+      @rank = { 1 => 'kingdom',
+                2 => 'phylum',
+                3 => 'class',
+                4 => 'order',
+                5 => 'family',
+                6 => 'genus',
+                7 => 'species' }
+      @known_paths = {}
+      @data = []
+      @extensions = []
+      @extensions << { data: [[
+        'http://rs.tdwg.org/dwc/terms/taxonId',
+        'http://rs.tdwg.org/dwc/terms/scientificName']],
+        file_name: 'synonyms.txt' }
+      @re = {
+        cdata: %r#\<\!\[CDATA\[(.*)\]\]\>#
+      }
+      @core = [[
+        'http://rs.tdwg.org/dwc/terms/taxonID',
+        'http://purl.org/dc/terms/parentNameUsageID',
+        'http://purl.org/dc/terms/source',
+        'http://rs.tdwg.org/dwc/terms/acceptedNameUsageID',
+        'http://purl.org/dc/terms/scientificName',
+        'http://purl.org/dc/terms/taxonRank']]
+      super
+    end
+    # Unpacks the downloaded archive by delegating to unpack_tar, which is
+    # not defined in this class (presumably inherited from
+    # DwcaHunter::Resource).
+    def unpack
+      unpack_tar
+    end
+    # Runs the conversion pipeline in order: read taxa from the XML file,
+    # build the core and synonym rows, then generate the archive.
+    def make_dwca
+      collect_data
+      make_core_data
+      generate_dwca
+    end
+    private
+    def collect_data
+      DwcaHunter::logger_write(self.object_id, 'Traversing xml file...')
+      xml_file = File.join(@download_dir, '26.xml')
+      f = open(xml_file, 'r:utf-8')
+      in_taxon = false
+      taxon = nil
+      count = 0
+      Nokogiri::XML::Reader(f).each do |node|
+        if !in_taxon && node.name == 'taxon'
+          in_taxon = true
+          taxon = {}
+          @fields.each { |field| taxon[field.to_sym] = nil }
+          taxon[:synonyms] = []
+        elsif in_taxon && node.name == 'taxon'
+          in_taxon = false
+          @data << taxon
+          taxon = nil
+          count += 1
+          if count % BATCH_SIZE == 0
+            DwcaHunter::logger_write(self.object_id,
+                                     "Extracted %s taxons" % count)
+          end
+        elsif in_taxon
+          item = node.name.to_sym
+          if taxon.has_key?(item) && !taxon[item]
+            text = node.inner_xml
+            if cdata = text.match(@re[:cdata])
+              text = cdata[1]
+            else
+              text = DwcaHunter::XML.unescape(text)
+            end
+            taxon[item] = text
+          elsif node.name == 'synonym' &&
+              (cdata = node.inner_xml.match(@re[:cdata]))
+            taxon[:synonyms] << cdata[1]
+          end
+        end
+      end
+    end
+    def get_gn_id(path_string)
+      gn_uuid = UUID.create_v5(path_string, GNA_NAMESPACE)
+      id = Base64.urlsafe_encode64(gn_uuid.raw_bytes)[0..-3]
+      "gn:" + id
+    end
+    def make_core_data
+      DwcaHunter::logger_write(self.object_id, 'Creating core data')
+      @data.each_with_index do |taxa, i|
+        if i % BATCH_SIZE == 0
+          DwcaHunter::logger_write(self.object_id,
+                                   'Traversing %s species for core' % i)
+        end
+        path = get_path(taxa)
+        parent_id = get_gn_id(path.join('|'))
+        @core << [taxa[:'dc:identifier'],
+                  parent_id, taxa[:'dc:source'],
+                  nil,
+                  taxa[:'dwc:ScientificName'],
+                  'species']
+        taxa[:synonyms].each do |synonym|
+          @extensions[0][:data] << [taxa[:'dc:identifier'], synonym]
+        end
+        until path.empty?
+          path_string = path.join("|")
+          unless @known_paths[path_string]
+            @known_paths[path_string] = 1
+            parent_id = (path.size == 1) ?
+                        nil :
+                        get_gn_id([path[0..-2]].join('|'))
+            id = get_gn_id(path_string)
+            @core << [id, parent_id, nil, nil, path[-1], @rank[path.size]]
+          end
+          path.pop
+        end
+      end
+    end
+    def get_path(taxa)
+      path = []
+      @fields[2..-2].each do |field|
+        path << taxa[field.to_sym]
+      end
+      path
+    end
+    def generate_dwca
+      DwcaHunter::logger_write(self.object_id,
+                               'Creating DarwinCore Archive file')
+      @eml = {
+          id: @uuid,
+          title: @title,
+          authors: [
+            { email: 'info@marinespecies.org',
+              url: 'http://www.marinespecies.org' }
+          ],
+          metadata_providers: [
+            { first_name: 'Dmitry',
+              last_name: 'Mozzherin',
+              email: 'dmozzherin@gmail.com' }
+            ],
+          abstract: 'The aim of a World Register of Marine Species (WoRMS) ' +
+                    'is to provide an authoritative and comprehensive list ' +
+                    'of names of marine organisms, including information ' +
+                    'on synonymy. While highest priority goes to valid ' +
+                    'names, other names in use are included so that this ' +
+                    'register can serve as a guide to interpret taxonomic ' +
+                    'literature.',
+      }
+      super
+    end
+  end
+end

data/lib/dwca_hunter/url.rb ADDED Viewed

@@ -0,0 +1,33 @@
+module DwcaHunter
+  class Url
+    attr_reader :net_http, :path, :header
+    def initialize(url)
+      @url = url
+      @parsed_url = URI.parse(url.strip)
+      @path = @parsed_url.path == '' ? '/' : @parsed_url.path
+      @net_http = Net::HTTP.new(@parsed_url.host, @parsed_url.port)
+      @header = get_header
+    end
+    # confirm that the passed in URL is valid and responses with a proper code
+    def valid?
+        @header && ['200','301','302'].include?(@header.code)
+    end
+    def content_length
+      header ? header.content_length : nil
+    end
+  private
+    def get_header
+      begin
+        return @net_http.head(@path)
+      rescue SocketError
+        return nil
+      end
+    end
+  end
+end

data/lib/dwca_hunter/version.rb ADDED Viewed

@@ -0,0 +1,7 @@
+module DwcaHunter
+  VERSION = "0.5.0"
+  def self.version
+    VERSION
+  end
+end

data/lib/dwca_hunter/xml.rb ADDED Viewed

@@ -0,0 +1,33 @@
+module DwcaHunter
+  module XML
+    def self.escape(input)
+      result = input.dup.strip
+      result.gsub!(/[&<>'"]/) do | match |
+          case match
+          when '&' then '&amp;'
+          when '<' then '&lt;'
+          when '>' then '&gt;'
+          when "'" then '&apos;'
+          when '"' then '&quot;'
+          end
+      end
+      result
+    end
+    def self.unescape(input)
+      result = input.dup.strip
+      result.gsub!(/&[a-z]+;/) do | match |
+          case match
+          when '&amp;'  then '&'
+          when '&lt;'   then '<'
+          when '&gt;'   then '>'
+          when '&apos;' then "'"
+          when '&quot;' then '"'
+          end
+      end
+      result
+    end
+  end
+end

data/lib/dwca_hunter.rb ADDED Viewed

@@ -0,0 +1,53 @@
+# frozen_string_literal: true
+require "logger"
+require "fileutils"
+require "uri"
+require "tmpdir"
+require "net/http"
+require "json"
+require "dwc_archive"
+require "dwca_hunter/resource"
+require "rest_client"
+require "base64"
+Dir[File.join(__dir__, "dwca_hunter", "*.rb")].
+  each { |f| require f }
+Dir[File.join(__dir__, "dwca_hunter", "resources", "*.rb")].
+  each { |f| require f }
+# DwcaHunter a namespace module for the project.
+module DwcaHunter
+  BATCH_SIZE = 10_000
+  class << self
+    attr_reader :resource
+    def logger
+      @logger ||= Logger.new(nil)
+    end
+    attr_writer :logger
+    def logger_reset
+      self.logger = Logger.new(nil)
+    end
+    def logger_write(obj_id, message, method = :info)
+      logger.send(method, "|#{obj_id}|#{message}|")
+    end
+    def process(resource)
+      resource.download if resource.needs_download?
+      resource.unpack if resource.needs_unpack?
+      resource.make_dwca
+    end
+    def resources
+      ObjectSpace.each_object(Class).select do |c|
+        c < Resource
+      end
+    end
+  end
+end