RubyGems - rbbt - Versions diffs - 1.2.5 → 2.0.0 - Mend

rbbt 1.2.5 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (88) hide show

checksums.yaml +7 -0
data/README.rdoc +2 -138
metadata +69 -214
data/LICENSE +0 -20
data/bin/rbbt_config +0 -245
data/install_scripts/classifier/R/classify.R +0 -36
data/install_scripts/classifier/Rakefile +0 -140
data/install_scripts/get_abner.sh +0 -2
data/install_scripts/get_banner.sh +0 -25
data/install_scripts/get_biocreative.sh +0 -72
data/install_scripts/get_crf++.sh +0 -26
data/install_scripts/get_entrez.sh +0 -4
data/install_scripts/get_go.sh +0 -4
data/install_scripts/get_polysearch.sh +0 -8
data/install_scripts/ner/Rakefile +0 -206
data/install_scripts/ner/config/default.rb +0 -52
data/install_scripts/norm/Rakefile +0 -219
data/install_scripts/norm/config/cue_default.rb +0 -10
data/install_scripts/norm/config/tokens_default.rb +0 -86
data/install_scripts/norm/functions.sh +0 -23
data/install_scripts/organisms/Ath.Rakefile +0 -55
data/install_scripts/organisms/Cal.Rakefile +0 -84
data/install_scripts/organisms/Cel.Rakefile +0 -109
data/install_scripts/organisms/Hsa.Rakefile +0 -140
data/install_scripts/organisms/Mmu.Rakefile +0 -77
data/install_scripts/organisms/Rakefile +0 -43
data/install_scripts/organisms/Rno.Rakefile +0 -88
data/install_scripts/organisms/Sce.Rakefile +0 -66
data/install_scripts/organisms/Spo.Rakefile +0 -40
data/install_scripts/organisms/rake-include.rb +0 -252
data/install_scripts/wordlists/consonants +0 -897
data/install_scripts/wordlists/stopwords +0 -1
data/lib/rbbt.rb +0 -83
data/lib/rbbt/bow/bow.rb +0 -88
data/lib/rbbt/bow/classifier.rb +0 -116
data/lib/rbbt/bow/dictionary.rb +0 -187
data/lib/rbbt/ner/abner.rb +0 -34
data/lib/rbbt/ner/banner.rb +0 -73
data/lib/rbbt/ner/dictionaryNER.rb +0 -98
data/lib/rbbt/ner/regexpNER.rb +0 -70
data/lib/rbbt/ner/rner.rb +0 -227
data/lib/rbbt/ner/rnorm.rb +0 -143
data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
data/lib/rbbt/sources/biocreative.rb +0 -75
data/lib/rbbt/sources/biomart.rb +0 -105
data/lib/rbbt/sources/entrez.rb +0 -211
data/lib/rbbt/sources/go.rb +0 -85
data/lib/rbbt/sources/gscholar.rb +0 -74
data/lib/rbbt/sources/organism.rb +0 -241
data/lib/rbbt/sources/polysearch.rb +0 -117
data/lib/rbbt/sources/pubmed.rb +0 -248
data/lib/rbbt/util/arrayHash.rb +0 -266
data/lib/rbbt/util/filecache.rb +0 -72
data/lib/rbbt/util/index.rb +0 -47
data/lib/rbbt/util/misc.rb +0 -106
data/lib/rbbt/util/open.rb +0 -251
data/lib/rbbt/util/rake.rb +0 -183
data/lib/rbbt/util/simpleDSL.rb +0 -87
data/lib/rbbt/util/tmpfile.rb +0 -35
data/tasks/install.rake +0 -124
data/test/rbbt/bow/test_bow.rb +0 -33
data/test/rbbt/bow/test_classifier.rb +0 -72
data/test/rbbt/bow/test_dictionary.rb +0 -91
data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
data/test/rbbt/ner/test_abner.rb +0 -17
data/test/rbbt/ner/test_banner.rb +0 -17
data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
data/test/rbbt/ner/test_regexpNER.rb +0 -33
data/test/rbbt/ner/test_rner.rb +0 -126
data/test/rbbt/ner/test_rnorm.rb +0 -47
data/test/rbbt/sources/test_biocreative.rb +0 -38
data/test/rbbt/sources/test_biomart.rb +0 -31
data/test/rbbt/sources/test_entrez.rb +0 -49
data/test/rbbt/sources/test_go.rb +0 -24
data/test/rbbt/sources/test_organism.rb +0 -59
data/test/rbbt/sources/test_polysearch.rb +0 -27
data/test/rbbt/sources/test_pubmed.rb +0 -39
data/test/rbbt/util/test_arrayHash.rb +0 -257
data/test/rbbt/util/test_filecache.rb +0 -37
data/test/rbbt/util/test_index.rb +0 -31
data/test/rbbt/util/test_misc.rb +0 -20
data/test/rbbt/util/test_open.rb +0 -110
data/test/rbbt/util/test_simpleDSL.rb +0 -57
data/test/rbbt/util/test_tmpfile.rb +0 -21
data/test/test_helper.rb +0 -4
data/test/test_rbbt.rb +0 -11

data/lib/rbbt/sources/biomart.rb DELETED

@@ -1,105 +0,0 @@
-require 'rbbt/util/open'
-require 'rbbt'
-# This module interacts with BioMart. It performs queries to BioMart and
-# synthesises a hash with the results. Note that this module connects to the
-# online BioMart WS using the Open in 'rbbt/util/open' module which offers
-# caching by default. To obtain up to date results you may need to clear the
-# cache from previous queries.
-module BioMart
-  class BioMart::QueryError < StandardError; end
-  private
-  @@biomart_query_xml = <<-EOT
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE Query>
-<Query  virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
-<Dataset name = "<!--DATABASE-->" interface = "default" >
-<!--FILTERS-->
-<!--MAIN-->
-<!--ATTRIBUTES-->
-</Dataset>
-</Query>
-  EOT
-  def self.get(database, main, attrs = nil, filters = nil, data = nil)
-    attrs   ||= []
-    filters ||= ["with_#{main}"]
-    data    ||= {}
-    query = @@biomart_query_xml.clone
-    query.sub!(/<!--DATABASE-->/,database)
-    query.sub!(/<!--FILTERS-->/, filters.collect{|name| "<Filter name = \"#{ name }\" excluded = \"0\"/>"}.join("\n") )
-    query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
-    query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
-    response = Open.read('http://www.biomart.org/biomart/martservice?query=' + query.gsub(/\n/,' '))
-    if response =~ /Query ERROR:/
-      raise BioMart::QueryError, response
-    end
-    response.each_line{|l|
-      parts = l.chomp.split(/\t/)
-      main = parts.shift
-      next if main.nil? || main.empty?
-      data[main] ||= {}
-      attrs.each{|name|
-        value = parts.shift
-        data[main][name] ||= []
-        next if value.nil?
-        data[main][name] << value
-      }
-    }
-    data
-  end
-  public
-  # This method performs a query in biomart for a datasets and a given set of
-  # attributes, there must be a main attribute that will be used as the key in
-  # the result hash, optionally there may be a list of additional attributes
-  # and filters. The data parameter at the end is used internally to
-  # incrementally building the result, due to a limitation of the BioMart WS
-  # that only allows 3 external arguments, users normally should leave it
-  # unspecified or nil. The result is a hash, where the keys are the different
-  # values for the main attribute, and the value is a hash with every other
-  # attribute as key, and as value and array with all possible values (Note
-  # that for a given value of the main attribute, there may be more than one
-  # value for another attribute). If filters is left a nil it adds a filter to
-  # the BioMart query to remove results with the main attribute empty, this may
-  # cause an error if the BioMart WS does not allow filtering with that
-  # attribute.
-  def self.query(database, main, attrs = nil, filters = nil, data = nil)
-    attrs   ||= []
-    data    ||= {}
-    chunks = []
-    chunk = []
-    attrs.each{|a|
-      chunk << a
-      if chunk.length == 2
-        chunks << chunk
-        chunk = []
-      end
-    }
-    chunks << chunk if chunk.any?
-    chunks.each{|chunk|
-      data = get(database,main,chunk, filters, data)
-    }
-    data
-  end
-end

data/lib/rbbt/sources/entrez.rb DELETED

@@ -1,211 +0,0 @@
-require 'rbbt'
-require 'rbbt/util/open'
-require 'rbbt/util/tmpfile'
-require 'rbbt/util/filecache'
-require 'rbbt/bow/bow.rb'
-require 'set'
-# This module is used to parse and extract information from the
-# gene_info file at Entrez Gene, as well as from the gene2pubmed file.
-# Both need to be downloaded and accesible for Rbbt, which is done as
-# part of a normal installation.
-module Entrez
-  class NoFileError < StandardError; end
-  # Given a taxonomy, or set of taxonomies, it returns an inverse hash,
-  # where each key is the entrez id of a gene, and the value is an array
-  # of possible synonyms in other databases. Is mostly used to translate
-  # entrez ids to the native database id of the organism. The parameter
-  # +native+ specifies the position of the key containing synonym, the
-  # fifth by default, +fix+ and +check+ are Procs used, if present, to
-  # pre-process lines and to check if they should be processed.
-  def self.entrez2native(taxs, native = nil, fix = nil, check = nil)
-    raise NoFileError, "Install the Entrez gene_info file" unless File.exists? File.join(Rbbt.datadir, 'dbs/entrez/gene_info')
-    native ||= 5
-    taxs = [taxs] unless taxs.is_a?(Array)
-    taxs = taxs.collect{|t| t.to_s}
-    lexicon = {}
-    tmp = TmpFile.tmp_file("entrez-")
-    system "cat '#{File.join(Rbbt.datadir, 'dbs/entrez/gene_info')}' |grep '^\\(#{taxs.join('\\|')}\\)[[:space:]]' > #{tmp}"
-    File.open(tmp).each{|l|
-      parts = l.chomp.split(/\t/)
-      next if parts[native] == '-'
-      entrez = parts[1]
-      parts[native].split(/\|/).each{|id|
-        id = fix.call(id) if fix
-        next if check && !check.call(id)
-        lexicon[entrez] ||= []
-        lexicon[entrez] << id
-      }
-    }
-    FileUtils.rm tmp
-    lexicon
-  end
-  # For a given taxonomy, or set of taxonomies, it returns a hash with
-  # genes as keys and arrays of related PubMed ids as values, as
-  # extracted from the gene2pubmed file from Entrez Gene.
-  def self.entrez2pubmed(taxs)
-    raise NoFileError, "Install the Entrez gene2pubmed file" unless File.exists? File.join(Rbbt.datadir, 'dbs/entrez/gene2pubmed')
-    taxs = [taxs] unless taxs.is_a?(Array)
-    taxs = taxs.collect{|t| t.to_s}
-    data = {}
-    tmp = TmpFile.tmp_file("entrez-")
-    system "cat '#{File.join(Rbbt.datadir, 'dbs/entrez/gene2pubmed')}' |grep '^\\(#{taxs.join('\\|')}\\)[[:space:]]' > #{tmp}"
-    data = Open.to_hash(tmp, :native => 1, :extra => 2).each{|code, value_lists| value_lists.flatten!}
-    FileUtils.rm tmp
-    data
-  end
-  # This class parses an xml containing the information for a particular
-  # gene as served by Entrez Gene, and hold some of its information.
-  class Gene
-    attr_reader :organism, :symbol, :description, :aka, :protnames, :summary, :comentaries
-    def initialize(xml)
-      return if xml.nil?
-      @organism    = xml.scan(/<Org-ref_taxname>(.*)<\/Org-ref_taxname>/s)
-      @symbol      = xml.scan(/<Gene-ref_locus>(.*)<\/Gene-ref_locus>/s)
-      @description = xml.scan(/<Gene-ref_desc>(.*)<\/Gene-ref_desc>/s)
-      @aka         = xml.scan(/<Gene-ref_syn_E>(.*)<\Gene-ref_syn_E>/s)
-      @protnames   = xml.scan(/<Prot-ref_name_E>(.*)<\/Prot-ref_name_E>/s)
-      @summary     = xml.scan(/<Entrezgene_summary>(.*)<\/Entrezgene_summary>/s)
-      @comentaries = xml.scan(/<Gene-commentary_text>(.*)<\/Gene-commentary_text>/s)
-    end
-    # Joins the text from symbol, description, aka, protnames, and
-    # summary
-    def text
-      #[@organism, @symbol, @description, @aka,  @protnames, @summary,@comentaries.join(". ")].join(". ")
-      [@symbol, @description, @aka,  @protnames, @summary].flatten.join(". ")
-    end
-  end
-  private
-  @@last = Time.now
-  @@entrez_lag = 1
-  def self.get_online(geneids)
-    geneids_list = ( geneids.is_a?(Array) ? geneids.join(',') : geneids.to_s )
-    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{geneids_list}"
-    diff = Time.now - @@last
-    sleep @@entrez_lag - diff unless diff > @@entrez_lag
-    xml = Open.read(url, :quiet => true, :nocache => true)
-    @@last = Time.now
-    genes = xml.scan(/(<Entrezgene>.*?<\/Entrezgene>)/sm).flatten
-    if geneids.is_a? Array
-      list = {}
-      genes.each_with_index{|gene,i|
-        #geneid  = gene.scan(/<Gene-track_geneid>(.*?)<\/Gene-track_geneid>/).flatten.first
-        geneid = geneids[i]
-        list[geneid ] = gene
-      }
-      return list
-    else
-      return genes.first
-    end
-  end
-  public
-  # Build a file name for a gene based on the id. Prefix the id by 'gene-',
-  # substitute the slashes with '_SLASH_', and add a '.xml' extension.
-  def self.gene_filename(id)
-    FileCache.clean_path('gene-' + id.to_s + '.xml')
-  end
-  # Returns a Gene object for the given Entrez Gene id. If an array of
-  # ids is given instead, a hash is returned. This method uses the
-  # caching facilities from Rbbt.
-  def self.get_gene(geneid)
-    return nil if geneid.nil?
-    if Array === geneid
-      missing = []
-      list = {}
-      geneid.each{|p|
-        next if p.nil?
-        filename = gene_filename p
-        if File.exists? FileCache.path(filename)
-          list[p] = Gene.new(Open.read(FileCache.path(filename)))
-        else
-          missing << p
-        end
-      }
-      return list unless missing.any?
-      genes = get_online(missing)
-      genes.each{|p, xml|
-        filename = gene_filename p
-        FileCache.add_file(filename,xml) unless File.exist? FileCache.path(filename)
-        list[p] =  Gene.new(xml)
-      }
-      return list
-    else
-      filename = gene_filename geneid
-      if File.exists? FileCache.path(filename)
-        return Gene.new(Open.read(FileCache.path(filename)))
-      else
-        xml = get_online(geneid)
-        FileCache.add_file(filename,xml)
-        return Gene.new(xml)
-      end
-    end
-  end
-  # Counts the words in common between a chunk of text and the text
-  # found in Entrez Gene for that particular gene. The +gene+ may be a
-  # gene identifier or a Gene class instance.
-  def self.gene_text_similarity(gene, text)
-    case
-    when Entrez::Gene === gene
-      gene_text = gene.text
-    when String === gene || Fixnum === gene
-      gene_text =  get_gene(gene).text
-    else
-      return 0
-    end
-    gene_words = gene_text.words.to_set
-    text_words = text.words.to_set
-    return 0 if gene_words.empty? || text_words.empty?
-    common = gene_words.intersection(text_words)
-    common.length / (gene_words.length + text_words.length).to_f
-  end
-end

data/lib/rbbt/sources/go.rb DELETED

@@ -1,85 +0,0 @@
-require 'rbbt'
-# This module holds helper methods to deal with the Gene Ontology files. Right
-# now all it does is provide a translation form id to the actual names.
-module GO
-  @@info = nil
-  MULTIPLE_VALUE_FIELDS = %w(is_a)
-  # This method needs to be called before any translations can be made, it is
-  # called automatically the first time the id2name method is called. It loads
-  # the gene_ontology.obo file and extracts all the fields, although right now,
-  # only the name field is used.
-  def self.init
-    @@info = {}
-    File.open(File.join(Rbbt.datadir, 'dbs/go/gene_ontology.obo')).read.
-      split(/\[Term\]/).
-      each{|term|
-        term_info = {}
-        term.split(/\n/).
-          select{|l| l =~ /:/}.
-          each{|l|
-            key, value = l.chomp.match(/(.*?):(.*)/).values_at(1,2)
-            if MULTIPLE_VALUE_FIELDS.include? key.strip
-              term_info[key.strip] ||= []
-              term_info[key.strip] << value.strip
-            else
-              term_info[key.strip] = value.strip
-            end
-          }
-        @@info[term_info["id"]] = term_info
-    }
-  end
-  def self.info
-    self.init unless @@info
-    @@info
-  end
-  def self.goterms
-    self.init unless @@info
-    @@info.keys
-  end
-  def self.id2name(id)
-    self.init unless @@info
-    if id.kind_of? Array
-      @@info.values_at(*id).collect{|i| i['name'] if i}
-    else
-      return nil if @@info[id].nil?
-      @@info[id]['name']
-    end
-  end
-  def self.id2ancestors(id)
-    self.init unless @@info
-    if id.kind_of? Array
-      @@info.values_at(*id).
-        select{|i| ! i['is_a'].nil?}.
-        collect{|i| i['is_a'].collect{|id|
-          id.match(/(GO:\d+)/)[1] if id.match(/(GO:\d+)/)
-        }.compact
-      }
-    else
-      return [] if @@info[id].nil? || @@info[id]['is_a'].nil?
-      @@info[id]['is_a'].
-        collect{|id|
-        id.match(/(GO:\d+)/)[1] if id.match(/(GO:\d+)/)
-      }.compact
-    end
-  end
-  def self.id2namespace(id)
-    self.init unless @@info
-    if id.kind_of? Array
-      @@info.values_at(*id).collect{|i| i['namespace'] if i}
-    else
-      return nil if @@info[id].nil?
-      @@info[id]['namespace']
-    end
-  end
-end

data/lib/rbbt/sources/gscholar.rb DELETED

@@ -1,74 +0,0 @@
-require 'cgi'
-require 'mechanize'
-module GoogleScholar
-  def self.user_agent
-    @@a ||= Mechanize.new
-  end
-  def self.citation_link(title)
-    citation_link = nil
-    # Get citation page
-    user_agent.get("http://scholar.google.es/scholar?q='#{ title }'&hl=es&lr=&lr=") do |page|
-      article = page.search('div[@class=gs_r]').first
-      return nil if article.nil?
-      return article.search('a').select{|link| link['href'] =~ /scholar\?cites/ && link.inner_html =~ /\d+$/ }.first
-    end
-  end
-  def self.full_text_url(title)
-    full_text_link = nil
-    # Get page
-    user_agent.get("http://scholar.google.es/scholar?q='#{ title }'&hl=es&lr=&lr=") do |page|
-      article = page.search('div[@class=gs_r]').first
-      return nil if article.nil?
-      link =  article.search('a').select{ |link|
-        link['href'] =~ /\.pdf$/ || link['href'] =~ /type=pdf/
-      }.first
-      return nil if link.nil?
-      return link['href']
-    end
-  end
-  def self.number_cites(title)
-    link = citation_link title
-    return 0 if link.nil?
-    link.inner_html =~ /(\d+)$/
-    return $1.to_i
-  end
-end
-#def get_citers(title)
-#  puts title
-#  citation_link = nil
-#
-#  # Get citation page
-#  $a.get("http://scholar.google.es/scholar?q='#{ title }'&hl=es&lr=&lr=") do |page|
-#    citation_link = page.search('div[@class=gs_r]').first.search('a').select{|link| link['href'] =~ /scholar\?cites/ && link.inner_html =~ /\d+$/ }.first
-#  end
-#
-#  return [] if citation_link.nil?
-#
-#  # Parse citations
-#
-#  citers = []
-#  $a.get("http://scholar.google.es" + citation_link['href']) do |page|
-#    citers = page.search('div[@class=gs_r]').collect do |entry|
-#      entry.search('h3').first.search('a').first.inner_html
-#    end
-#  end
-#
-#  return citers
-#end