RubyGems - rbbt - Versions diffs - 1.0.0 - Mend

rbbt 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

data/LICENSE +20 -0
data/README.rdoc +17 -0
data/bin/rbbt_config +180 -0
data/install_scripts/classifier/R/classify.R +36 -0
data/install_scripts/classifier/Rakefile +140 -0
data/install_scripts/get_abner.sh +2 -0
data/install_scripts/get_banner.sh +25 -0
data/install_scripts/get_biocreative.sh +72 -0
data/install_scripts/get_crf++.sh +26 -0
data/install_scripts/get_entrez.sh +4 -0
data/install_scripts/get_go.sh +4 -0
data/install_scripts/get_polysearch.sh +8 -0
data/install_scripts/ner/Rakefile +206 -0
data/install_scripts/ner/config/default.rb +52 -0
data/install_scripts/norm/Rakefile +218 -0
data/install_scripts/norm/config/cue_default.rb +10 -0
data/install_scripts/norm/config/tokens_default.rb +79 -0
data/install_scripts/norm/functions.sh +21 -0
data/install_scripts/organisms/Rakefile +25 -0
data/install_scripts/organisms/cgd.Rakefile +84 -0
data/install_scripts/organisms/human.Rakefile +145 -0
data/install_scripts/organisms/mgi.Rakefile +77 -0
data/install_scripts/organisms/pombe.Rakefile +40 -0
data/install_scripts/organisms/rake-include.rb +258 -0
data/install_scripts/organisms/rgd.Rakefile +88 -0
data/install_scripts/organisms/sgd.Rakefile +66 -0
data/install_scripts/organisms/tair.Rakefile +54 -0
data/install_scripts/organisms/worm.Rakefile +109 -0
data/install_scripts/stopwords +1 -0
data/install_scripts/wordlists/consonants +897 -0
data/install_scripts/wordlists/stopwords +1 -0
data/lib/rbbt/bow/bow.rb +87 -0
data/lib/rbbt/bow/classifier.rb +118 -0
data/lib/rbbt/bow/dictionary.rb +218 -0
data/lib/rbbt/ner/abner.rb +34 -0
data/lib/rbbt/ner/banner.rb +73 -0
data/lib/rbbt/ner/regexpNER.rb +62 -0
data/lib/rbbt/ner/rner.rb +227 -0
data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
data/lib/rbbt/ner/rnorm/tokens.rb +213 -0
data/lib/rbbt/ner/rnorm.rb +142 -0
data/lib/rbbt/sources/biocreative.rb +75 -0
data/lib/rbbt/sources/biomart.rb +106 -0
data/lib/rbbt/sources/entrez.rb +211 -0
data/lib/rbbt/sources/go.rb +40 -0
data/lib/rbbt/sources/organism.rb +197 -0
data/lib/rbbt/sources/polysearch.rb +88 -0
data/lib/rbbt/sources/pubmed.rb +111 -0
data/lib/rbbt/util/arrayHash.rb +255 -0
data/lib/rbbt/util/filecache.rb +72 -0
data/lib/rbbt/util/index.rb +69 -0
data/lib/rbbt/util/misc.rb +101 -0
data/lib/rbbt/util/open.rb +207 -0
data/lib/rbbt/util/simpleDSL.rb +87 -0
data/lib/rbbt/util/tmpfile.rb +19 -0
data/lib/rbbt/version.rb +10 -0
data/lib/rbbt.rb +86 -0
data/tasks/install.rake +123 -0
metadata +114 -0

data/lib/rbbt/sources/biocreative.rb ADDED Viewed

@@ -0,0 +1,75 @@
+require 'rbbt'
+require 'rbbt/util/open'
+# Offers methods to help deal with the files distributed for the BioCreative
+# competition related to Gene Mention and Normalization.
+module Biocreative
+  # Read the files regarding the dataset and return a hash with the entry codes
+  # as keys and as values a hash with :text and the :mentions for that entry
+  def self.BC2GM(dataset)
+    data = {}
+    Open.read(File.join(Rbbt.datadir,"biocreative/BC2GM/#{dataset}/#{dataset}.in")).each{|l|
+      code, text = l.chomp.match(/(.*?) (.*)/).values_at(1,2)
+      data[code] ={ :text => text }
+    }
+    Open.read(File.join(Rbbt.datadir,"biocreative/BC2GM/#{dataset}/GENE.eval")).each{|l|
+      code, pos, mention = l.chomp.split(/\|/)
+      data[code] ||= {}
+      data[code][:mentions] ||= []
+      data[code][:mentions].push(mention)
+    }
+    data
+  end
+  # Given a string of text and a string with a mention, return positions for
+  # that mention in the format used in the evaluation.
+  def self.position(text, mention)
+    re = mention.gsub(/\W+/,' ')
+    re = Regexp.quote(re)
+    re = re.gsub(/\\ /,'\W*')
+    re = '\(?' + re if mention =~ /\)/
+    re = re + '\)?' if mention =~ /\(/
+    re = "'?" + re + "'?" if mention =~ /'/
+    positions = []
+    offset = 0
+    while text.match(/(.*?)(#{re})(.*)/s)
+      pre, mention, post = text.match(/(.*?)(#{re})(.*)/s).values_at(1,2,3)
+      start                     = offset  + pre.gsub(/\s/,'').length
+      last                      = offset  + pre.gsub(/\s/,'').length + mention.gsub(/\s/,'').length - 1
+      positions << [start, last]
+      offset                    = last + 1
+      text                      = post
+      end
+    return positions
+  end
+  # Run the evaluation perl script
+  def self.BC2GM_eval(results, dataset, outfile)
+    cmd = "/usr/bin/perl #{File.join(Rbbt.datadir, 'biocreative/BC2GM/alt_eval.perl')}\
+                         -gene #{File.join(Rbbt.datadir, "biocreative/BC2GM/#{dataset}/GENE.eval")}\
+                         -altgene #{File.join(Rbbt.datadir, "biocreative/BC2GM/#{dataset}/ALTGENE.eval")}\
+                          #{results} > #{outfile}"
+    system cmd
+  end
+end

data/lib/rbbt/sources/biomart.rb ADDED Viewed

@@ -0,0 +1,106 @@
+require 'rbbt/util/open'
+require 'rbbt'
+# This module interacts with BioMart. It performs queries to BioMart and
+# synthesises a hash with the results. Note that this module connects to the
+# online BioMart WS using the Open in 'rbbt/util/open' module which offers
+# caching by default. To obtain up to date results you may need to clear the
+# cache from previous queries.
+module BioMart
+  class BioMart::QueryError < StandardError; end
+  private
+  @@biomart_query_xml = <<-EOT
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE Query>
+<Query  virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
+<Dataset name = "<!--DATABASE-->" interface = "default" >
+<!--FILTERS-->
+<!--MAIN-->
+<!--ATTRIBUTES-->
+</Dataset>
+</Query>
+  EOT
+  def self.get(database, main, attrs = nil, filters = nil, data = nil)
+    attrs   ||= []
+    filters ||= ["with_#{main}"]
+    data    ||= {}
+    query = @@biomart_query_xml.clone
+    query.sub!(/<!--DATABASE-->/,database)
+    query.sub!(/<!--FILTERS-->/, filters.collect{|name| "<Filter name = \"#{ name }\" excluded = \"0\"/>"}.join("\n") )
+    query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
+    query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
+    rows = Open.read('http://www.biomart.org/biomart/martservice?query=' + query.gsub(/\n/,' '))
+    if rows =~ /Query ERROR:/
+      raise BioMart::QueryError, rows
+    end
+    rows.each{|l|
+      parts = l.chomp.split(/\t/)
+      main = parts.shift
+      next if main.nil? || main.empty?
+      data[main] ||= {}
+      attrs.each{|name|
+        value = parts.shift
+        data[main][name] ||= []
+        next if value.nil?
+        data[main][name] << value
+      }
+    }
+    data
+  end
+  public
+  # This method performs a query in biomart for a datasets and a given set of
+  # attributes, there must be a main attribute that will be used as the key in
+  # the result hash, optionally there may be a list of additional attributes
+  # and filters. The data parameter at the end is used internally to
+  # incrementally building the result, due to a limitation of the BioMart WS
+  # that only allows 3 external arguments, users normally should leave it
+  # unspecified or nil. The result is a hash, where the keys are the different
+  # values for the main attribute, and the value is a hash with every other
+  # attribute as key, and as value and array with all possible values (Note
+  # that for a given value of the main attribute, there may be more than one
+  # value for another attribute). If filters is left a nil it adds a filter to
+  # the BioMart query to remove results with the main attribute empty, this may
+  # cause an error if the BioMart WS does not allow filtering with that
+  # attribute.
+  def self.query(database, main, attrs = nil, filters = nil, data = nil)
+    attrs   ||= []
+    data    ||= {}
+    chunks = []
+    chunk = []
+    attrs.each{|a|
+      chunk << a
+      if chunk.length == 2
+        chunks << chunk
+        chunk = []
+      end
+    }
+    chunks << chunk if chunk.any?
+    chunks.each{|chunk|
+      data = get(database,main,chunk, filters, data)
+    }
+    data
+  end
+end

data/lib/rbbt/sources/entrez.rb ADDED Viewed

@@ -0,0 +1,211 @@
+require 'rbbt'
+require 'rbbt/util/open'
+require 'rbbt/util/tmpfile'
+require 'rbbt/util/filecache'
+require 'rbbt/bow/bow.rb'
+require 'set'
+# This module is used to parse and extract information from the
+# gene_info file at Entrez Gene, as well as from the gene2pubmed file.
+# Both need to be downloaded and accessible for Rbbt, which is done as
+# part of a normal installation.
+module Entrez
+  class NoFileError < StandardError; end
+  # Given a taxonomy, or set of taxonomies, it returns an inverse hash,
+  # where each key is the entrez id of a gene, and the value is an array
+  # of possible synonyms in other databases. Is mostly used to translate
+  # entrez ids to the native database id of the organism. The parameter
+  # +native+ specifies the position of the key containing synonym, the
+  # fifth by default, +fix+ and +check+ are Procs used, if present, to
+  # pre-process lines and to check if they should be processed.
+  def self.entrez2native(taxs, native = nil, fix = nil, check = nil)
+    raise NoFileError, "Install the Entrez gene_info file" unless File.exists? File.join(Rbbt.datadir, 'dbs/entrez/gene_info')
+    native ||= 5
+    taxs = [taxs] unless taxs.is_a?(Array)
+    taxs = taxs.collect{|t| t.to_s}
+    lexicon = {}
+    tmp = TmpFile.tmp_file("entrez-")
+    system "cat '#{File.join(Rbbt.datadir, 'dbs/entrez/gene_info')}' |grep '^\\(#{taxs.join('\\|')}\\)[[:space:]]' > #{tmp}"
+    File.open(tmp).each{|l|
+      parts = l.chomp.split(/\t/)
+      next if parts[native] == '-'
+      entrez = parts[1]
+      parts[native].split(/\|/).each{|id|
+        id = fix.call(id) if fix
+        next if check && !check.call(id)
+        lexicon[entrez] ||= []
+        lexicon[entrez] << id
+      }
+    }
+    FileUtils.rm tmp
+    lexicon
+  end
+  # For a given taxonomy, or set of taxonomies, it returns a hash with
+  # genes as keys and arrays of related PubMed ids as values, as
+  # extracted from the gene2pubmed file from Entrez Gene.
+  def self.entrez2pubmed(taxs)
+    raise NoFileError, "Install the Entrez gene2pubmed file" unless File.exists? File.join(Rbbt.datadir, 'dbs/entrez/gene2pubmed')
+    taxs = [taxs] unless taxs.is_a?(Array)
+    taxs = taxs.collect{|t| t.to_s}
+    data = {}
+    tmp = TmpFile.tmp_file("entrez-")
+    system "cat '#{File.join(Rbbt.datadir, 'dbs/entrez/gene2pubmed')}' |grep '^\\(#{taxs.join('\\|')}\\)[[:space:]]' > #{tmp}"
+    data = Open.to_hash(tmp, :native => 1, :extra => 2).each{|code, value_lists| value_lists.flatten!}
+    FileUtils.rm tmp
+    data
+  end
+  # This class parses an xml containing the information for a particular
+  # gene as served by Entrez Gene, and hold some of its information.
+  class Gene
+    attr_reader :organism, :symbol, :description, :aka, :protnames, :summary, :comentaries
+    def initialize(xml)
+      return if xml.nil?
+      @organism    = xml.scan(/<Org-ref_taxname>(.*)<\/Org-ref_taxname>/s)
+      @symbol      = xml.scan(/<Gene-ref_locus>(.*)<\/Gene-ref_locus>/s)
+      @description = xml.scan(/<Gene-ref_desc>(.*)<\/Gene-ref_desc>/s)
+      @aka         = xml.scan(/<Gene-ref_syn_E>(.*)<\Gene-ref_syn_E>/s)
+      @protnames   = xml.scan(/<Prot-ref_name_E>(.*)<\/Prot-ref_name_E>/s)
+      @summary     = xml.scan(/<Entrezgene_summary>(.*)<\/Entrezgene_summary>/s)
+      @comentaries = xml.scan(/<Gene-commentary_text>(.*)<\/Gene-commentary_text>/s)
+    end
+    # Joins the text from symbol, description, aka, protnames, and
+    # summary
+    def text
+      #[@organism, @symbol, @description, @aka,  @protnames, @summary,@comentaries.join(". ")].join(". ")
+      [@symbol, @description, @aka,  @protnames, @summary].flatten.join(". ")
+    end
+  end
+  private
+  @@last = Time.now
+  @@entrez_lag = 1
+  def self.get_online(geneids)
+    geneids_list = ( geneids.is_a?(Array) ? geneids.join(',') : geneids.to_s )
+    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{geneids_list}"
+    diff = Time.now - @@last
+    sleep @@entrez_lag - diff unless diff > @@entrez_lag
+    xml = Open.read(url, :quiet => true, :nocache => true)
+    @@last = Time.now
+    genes = xml.scan(/(<Entrezgene>.*?<\/Entrezgene>)/sm).flatten
+    if geneids.is_a? Array
+      list = {}
+      genes.each_with_index{|gene,i|
+        #geneid  = gene.scan(/<Gene-track_geneid>(.*?)<\/Gene-track_geneid>/).flatten.first
+        geneid = geneids[i]
+        list[geneid ] = gene
+      }
+      return list
+    else
+      return genes.first
+    end
+  end
+  public
+  # Build a file name for a gene based on the id. Prefix the id by 'gene-',
+  # substitute the slashes with '_SLASH_', and add a '.xml' extension.
+  def self.gene_filename(id)
+    FileCache.clean_path('gene-' + id.to_s + '.xml')
+  end
+  # Returns a Gene object for the given Entrez Gene id. If an array of
+  # ids is given instead, a hash is returned. This method uses the
+  # caching facilities from Rbbt.
+  def self.get_gene(geneid)
+    return nil if geneid.nil?
+    if Array === geneid
+      missing = []
+      list = {}
+      geneid.each{|p|
+        next if p.nil?
+        filename = gene_filename p
+        if File.exists? FileCache.path(filename)
+          list[p] = Gene.new(Open.read(FileCache.path(filename)))
+        else
+          missing << p
+        end
+      }
+      return list unless missing.any?
+      genes = get_online(missing)
+      genes.each{|p, xml|
+        filename = gene_filename p
+        FileCache.add_file(filename,xml) unless File.exist? FileCache.path(filename)
+        list[p] =  Gene.new(xml)
+      }
+      return list
+    else
+      filename = gene_filename geneid
+      if File.exists? FileCache.path(filename)
+        return Gene.new(Open.read(FileCache.path(filename)))
+      else
+        xml = get_online(geneid)
+        FileCache.add_file(filename,xml)
+        return Gene.new(xml)
+      end
+    end
+  end
+  # Counts the words in common between a chunk of text and the text
+  # found in Entrez Gene for that particular gene. The +gene+ may be a
+  # gene identifier or a Gene class instance.
+  def self.gene_text_similarity(gene, text)
+    case
+    when Entrez::Gene === gene
+      gene_text = gene.text
+    when String === gene || Fixnum === gene
+      gene_text =  get_gene(gene).text
+    else
+      return 0
+    end
+    gene_words = gene_text.words.to_set
+    text_words = text.words.to_set
+    return 0 if gene_words.empty? || text_words.empty?
+    common = gene_words.intersection(text_words)
+    common.length / (gene_words.length + text_words.length).to_f
+  end
+end

data/lib/rbbt/sources/go.rb ADDED Viewed

@@ -0,0 +1,40 @@
+require 'rbbt'
+# This module holds helper methods to deal with the Gene Ontology files. Right
+# now all it does is provide a translation from id to the actual names.
+module GO
+  @@info = nil
+  # This method needs to be called before any translations can be made, it is
+  # called automatically the first time the id2name method is called. It loads
+  # the gene_ontology.obo file and extracts all the fields, although right now,
+  # only the name field is used.
+  def self.init
+    @@info = {}
+    File.open(File.join(Rbbt.datadir, 'dbs/go/gene_ontology.obo')).read.
+      split(/\[Term\]/).
+      each{|term|
+        term_info = {}
+        term.split(/\n/).
+          select{|l| l =~ /:/}.
+          each{|l|
+            key, value = l.chomp.match(/(.*?):(.*)/).values_at(1,2)
+            term_info[key.strip] = value.strip
+          }
+        @@info[term_info["id"]] = term_info
+      }
+  end
+  def self.id2name(id)
+    self.init unless @@info
+    if id.kind_of? Array
+      @@info.values_at(*id).collect{|i| i['name'] if i}
+    else
+      return "Name not found" unless @@info[id]
+      @@info[id]['name']
+    end
+  end
+end

data/lib/rbbt/sources/organism.rb ADDED Viewed

@@ -0,0 +1,197 @@
+require 'rbbt'
+require 'rbbt/ner/rnorm'
+require 'rbbt/util/open'
+module Organism
+  class OrganismNotProcessedError < StandardError; end
+  def self.all
+    Dir.glob(File.join(Rbbt.datadir,'/organisms/') + '/*/name').collect{|f| File.basename(File.dirname(f))}
+  end
+  def self.name(org)
+    Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/name"))
+  end
+  NAME2ORG = {}
+  Organism::all.each{|org|
+    name = Organism.name(org).strip.downcase
+    NAME2ORG[name] = org
+  }
+  def self.name2org(name)
+    NAME2ORG[name.strip.downcase]
+  end
+  def self.id_formats(org)
+    id_types = {}
+    formats = supported_ids(org)
+    lines = Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers")).collect
+    lines.each{|l|
+      ids_per_type = l.split(/\t/)
+      formats.zip(ids_per_type).each{|p|
+        format = p[0]
+        ids = p[1].split(/\|/)
+        ids.each{|id|
+          next if id.nil? || id == ""
+          id_types[id.downcase] ||= []
+          id_types[id.downcase] << format unless id_types[id.downcase].include? format
+        }
+      }
+    }
+    return id_types
+  end
+  def self.guessIdFormat(formats, query)
+    query = query.compact.collect{|gene| gene.downcase}.uniq
+    if String === formats
+      formats = id_formats(formats)
+    end
+    return nil if formats.values.empty?
+    values = formats.values_at(*query)
+    return nil if values.empty?
+    format_count = {}
+    values.compact.collect{|types| types.uniq}.flatten.each{|f|
+      format_count[f] ||= 0
+      format_count[f] += 1
+    }
+    return nil if format_count.values.empty?
+    format_count.select{|k,v| v > (query.length / 10)}.sort{|a,b| b[1] <=> a[1]}.first
+  end
+  def self.ner(org, type=:abner, options = {})
+    case type.to_sym
+    when :abner
+      require 'rbbt/ner/abner'
+      return Abner.new
+    when :banner
+      require 'rbbt/ner/banner'
+      return Banner.new
+    when :rner
+      require 'rbbt/ner/rner'
+      model = options[:model]
+      model ||= File.join(Rbbt.datadir,"ner/model/#{ org }") if File.exist? File.join(Rbbt.datadir,"ner/model/#{ org }")
+      model ||= File.join(Rbbt.datadir,'ner/model/BC2')
+      return NER.new(model)
+    else
+      raise "Ner type (#{ type }) unknown"
+    end
+  end
+  def self.norm(org, to_entrez = nil)
+    if to_entrez.nil?
+      to_entrez = id_index(org, :native => 'Entrez Gene ID', :other => [supported_ids(org).first])
+    end
+    token_file = File.join(Rbbt.datadir, 'norm','config',org.to_s + '.config')
+    if !File.exists? token_file
+      token_file = nil
+    end
+    Normalizer.new(File.join(Rbbt.datadir,"organisms/#{ org }/lexicon"), :to_entrez => to_entrez, :file => token_file, :max_candidates => 20)
+  end
+  def self.lexicon(org, options = {})
+    options[:sep] = "\t|\\|" unless options[:sep]
+    Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/lexicon"),options)
+  end
+  def self.goterms(org)
+    goterms = {}
+    Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/gene.go")).each{|l|
+      gene, go = l.chomp.split(/\t/)
+      goterms[gene.strip] ||= []
+      goterms[gene.strip] << go.strip
+    }
+    goterms
+  end
+  def self.literature(org)
+    Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/all.pmid")).collect{|l| l.chomp.scan(/\d+/)}.flatten
+  end
+  def self.gene_literature(org)
+    Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/gene.pmid"), :flatten => true)
+  end
+  def self.gene_literature_go(org)
+    Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/gene_go.pmid"), :flatten => true)
+  end
+  def self.supported_ids(org, options = {})
+    formats  = []
+    examples = [] if options[:examples]
+    i= 0
+    Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers")).each{|l|
+      if i == 0
+        i += 1
+        next unless l=~/^\s*#/
+          formats  = l.chomp.sub(/^[\s#]+/,'').split(/\t/).collect{|n| n.strip}
+        return formats unless examples
+        next
+      end
+      if l.chomp.split(/\t/).select{|name| name && name =~ /\w/}.length > examples.length
+        examples = l.chomp.split(/\t/).collect{|name| name.split(/\|/).first}
+      end
+      i += 1
+    }
+    formats.zip(examples)
+  end
+  def self.id_position(supported_ids, id_name, options = {})
+    pos = 0
+    supported_ids.each_with_index{|id, i|
+      if id.strip == id_name.strip || !options[:case_sensitive] && id.strip.downcase == id_name.strip.downcase
+        pos = i;
+      end
+    }
+    pos
+  end
+  def self.id_index(org, option = {})
+    native = option[:native]
+    other  = option[:other]
+    option[:case_sensitive] = false if option[:case_sensitive].nil?
+    if native.nil? and other.nil?
+      Index.index(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers"), option)
+    else
+      supported = Organism.supported_ids(org)
+      first = nil
+      if native
+        first = id_position(supported,native,option)
+      else
+        first = 0
+      end
+      rest = nil
+      if other
+        rest = other.collect{|name| id_position(supported,name, option)}
+      else
+        rest = (0..supported.length - 1).to_a - [first]
+      end
+      option[:native] = first
+      option[:extra] = rest
+      index = Index.index(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers"), option)
+      index
+    end
+  end
+end