rbbt 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +20 -0
- data/README.rdoc +17 -0
- data/bin/rbbt_config +180 -0
- data/install_scripts/classifier/R/classify.R +36 -0
- data/install_scripts/classifier/Rakefile +140 -0
- data/install_scripts/get_abner.sh +2 -0
- data/install_scripts/get_banner.sh +25 -0
- data/install_scripts/get_biocreative.sh +72 -0
- data/install_scripts/get_crf++.sh +26 -0
- data/install_scripts/get_entrez.sh +4 -0
- data/install_scripts/get_go.sh +4 -0
- data/install_scripts/get_polysearch.sh +8 -0
- data/install_scripts/ner/Rakefile +206 -0
- data/install_scripts/ner/config/default.rb +52 -0
- data/install_scripts/norm/Rakefile +218 -0
- data/install_scripts/norm/config/cue_default.rb +10 -0
- data/install_scripts/norm/config/tokens_default.rb +79 -0
- data/install_scripts/norm/functions.sh +21 -0
- data/install_scripts/organisms/Rakefile +25 -0
- data/install_scripts/organisms/cgd.Rakefile +84 -0
- data/install_scripts/organisms/human.Rakefile +145 -0
- data/install_scripts/organisms/mgi.Rakefile +77 -0
- data/install_scripts/organisms/pombe.Rakefile +40 -0
- data/install_scripts/organisms/rake-include.rb +258 -0
- data/install_scripts/organisms/rgd.Rakefile +88 -0
- data/install_scripts/organisms/sgd.Rakefile +66 -0
- data/install_scripts/organisms/tair.Rakefile +54 -0
- data/install_scripts/organisms/worm.Rakefile +109 -0
- data/install_scripts/stopwords +1 -0
- data/install_scripts/wordlists/consonants +897 -0
- data/install_scripts/wordlists/stopwords +1 -0
- data/lib/rbbt/bow/bow.rb +87 -0
- data/lib/rbbt/bow/classifier.rb +118 -0
- data/lib/rbbt/bow/dictionary.rb +218 -0
- data/lib/rbbt/ner/abner.rb +34 -0
- data/lib/rbbt/ner/banner.rb +73 -0
- data/lib/rbbt/ner/regexpNER.rb +62 -0
- data/lib/rbbt/ner/rner.rb +227 -0
- data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
- data/lib/rbbt/ner/rnorm/tokens.rb +213 -0
- data/lib/rbbt/ner/rnorm.rb +142 -0
- data/lib/rbbt/sources/biocreative.rb +75 -0
- data/lib/rbbt/sources/biomart.rb +106 -0
- data/lib/rbbt/sources/entrez.rb +211 -0
- data/lib/rbbt/sources/go.rb +40 -0
- data/lib/rbbt/sources/organism.rb +197 -0
- data/lib/rbbt/sources/polysearch.rb +88 -0
- data/lib/rbbt/sources/pubmed.rb +111 -0
- data/lib/rbbt/util/arrayHash.rb +255 -0
- data/lib/rbbt/util/filecache.rb +72 -0
- data/lib/rbbt/util/index.rb +69 -0
- data/lib/rbbt/util/misc.rb +101 -0
- data/lib/rbbt/util/open.rb +207 -0
- data/lib/rbbt/util/simpleDSL.rb +87 -0
- data/lib/rbbt/util/tmpfile.rb +19 -0
- data/lib/rbbt/version.rb +10 -0
- data/lib/rbbt.rb +86 -0
- data/tasks/install.rake +123 -0
- metadata +114 -0
data/lib/rbbt/sources/polysearch.rb
@@ -0,0 +1,88 @@
require 'rbbt'
require 'rbbt/util/open'
require 'rbbt/ner/regexpNER'

# Find terms in the Polysearch thesauri using simple regular expression
# matching. Note that the first time the methods are used the corresponding
# thesaurus is loaded into memory. The available thesauri are: disease, drug,
# metabolite, organ, subcellular (subcellular localization) and tissue.
module Polysearch

  @@names = {}
  def self.type_names(type) #:nodoc:
    @@names[type] ||= Open.to_hash(File.join(Rbbt.datadir,'dbs','polysearch',type + '.txt'), :single => true)
  end

  @@indexes = {}
  def self.type_index(type) #:nodoc:
    @@indexes[type] ||= RegExpNER.new(File.join(Rbbt.datadir,'dbs','polysearch',type + '.txt'))
  end

  # Find matches in a string of text. The types array specifies which thesauri
  # to use; if it is nil, all of them are used.
  def self.match(text, types = nil)
    if types.nil?
      types = %w(disease drug metabolite organ subcellular tissue)
    end

    types = [types] unless Array === types
    types = types.sort

    matches = {}
    types.collect{|type|
      matches.merge!(type_index(type).match_hash(text))
    }

    matches
  end

  # Transform the code into a name, type is the thesaurus to use
  def self.name(type, code)
    type_names(type)[code]
  end

end

if __FILE__ == $0

  text =<<-EOT

    Background Microorganisms adapt their transcriptome by integrating
    multiple chemical and physical signals from their environment. Shake-flask
    cultivation does not allow precise manipulation of individual culture
    parameters and therefore precludes a quantitative analysis of the
    (combinatorial) influence of these parameters on transcriptional
    regulation. Steady-state chemostat cultures, which do enable accurate
    control, measurement and manipulation of individual cultivation parameters
    (e.g. specific growth rate, temperature, identity of the growth-limiting
    nutrient) appear to provide a promising experimental platform for such a
    combinatorial analysis. Results A microarray compendium of 170
    steady-state chemostat cultures of the yeast Saccharomyces cerevisiae is
    presented and analyzed. The 170 microarrays encompass 55 unique
    conditions, which can be characterized by the combined settings of 10
    different cultivation parameters. By applying a regression model to assess
    the impact of (combinations of) cultivation parameters on the
    transcriptome, most S. cerevisiae genes were shown to be influenced by
    multiple cultivation parameters, and in many cases by combinatorial
    effects of cultivation parameters. The inclusion of these combinatorial
    effects in the regression model led to higher explained variance of the
    gene expression patterns and resulted in higher function enrichment in
    subsequent analysis. We further demonstrate the usefulness of the
    compendium and regression analysis for interpretation of shake-flask-based
    transcriptome studies and for guiding functional analysis of
    (uncharacterized) genes and pathways. Conclusions Modeling the
    combinatorial effects of environmental parameters on the transcriptome is
    crucial for understanding transcriptional regulation. Chemostat
    cultivation offers a powerful tool for such an approach. Keywords:
    chemostat steady state samples
    Cerebellar
    stroke syndrome

  EOT

  p Polysearch.match(text,'disease').values.flatten

end
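Example (not part of the package diff): a minimal usage sketch for the module above. It assumes the Polysearch thesauri have already been installed under Rbbt.datadir/dbs/polysearch (see get_polysearch.sh in the listing); the input text is made up, and the shape of the returned hash (thesaurus codes mapping to the matched strings) is inferred from the self-test at the end of the file.

require 'rbbt/sources/polysearch'

text = "Patients with cerebellar stroke syndrome were treated with aspirin."

# Restrict matching to the disease and drug thesauri
matches = Polysearch.match(text, %w(disease drug))

matches.each do |code, terms|
  # Polysearch.name translates a thesaurus code back into a readable name
  name = Polysearch.name('disease', code) || Polysearch.name('drug', code)
  puts "#{code}: #{terms.inspect} (#{name})"
end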
data/lib/rbbt/sources/pubmed.rb
@@ -0,0 +1,111 @@
require 'rbbt/util/filecache'
require 'rbbt/util/open'
require 'rbbt'

# This module offers an interface with PubMed, to perform queries, and
# retrieve simple information from articles. It uses the caching
# services of Rbbt.
module PubMed

  private
  @@last = Time.now
  @@pubmed_lag = 1
  def self.get_online(pmids)

    pmid_list = ( pmids.is_a?(Array) ? pmids.join(',') : pmids.to_s )
    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list}"

    diff = Time.now - @@last
    sleep @@pubmed_lag - diff unless diff > @@pubmed_lag

    xml = Open.read(url, :quiet => true, :nocache => true)

    @@last = Time.now

    articles = xml.scan(/(<PubmedArticle>.*?<\/PubmedArticle>)/sm).flatten

    if pmids.is_a? Array
      list = {}
      articles.each{|article|
        pmid = article.scan(/<PMID>(.*?)<\/PMID>/).flatten.first
        list[pmid] = article
      }
      return list
    else
      return articles.first
    end

  end

  public

  # Processes the xml of an article as served by MedLine and extracts
  # the abstract, title and journal information
  class Article
    attr_reader :title, :abstract, :journal
    def initialize(xml)
      xml ||= ""
      @abstract = $1 if xml.match(/<AbstractText>(.*)<\/AbstractText>/sm)
      @title    = $1 if xml.match(/<ArticleTitle>(.*)<\/ArticleTitle>/sm)
      @journal  = $1 if xml.match(/<Title>(.*)<\/Title>/sm)
    end

    # Join the text from title and abstract
    def text
      [@title, @abstract].join("\n")
    end
  end

  # Returns the Article object containing the information for the PubMed
  # ID specified as an argument. If +pmid+ is an array instead of a single
  # identifier it returns a hash with the Article object for each id.
  # It uses the Rbbt cache to save the articles xml.
  def self.get_article(pmid)

    if pmid.is_a? Array
      missing = []
      list = {}

      pmid.each{|p|
        filename = p.to_s + '.xml'
        if File.exists? FileCache.path(filename)
          list[p] = Article.new(Open.read(FileCache.path(filename)))
        else
          missing << p
        end
      }

      return list unless missing.any?
      articles = get_online(missing)

      articles.each{|p, xml|
        filename = p + '.xml'
        FileCache.add_file(filename,xml, :force => true)
        list[p] = Article.new(xml)
      }

      return list

    else
      filename = pmid.to_s + '.xml'

      if File.exists? FileCache.path(filename)
        return Article.new(Open.read(FileCache.path(filename)))
      else
        xml = get_online(pmid)
        FileCache.add_file(filename,xml)

        return Article.new(xml)
      end
    end
  end

  # Performs the specified query and returns an array with the PubMed
  # Ids returned. +retmax+ can be used to limit the number of ids
  # returned; if it is not specified, 30000 is used.
  def self.query(query, retmax=nil)
    retmax ||= 30000

    Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?retmax=#{retmax}&db=pubmed&term=#{query}",:quiet => true, :nocache => true).scan(/<Id>(\d+)<\/Id>/).flatten
  end
end
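Example (not part of the package diff): a sketch of how PubMed.query and PubMed.get_article would be used together. It assumes network access to the NCBI eutils service and a writable Rbbt cache directory; the query string is arbitrary.

require 'rbbt/sources/pubmed'

# Look up ids for a query; the second argument caps the number of ids returned
pmids = PubMed.query('chemostat transcriptome yeast', 10)

# Fetch the articles; results are cached as <pmid>.xml in the Rbbt file cache
articles = PubMed.get_article(pmids)

articles.each do |pmid, article|
  puts "#{pmid}: #{article.journal}"
  puts article.text   # title and abstract joined with a newline
end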
data/lib/rbbt/util/arrayHash.rb
@@ -0,0 +1,255 @@

class ArrayHash

  # Take two strings of elements separated by the character sep_char and join them
  # into one, removing repetitions.
  def self.merge_values_string(list1, list2, sep_char ='|')
    elem1 = list1.to_s.split(sep_char)
    elem2 = list2.to_s.split(sep_char)
    (elem1 + elem2).select{|e| e.to_s != ""}.uniq.join(sep_char)
  end

  # Merge two lists of elements. Elements can be strings of elements
  # separated by the character sep_char, or arrays of such strings.
  def self.merge_values(list1, list2, sep_char = "|")
    if String === list1 || String === list2
      return merge_values_string(list1, list2)
    end

    if list1.nil?
      list1 = [''] * list2.length
    end

    if list2.nil?
      list2 = [''] * list1.length
    end

    new = []
    list1.each_with_index{|elem, i|
      new << merge_values_string(elem, list2[i], sep_char)
    }
    new
  end


  # Take a hash of arrays and a position, and use the value at that position
  # of the arrays to build a new hash with that value as key and the original
  # key prepended to the arrays. The options hash accepts the following keys:
  # :case_insensitive, which defaults to true, and :index, which indicates that
  # the original key should be the value of the hash entry, instead of the
  # complete array of values.
  def self.pullout(hash, pos, options = {})
    index = options[:index]; index = false if index.nil?
    case_insensitive = options[:case_insensitive]; case_insensitive = true if case_insensitive.nil?

    new = {}
    hash.each{|key,values|
      code = values[pos].to_s
      next if code == ""

      if index
        list = key
      else
        list = [key] + values
        list.delete_at(pos + 1)
      end

      code.split("|").each{|c|
        c = c.downcase if case_insensitive
        new[c] = merge_values(new[c], list)
      }
    }

    if case_insensitive
      class << new; self; end.instance_eval{
        alias_method :old_get, :[]
        define_method(:[], proc{|key| old_get(key.to_s.downcase)})
      }
    end

    new
  end

  # Merge two hashes of arrays. Each hash contains a number of fields for each
  # entry. The pos1 and pos2 arguments indicate which fields should be used to
  # match entries; the values for pos1 and pos2 can be an integer indicating the
  # position in the array or the symbol :main to refer to the key of the hash.
  # The options hash accepts the key :case_insensitive, which defaults to true.
  def self.merge(hash1, hash2, pos1 = :main, pos2 = :main, options = {})

    case_insensitive = options[:case_insensitive]; case_insensitive = true if case_insensitive.nil?
    if pos1.to_s.downcase != 'main'
      index1 = pullout(hash1, pos1, options.merge(:index => true))
    elsif options[:case_insensitive]
      new = {}
      hash1.each{|k,v|
        new[k.to_s.downcase] = v
      }
      class << new; self; end.instance_eval{
        alias_method :old_get, :[]
        define_method(:[], proc{|key| old_get(key.to_s.downcase)})
      }
      hash1 = new
    end

    length1 = hash1.values.first.length
    length2 = hash2.values.first.length

    new = {}
    hash2.each{|key, values|
      case
      when pos2.to_s.downcase == 'main'
        k = key
        v = values
      when Fixnum === pos2
        k = values[pos2]
        v = values
        v.delete_at(pos2)
        v.unshift(key)
      else
        raise "Format of second index not understood"
      end

      code = (index1.nil? ? k : index1[k])
      if code
        code.split('|').each{|c|
          c = c.to_s.downcase if options[:case_insensitive]
          new[c] = hash1[c] || [''] * length1
          new[c] += v
        }
      end
    }

    hash1.each{|key, values|
      new[key] ||= values + [''] * length2
    }

    new
  end

  # For a given hash of arrays, filter the position pos of each array with the
  # block of code.
  def self.process(hash, pos, &block)
    new = {}
    hash.each{|key, values|
      v = values
      v[pos] = v[pos].to_s.split("|").collect{|n| block.call(n)}.join("|")
      new[key] = v
    }
    new
  end

  # Clean the structure of repeated values. If the same value appears twice,
  # keep the occurrence that appears earlier in the values list (columns of the
  # ArrayHash are assumed to be sorted by importance); if both appear at the
  # same position, keep the one whose key converts to the smaller integer.
  def self.clean(hash, options = {})
    case_sensitive = options[:case_sensitive]

    found = {}

    hash.each{|k, list|
      list.each_with_index{|values,i|
        (String === values ? values.split("|") : values).each{|v|
          v = v.downcase if case_sensitive
          if found[v].nil?
            found[v] = [k,i]
          else
            last_k, last_i = found[v].values_at(0,1)
            if last_i > i || (last_i == i && last_k.to_i > k.to_i)
              found[v] = [k,i]
            end
          end
        }
      }
    }

    new_hash = {}
    hash.each{|k,list|
      new_list = []
      list.each_with_index{|values,i|
        new_values = []
        (String === values ? values.split("|") : values).each{|v|
          found_k, found_i = found[(case_sensitive ? v.downcase : v )].values_at(0,1)
          if found_i == i && found_k == k
            new_values << v
          end
        }
        new_list << (String === values ? new_values.join("|") : values)
      }
      new_hash[k] = new_list
    }
    new_hash
  end

  attr_reader :main, :fields, :data
  def initialize(hash, main, fields = nil)
    @data = hash
    @main = main.to_s

    if fields.nil?
      l = hash.values.first.length
      fields = []
      l.times{|i| fields << "F#{i}"}
    end

    @fields = fields.collect{|f| f.to_s}
  end

  # Wrapper
  def process(field, &block)
    pos = self.field_pos(field)
    @data = ArrayHash.process(self.data, pos, &block)
    self
  end

  # Returns the position of a given field in the value arrays
  def field_pos(field)
    return :main if field == :main
    if field.downcase == self.main.downcase
      return :main
    else
      @fields.collect{|f| f.downcase}.index(field.to_s.downcase)
    end
  end


  # Merge two ArrayHashes using the specified field
  def merge(other, field = :main, options = {} )
    field = self.main if field == :main

    pos1 = self.field_pos(field)
    pos2 = other.field_pos(field)

    new = ArrayHash.merge(self.data, other.data, pos1, pos2, options)
    @data = new
    if pos2 == :main
      new_fields = other.fields
    else
      new_fields = other.fields
      new_fields.delete_at(pos2)
      new_fields.unshift(other.main)
    end
    @fields += new_fields
    self
  end

  # Remove a field from the ArrayHash
  def remove(field)
    pos = self.field_pos(field)
    return if pos.nil?
    @data = self.data.each{|key,values| values.delete_at(pos)}
    @fields.delete_at(pos)
    self
  end

  def clean
    @data = ArrayHash.clean(@data)
    self
  end
end
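Example (not part of the package diff): a sketch of how ArrayHash#merge lines up two tables on a shared field. The identifiers, field names and values are hypothetical; the point is that the second table's rows are appended to the first table's rows wherever the 'Entrez' column matches.

require 'rbbt/util/arrayHash'

genes = {
  'TP53'  => ['7157', 'P04637'],               # fields: Entrez, UniProt
  'BRCA1' => ['672',  'P38398'],
}
descriptions = {
  '7157' => ['tumor protein p53'],             # keyed by Entrez; field: Description
  '672'  => ['BRCA1 DNA repair associated'],
}

a = ArrayHash.new(genes, 'Gene', %w(Entrez UniProt))
b = ArrayHash.new(descriptions, 'Entrez', %w(Description))

# Merge on the shared Entrez field; b's rows are appended to a's
a.merge(b, 'Entrez')

p a.fields          # => ["Entrez", "UniProt", "Description"]
p a.data['TP53']    # => ["7157", "P04637", "tumor protein p53"]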
data/lib/rbbt/util/filecache.rb
@@ -0,0 +1,72 @@
require 'fileutils'
require 'rbbt'

# Provides caching functionality for files downloaded from the internet
module FileCache

  class BadPathError < StandardError; end
  class FileExistsError < StandardError; end

  private

  # Remove slash characters from filename.
  def self.clean_path(filename)
    filename.gsub(/\//,'_SLASH_')
  end

  # Check that the file name is safe and is in the correct format
  def self.sanity_check(filename)
    if filename =~ /\//
      raise FileCache::BadPathError, "Character / not allowed in name: #{ filename }"
    end
    if filename !~ /.+\..+/
      raise FileCache::BadPathError, "Filename must have name and extension: name.ext"
    end
  end

  public

  # Find the path that a particular file would have in the cache
  def self.path(filename)
    sanity_check(filename)

    name, extension = filename.match(/(.+)\.(.+)/).values_at(1,2)
    dirs = name.scan(/./).reverse.values_at(0,1,2,3,4).reverse.compact.join('/')

    return File.join(File.join(Rbbt.cachedir,dirs),filename)
  end

  # Add a file to the cache. Raises an exception if it exists, unless :force
  # is used.
  def self.add_file(filename, content, options = {})
    sanity_check(filename)

    path = path(filename)
    FileUtils.makedirs(File.dirname(path), :mode => 0777)

    if File.exist?(path) and ! (options[:force] || options['force'])
      raise FileCache::FileExistsError, "File #{filename} already in cache"
    end

    File.open(path,'w'){|f|
      f.write(content)
    }
    FileUtils.chmod 0666, path

    nil
  end

  # Removes the file from the cache
  def self.del_file(filename)
    sanity_check(filename)

    path = path(filename)

    if File.exist? path
      FileUtils.rm path
    end

    nil
  end

end
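Example (not part of the package diff): a sketch of the cache layout FileCache.path produces. The filename and content are arbitrary; Rbbt.cachedir is whatever the local rbbt configuration points at.

require 'rbbt/util/filecache'

filename = '17644558.xml'

# The last five characters of the basename become nested directories, so
# cached files spread over the tree: <Rbbt.cachedir>/4/4/5/5/8/17644558.xml
puts FileCache.path(filename)

# Store some content (overwriting any previous copy), read it back, clean up
FileCache.add_file(filename, "<PubmedArticle>...</PubmedArticle>", :force => true)
puts File.read(FileCache.path(filename))
FileCache.del_file(filename)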
data/lib/rbbt/util/index.rb
@@ -0,0 +1,69 @@
require 'rbbt/util/open'
require 'rbbt/util/arrayHash'

module Index

  # Creates an inverse index. Takes a file with rows of elements
  # separated by a given pattern (specified by +sep+) and returns a hash
  # where each element points to the first element in the row. +lexicon+
  # is the file containing the data.
  def self.index(lexicon, options = {})
    options = {:sep => "\t|\\|", :case_sensitive => true}.merge(options)

    data = Open.to_hash(lexicon, options)
    if options[:clean]
      data = ArrayHash.clean(data)
    end

    index = {}

    data.each{|code, id_lists|
      next if code.nil? || code == ""
      id_lists.flatten.compact.uniq.each{|id|
        id = id.downcase unless options[:case_sensitive]
        index[id] = code
      }
    }
    data.each{|code, id_lists|
      next if code.nil? || code == ""
      id = code
      id = id.downcase unless options[:case_sensitive]
      index[id] = code
    }

    if !options[:case_sensitive]
      class << index; self; end.instance_eval{
        alias_method :old_get, :[]
        define_method(:[], proc{|key| old_get(key.to_s.downcase)})
      }
    end

    index
  end
end

if __FILE__ == $0

  require 'benchmark'

  normal = nil
  puts "Normal " + Benchmark.measure{
    normal = Index.index('/home/miki/rbbt/data/organisms/human/identifiers',:trie => false, :case_sensitive => false)
  }.to_s

  ids = Open.read('/home/miki/git/MARQ/test/GDS1375_malignant_vs_normal_up.genes').collect{|l| l.chomp.strip.upcase}

  new = nil

  puts ids.inspect
  puts "normal " + Benchmark.measure{
    100.times{
      new = ids.collect{|id| normal[id]}
    }
  }.to_s

  puts new.inspect

end
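Example (not part of the package diff): a sketch of Index.index on a hypothetical lexicon file. It assumes Open.to_hash splits each row on the default :sep pattern (tab or '|'), with the first column acting as the code and the remaining columns as synonyms.

require 'rbbt/util/index'

# Suppose /tmp/lexicon contains rows such as (tab-separated columns, synonyms
# joined by '|'):
#   YPD1    ypd1|YDL235C
#   SLN1    sln1|YIL147C
index = Index.index('/tmp/lexicon', :case_sensitive => false)

index['ydl235c']   # => "YPD1"  (any synonym maps back to the first column)
index['SLN1']      # => "SLN1"  (codes also map to themselves)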
data/lib/rbbt/util/misc.rb
@@ -0,0 +1,101 @@
require 'rbbt'
require 'rbbt/util/open'

$consonants = Open.read(File.join(Rbbt.datadir, 'wordlists/consonants')).collect{|l| l.chomp}.uniq
class String
  # Uses heuristics to check whether a string looks like a special word, such as a gene name.
  def is_special?
    # Only consonants
    return true if self =~ /^[bcdfghjklmnpqrstvwxz]+$/i

    # Not a word
    return false if self =~ /[^\s]\s[^\s]/;
    return false if self.length < 3;
    # Alphanumeric
    return true if self =~ /[0-9]/ && self =~ /[a-z]/i
    # All Caps
    return true if self =~ /[A-Z]{2,}/;
    # Caps Mix
    return true if self =~ /[a-z][A-Z]/;
    # All consonants
    return true if self =~ /^[a-z]$/i && self !~ /[aeiou]/i
    # Dashed word
    return true if self =~ /(^\w-|-\w$)/
    # Too many consonants (very heuristic)
    if self =~ /([^aeiouy]{3,})/i && !$consonants.include?($1.downcase)
      return true
    end

    return false
  end

  # Turns the first letter to lowercase
  def downcase_first
    return "" if self == ""
    letters = self.scan(/./)
    letters[0].downcase!
    letters.join("")
  end

  # Turns a roman numeral into arabic form if possible. Just simple
  # romans only...
  def arabic
    return 1 if self =~ /^I$/;
    return 2 if self =~ /^II$/;
    return 3 if self =~ /^III$/;
    return 4 if self =~ /^IV$/;
    return 5 if self =~ /^V$/;
    return 10 if self =~ /^X$/;

    return nil
  end
end


$greek = {
  "alpha" => "a",
  "beta" => "b",
  "gamma" => "g",
  "delta" => "d",
  "epsilon" => "e",
  "zeta" => "z",
  "eta" => "e",
  "theta" => "th",
  "iota" => "i",
  "kappa" => "k",
  "lambda" => "l",
  "mu" => "m",
  "nu" => "n",
  "xi" => "x",
  "omicron" => "o",
  "pi" => "p",
  "rho" => "r",
  "sigma" => "s",
  "tau" => "t",
  "upsilon" => "u",
  "phi" => "ph",
  "chi" => "ch",
  "psi" => "ps",
  "omega" => "o"
}

$inverse_greek = Hash.new
$greek.each{|l,s| $inverse_greek[s] = l }
$stopwords = Open.read(File.join(Rbbt.datadir, 'wordlists/stopwords')).scan(/\w+/)

class Array

  # Divides the array into +num+ chunks of the same size by placing one
  # element in each chunk iteratively.
  def chunk(num)
    chunks = []
    each_with_index{|e, i|
      c = i % num
      chunks[c] ||=[]
      chunks[c] << e
    }
    chunks
  end
end
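Example (not part of the package diff): a few illustrative calls to the String and Array helpers above. The expected results follow from the heuristics in the code; is_special? additionally depends on the installed consonants wordlist, so treat these as a sketch.

require 'rbbt/util/misc'

"p53".is_special?        # => true  (alphanumeric mix)
"BRCA1".is_special?      # => true  (caps and digits)
"protein".is_special?    # => false (ordinary word)

"Kinase".downcase_first  # => "kinase"
"IV".arabic              # => 4
"XII".arabic             # => nil (only the simple numerals are handled)

$greek["beta"]           # => "b"

# Array#chunk here is the rbbt round-robin split, not Enumerable#chunk
(1..7).to_a.chunk(3)     # => [[1, 4, 7], [2, 5], [3, 6]]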