rbbt 1.1.7 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +72 -136
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -246
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -145
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -79
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Rakefile +0 -43
  22. data/install_scripts/organisms/cgd.Rakefile +0 -84
  23. data/install_scripts/organisms/human.Rakefile +0 -145
  24. data/install_scripts/organisms/mgi.Rakefile +0 -77
  25. data/install_scripts/organisms/pombe.Rakefile +0 -40
  26. data/install_scripts/organisms/rake-include.rb +0 -258
  27. data/install_scripts/organisms/rgd.Rakefile +0 -88
  28. data/install_scripts/organisms/sgd.Rakefile +0 -66
  29. data/install_scripts/organisms/tair.Rakefile +0 -54
  30. data/install_scripts/organisms/worm.Rakefile +0 -109
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -86
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -213
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -40
  49. data/lib/rbbt/sources/organism.rb +0 -245
  50. data/lib/rbbt/sources/polysearch.rb +0 -117
  51. data/lib/rbbt/sources/pubmed.rb +0 -111
  52. data/lib/rbbt/util/arrayHash.rb +0 -255
  53. data/lib/rbbt/util/filecache.rb +0 -72
  54. data/lib/rbbt/util/index.rb +0 -47
  55. data/lib/rbbt/util/misc.rb +0 -106
  56. data/lib/rbbt/util/open.rb +0 -235
  57. data/lib/rbbt/util/rake.rb +0 -183
  58. data/lib/rbbt/util/simpleDSL.rb +0 -87
  59. data/lib/rbbt/util/tmpfile.rb +0 -19
  60. data/tasks/install.rake +0 -124
data/lib/rbbt/sources/polysearch.rb
@@ -1,117 +0,0 @@
- require 'rbbt'
- require 'rbbt/util/open'
- require 'rbbt/ner/regexpNER'
- require 'rbbt/ner/dictionaryNER'
-
- # Find terms in the Polysearch thesauri using simple regular expression
- # matching. Note that the first time the methods are used the correspondent
- # thesaurus are loaded into memory. The available thesauri are: disease, drug,
- # metabolite, organ, subcellular (subcellular localization) and tissue.
- module Polysearch
-
-
-   @@names = {}
-   def self.type_names(type) #:nodoc:
-     @@names[type] ||= Open.to_hash(File.join(Rbbt.datadir,'dbs','polysearch',type.to_s + '.txt'), :single => true)
-   end
-
-
-   @@indexes = {}
-   def self.type_index(type) #:nodoc:
-     if $stopwords
-       stopwords = $stopwords
-     else
-       stopwords = []
-     end
-
-     case type.to_sym
-     when :disease
-       stopwords << 'use'
-     end
-
-     @@indexes[type] ||= RegExpNER.new(File.join(Rbbt.datadir,'dbs','polysearch',type.to_s + '.txt'), :stopwords => stopwords)
-   end
-
-   # Find matches in a string of text, the types array specifies which thesauri
-   # to use, if if nil it will use all.
-   def self.match(text, types = nil)
-     if types.nil?
-       types = %w(disease drug metabolite organ subcellular tissue)
-     end
-
-     types = [types] unless Array === types
-     types = types.sort
-
-     matches = {}
-     types.collect{|type|
-       matches.merge!(type_index(type).match(text))
-     }
-
-     matches
-   end
-
-   # Transform the code into a name, type is the thesaurus to use
-   def self.name(type, code)
-     type_names(type)[code]
-   end
-
- end
-
- if __FILE__ == $0
-   text =<<-EOT
-
-   Background Microorganisms adapt their transcriptome by integrating
-   multiple chemical and physical signals from their environment. Shake-flask
-   cultivation does not allow precise manipulation of individual culture
-   parameters and therefore precludes a quantitative analysis of the
-   (combinatorial) influence of these parameters on transcriptional
-   regulation. Steady-state chemostat cultures, which do enable accurate
-   control, measurement and manipulation of individual cultivation parameters
-   (e.g. specific growth rate, temperature, identity of the growth-limiting
-   nutrient) appear to provide a promising experimental platform for such a
-   combinatorial analysis. Results A microarray compendium of 170
-   steady-state chemostat cultures of the yeast Saccharomyces cerevisiae is
-   presented and analyzed. The 170 microarrays encompass 55 unique
-   conditions, which can be characterized by the combined settings of 10
-   different cultivation parameters. By applying a regression model to assess
-   the impact of (combinations of) cultivation parameters on the
-   transcriptome, most S. cerevisiae genes were shown to be influenced by
-   multiple cultivation parameters, and in many cases by combinatorial
-   effects of cultivation parameters. The inclusion of these combinatorial
-   effects in the regression model led to higher explained variance of the
-   gene expression patterns and resulted in higher function enrichment in
-   subsequent analysis. We further demonstrate the usefulness of the
-   compendium and regression analysis for interpretation of shake-flask-based
-   transcriptome studies and for guiding functional analysis of
-   (uncharacterized) genes and pathways. Conclusions Modeling the
-   combinatorial effects of environmental parameters on the transcriptome is
-   crucial for understanding transcriptional regulation. Chemostat
-   cultivation offers a powerful tool for such an approach. Keywords:
-   chemostat steady state samples Cerebellar stroke syndrome
-
-
-   EOT
-
-   require 'benchmark'
-   require 'ruby-prof'
-
-   puts Benchmark.measure{
-     p Polysearch.match(text,'disease')
-   }
-
-
-   RubyProf.start
-
-   Polysearch.match(text,'disease')
-
-   result = RubyProf.stop
-
-   # Print a flat profile to text
-   printer = RubyProf::FlatPrinter.new(result)
-   printer.print(STDOUT, 0)
-
-   puts Benchmark.measure{
-     10.times{ p Polysearch.match(text,'disease') }
-   }
-
- end
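For reference, a minimal sketch of how the Polysearch interface removed above was typically driven under rbbt 1.1.x. The sample sentence is invented, the thesauri are assumed to be already installed under Rbbt.datadir/dbs/polysearch, and the exact shape of the hash returned through RegExpNER is an assumption:

    require 'rbbt/sources/polysearch'

    text = "Patients with type 2 diabetes were treated with metformin."

    # Restrict matching to one thesaurus; passing nil (the default) searches all six.
    matches = Polysearch.match(text, 'disease')
    p matches  # assumed to map thesaurus codes to the strings matched in the text

    # Translate a matched code back into its readable name.
    puts Polysearch.name(:disease, matches.keys.first) unless matches.empty?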
data/lib/rbbt/sources/pubmed.rb
@@ -1,111 +0,0 @@
- require 'rbbt/util/filecache'
- require 'rbbt/util/open'
- require 'rbbt'
-
- # This module offers an interface with PubMed, to perform queries, and
- # retrieve simple information from articles. It uses the caching
- # services of Rbbt.
- module PubMed
-
-   private
-   @@last = Time.now
-   @@pubmed_lag = 1
-   def self.get_online(pmids)
-
-     pmid_list = ( pmids.is_a?(Array) ? pmids.join(',') : pmids.to_s )
-     url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list}"
-
-     diff = Time.now - @@last
-     sleep @@pubmed_lag - diff unless diff > @@pubmed_lag
-
-     xml = Open.read(url, :quiet => true, :nocache => true)
-
-     @@last = Time.now
-
-     articles = xml.scan(/(<PubmedArticle>.*?<\/PubmedArticle>)/sm).flatten
-
-     if pmids.is_a? Array
-       list = {}
-       articles.each{|article|
-         pmid = article.scan(/<PMID>(.*?)<\/PMID>/).flatten.first
-         list[pmid] = article
-       }
-       return list
-     else
-       return articles.first
-     end
-
-   end
-
-   public
-
-   # Processes the xml with an articles as served by MedLine and extracts
-   # the abstract, title and journal information
-   class Article
-     attr_reader :title, :abstract, :journal
-     def initialize(xml)
-       xml ||= ""
-       @abstract = $1 if xml.match(/<AbstractText>(.*)<\/AbstractText>/sm)
-       @title = $1 if xml.match(/<ArticleTitle>(.*)<\/ArticleTitle>/sm)
-       @journal = $1 if xml.match(/<Title>(.*)<\/Title>/sm)
-     end
-
-     # Join the text from title and abstract
-     def text
-       [@title, @abstract].join("\n")
-     end
-   end
-
-   # Returns the Article object containing the information for the PubMed
-   # ID specified as an argument. If +pmid+ is an array instead of a single
-   # identifier it returns an hash with the Article object for each id.
-   # It uses the Rbbt cache to save the articles xml.
-   def self.get_article(pmid)
-
-     if pmid.is_a? Array
-       missing = []
-       list = {}
-
-       pmid.each{|p|
-         filename = p.to_s + '.xml'
-         if File.exists? FileCache.path(filename)
-           list[p] = Article.new(Open.read(FileCache.path(filename)))
-         else
-           missing << p
-         end
-       }
-
-       return list unless missing.any?
-       articles = get_online(missing)
-
-       articles.each{|p, xml|
-         filename = p + '.xml'
-         FileCache.add_file(filename,xml, :force => true)
-         list[p] = Article.new(xml)
-       }
-
-       return list
-
-     else
-       filename = pmid.to_s + '.xml'
-
-       if File.exists? FileCache.path(filename)
-         return Article.new(Open.read(FileCache.path(filename)))
-       else
-         xml = get_online(pmid)
-         FileCache.add_file(filename,xml)
-
-         return Article.new(xml)
-       end
-     end
-   end
-
-   # Performs the specified query and returns an array with the PubMed
-   # Ids returned. +retmax+ can be used to limit the number of ids
-   # returned, if is not specified 30000 is used.
-   def self.query(query, retmax=nil)
-     retmax ||= 30000
-
-     Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?retmax=#{retmax}&db=pubmed&term=#{query}",:quiet => true, :nocache => true).scan(/<Id>(\d+)<\/Id>/).flatten
-   end
- end
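Likewise, a small usage sketch for the PubMed module removed above (rbbt 1.1.x only). The query string is an arbitrary example, and live access to the NCBI eutils endpoints is assumed:

    require 'rbbt/sources/pubmed'

    # Fetch up to 10 PubMed ids matching the query (retmax defaults to 30000).
    pmids = PubMed.query('chemostat transcriptome Saccharomyces cerevisiae', 10)

    # With an array argument, get_article returns a hash of id => Article and
    # caches each article's XML through FileCache.
    articles = PubMed.get_article(pmids)
    articles.each do |pmid, article|
      puts "#{pmid}: #{article.title}"
      puts article.text   # title and abstract joined by a newline
    end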
data/lib/rbbt/util/arrayHash.rb
@@ -1,255 +0,0 @@
-
- class ArrayHash
-
-   # Take two strings of elements separated by the character sep_char and join them
-   # into one, removing repetitions.
-   def self.merge_values_string(list1, list2, sep_char ='|')
-     elem1 = list1.to_s.split(sep_char)
-     elem2 = list2.to_s.split(sep_char)
-     (elem1 + elem2).select{|e| e.to_s != ""}.uniq.join(sep_char)
-   end
-
-   # Merge two lists of elements. Elements could be strings of elements
-   # separated by the character sep_char, or arrays of lists of such strings.
-   def self.merge_values(list1, list2, sep_char = "|")
-     if String === list1 || String === list2
-       return merge_values_string(list1, list2)
-     end
-
-     if list1.nil?
-       list1 = [''] * list2.length
-     end
-
-     if list2.nil?
-       list2 = [''] * list1.length
-     end
-
-     new = []
-     list1.each_with_index{|elem, i|
-       new << merge_values_string(elem, list2[i], sep_char)
-     }
-     new
-   end
-
-
-   # Take an hash of arrays and a position and use the value at that position
-   # of the arrays and build a new hash with that value as key, and the original
-   # key prepended to the arrays. The options hash appcepts the following keys
-   # :case_insensitive, which defaults to true, and :index, which indicates that
-   # the original key should be the value of the hash entry, instead of the
-   # complete array of values.
-   def self.pullout(hash, pos, options = {})
-     index = options[:index]; index = false if index.nil?
-     case_insensitive = options[:case_insensitive]; case_insensitive = true if case_insensitive.nil?
-
-     new = {}
-     hash.each{|key,values|
-       code = values[pos].to_s
-       next if code == ""
-
-       if index
-         list = key
-       else
-         list = [key] + values
-         list.delete_at(pos + 1)
-       end
-
-       code.split("|").each{|c|
-         c = c.downcase if case_insensitive
-         new[c] = merge_values(new[c], list)
-       }
-     }
-
-     if case_insensitive
-       class << new; self; end.instance_eval{
-         alias_method :old_get, :[]
-         define_method(:[], proc{|key| old_get(key.to_s.downcase)})
-       }
-     end
-
-     new
-   end
-
-   # Merge to hashes of arrays. Each hash contains a number of fields for each
-   # entry. The pos1 and pos2 indicate what fields should be used to match
-   # entries, the values for pos1 and pos2 can be an integer indicating the
-   # position in the array or the symbol :main to refer to the key of the hash.
-   # The options hash accepts the key :case_insensitive, which defaults to true.
-   def self.merge(hash1, hash2, pos1 = :main, pos2 = :main, options = {})
-
-     case_insensitive = options[:case_insensitive]; case_insensitive = true if case_insensitive.nil?
-     if pos1.to_s.downcase != 'main'
-       index1 = pullout(hash1, pos1, options.merge(:index => true))
-     elsif options[:case_insensitive]
-       new = {}
-       hash1.each{|k,v|
-         new[k.to_s.downcase] = v
-       }
-       class << new; self; end.instance_eval{
-         alias_method :old_get, :[]
-         define_method(:[], proc{|key| old_get(key.to_s.downcase)})
-       }
-       hash1 = new
-     end
-
-     length1 = hash1.values.first.length
-     length2 = hash2.values.first.length
-
-     new = {}
-     hash2.each{|key, values|
-       case
-       when pos2.to_s.downcase == 'main'
-         k = key
-         v = values
-       when Fixnum === pos2
-         k = values[pos2]
-         v = values
-         v.delete_at(pos2)
-         v.unshift(key)
-       else
-         raise "Format of second index not understood"
-       end
-
-       code = (index1.nil? ? k : index1[k])
-       if code
-         code.split('|').each{|c|
-           c = c.to_s.downcase if options[:case_insensitive]
-           new[c] = hash1[c] || [''] * length1
-           new[c] += v
-         }
-       end
-     }
-
-     hash1.each{|key, values|
-       new[key] ||= values + [''] * length2
-     }
-
-     new
-   end
-
-   # For a given hash of arrays, filter the position pos of each array with the
-   # block of code.
-   def self.process(hash, pos, &block)
-     new = {}
-     hash.each{|key, values|
-       v = values
-       v[pos] = v[pos].to_s.split("|").collect{|n| block.call(n)}.join("|")
-       new[key] = v
-     }
-     new
-   end
-
-   # Clean structure for repeated values. If the same value apear two times use
-   # eliminate the one that appears latter on the values list (columns of the
-   # ArrayHash are assumed to be sorted for importance) if the appear on the
-   # same position, remove the one with the smaller vale of the code after
-   # turning it into integer.
-   def self.clean(hash, options = {})
-     case_sensitive = options[:case_sensitive]
-
-     found = {}
-
-     hash.each{|k, list|
-       list.each_with_index{|values,i|
-         (String === values ? values.split("|") : values).each{|v|
-           v = v.downcase if case_sensitive
-           if found[v].nil?
-             found[v] = [k,i]
-           else
-             last_k, last_i = found[v].values_at(0,1)
-             if last_i > i || (last_i == i && last_k.to_i > k.to_i)
-               found[v] = [k,i]
-             end
-           end
-         }
-       }
-     }
-
-     new_hash = {}
-     hash.each{|k,list|
-       new_list = []
-       list.each_with_index{|values,i|
-         new_values = []
-         (String === values ? values.split("|") : values).each{|v|
-           found_k, found_i = found[(case_sensitive ? v.downcase : v )].values_at(0,1)
-           if found_i == i && found_k == k
-             new_values << v
-           end
-         }
-         new_list << (String === values ? new_values.join("|") : values)
-       }
-       new_hash[k] = new_list
-     }
-     new_hash
-   end
-
-   attr_reader :main, :fields, :data
-   def initialize(hash, main, fields = nil)
-     @data = hash
-     @main = main.to_s
-
-     if fields.nil?
-       l = hash.values.first.length
-       fields = []
-       l.times{|i| fields << "F#{i}"}
-     end
-
-     @fields = fields.collect{|f| f.to_s}
-   end
-
-   # Wrapper
-   def process(field, &block)
-     pos = self.field_pos(field)
-     @data = ArrayHash.process(self.data, pos, &block)
-     self
-   end
-
-   # Returns the position of a given field in the value arrays
-   def field_pos(field)
-     return :main if field == :main
-     if field.downcase == self.main.downcase
-       return :main
-     else
-       @fields.collect{|f| f.downcase}.index(field.to_s.downcase)
-     end
-   end
-
-
-   # Merge two ArrayHashes using the specified field
-   def merge(other, field = :main, options = {} )
-     field = self.main if field == :main
-
-     pos1 = self.field_pos(field)
-     pos2 = other.field_pos(field)
-
-     new = ArrayHash.merge(self.data, other.data, pos1, pos2, options)
-     @data = new
-     if pos2 == :main
-       new_fields = other.fields
-     else
-       new_fields = other.fields
-       new_fields.delete_at(pos2)
-       new_fields.unshift(other.main)
-     end
-     @fields += new_fields
-     self
-   end
-
-   # Remove a field from the ArrayHash
-   def remove(field)
-     pos = self.field_pos(field)
-     return if pos.nil?
-     @data = self.data.each{|key,values| values.delete_at(pos)}
-     @fields.delete_at(pos)
-     self
-   end
-
-   def clean
-     @data = ArrayHash.clean(@data)
-     self
-   end
- end
-
-
-
-
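Finally, a sketch of the ArrayHash API removed above (rbbt 1.1.x only); the toy identifier table below is invented purely for illustration:

    require 'rbbt/util/arrayHash'

    # Each ArrayHash maps a main key to an array of fields; multiple values in a
    # single field are joined with '|'.
    names = ArrayHash.new({ 'YPL250C' => ['ICY2', 'hypothetical protein'] },
                          'ORF', %w(Name Description))
    ids   = ArrayHash.new({ 'YPL250C' => ['999999'] }, 'ORF', %w(Entrez))

    # Merge the second table into the first on the shared main key.
    names.merge(ids)

    puts names.fields.inspect          # => ["Name", "Description", "Entrez"]
    puts names.data['YPL250C'].inspect # => ["ICY2", "hypothetical protein", "999999"]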