RubyGems - rbbt - Versions diffs - 1.0.3 → 1.1.0 - Mend

rbbt 1.0.3 → 1.1.0

Files changed (8) hide show

data/install_scripts/organisms/worm.Rakefile +1 -1
data/lib/rbbt/ner/dictionaryNER.rb +98 -0
data/lib/rbbt/ner/regexpNER.rb +17 -11
data/lib/rbbt/sources/organism.rb +84 -52
data/lib/rbbt/sources/polysearch.rb +34 -15
data/lib/rbbt/util/filecache.rb +1 -1
metadata +3 -3
data/install_scripts/stopwords +0 -1

data/install_scripts/organisms/worm.Rakefile CHANGED Viewed

@@ -96,7 +96,7 @@ module Open
                   else
                     [id, name, extra].join("\t") + "\n"
                   end
-                }
+                }.join("\n")
       elsif url =~ /gene_ids/
         return content.gsub(/,/,"\t")
       else

data/lib/rbbt/ner/dictionaryNER.rb ADDED Viewed

@@ -0,0 +1,98 @@
+# This class loads a dictionary of codes with associated names, it then can
+# find those names in a string of text. It works word-wise.
+class DictionaryNER
+  A_INT   = "a"[0]
+  DOWNCASE_OFFSET = "A"[0] - "a"[0]
+  require 'rbbt/bow/bow'
+  # Divides a string of text into words. A hyphen separates words only if the
+  # second one begins with a letter.
+  def self.chunk(text)
+    text.split(/(?:[\s.,]|-(?=[a-zA-Z]))+/)
+  end
+  # Simplify the text to widen the matches. Currently only downcases a leading capital.
+  def self.simplify(text)
+    if text.length > 2 && text[0] < A_INT && text[1] > A_INT
+      text = (text[0] - DOWNCASE_OFFSET).chr + text[1..-1]
+    else
+      return text
+    end
+  end
+  # Given a dictionary structure, find the matches in the text.
+  def self.match(dict, text) #:nodoc:
+    if Array === text
+      words = text
+    else
+      words = chunk(text)
+    end
+    result = {}
+    words.each_with_index{|word, pos|
+      key = simplify(word)
+      next if dict[key].nil?
+      dict[key].each{|entrie|
+        case
+        when String === entrie
+          result[word] ||= []
+          result[word] << entrie unless result[word].include? entrie
+        when Hash === entrie
+          rec_words  = words[(pos + 1)..-1]
+          rec_result = match(entrie, rec_words)
+          rec_result.each{|rec_key, rec_list|
+            composite_key = word + ' ' + rec_key
+            result[composite_key] ||= []
+            result[composite_key] += rec_list
+            result[composite_key].uniq!
+          }
+        end
+      }
+    }
+    result
+  end
+  # Add a name to a structure
+  def self.add_name(dict, name, code)
+    if Array === name
+      words = name
+    else
+      words = chunk(name)
+    end
+    key = simplify(words.shift)
+    if words.empty?
+      dict[key] ||= []
+      dict[key] << code unless dict[key].include? code
+    else
+      rec_dict = {}
+      add_name(rec_dict, words , code)
+      dict[key] ||= []
+      dict[key] << rec_dict
+    end
+  end
+  def self.load(dictionary)
+    dict = {}
+    dictionary = File.open(dictionary).read if File.exists? dictionary
+    dictionary.each_line{|l|
+      names = l.chomp.split(/\t/)
+      code  = names.shift
+      names.each{|name| add_name(dict, name, code) }
+    }
+    dict
+  end
+  def initialize(dictionary)
+    @dict = DictionaryNER.load(dictionary)
+  end
+  def match(text)
+    DictionaryNER.match(@dict, text)
+  end
+end

data/lib/rbbt/ner/regexpNER.rb CHANGED Viewed

@@ -7,27 +7,33 @@ class RegExpNER
     res = [res] unless Array === res
     res.collect{|re|
-      if text.match(re)
-        $1
-      else
-        nil
-      end
-    }.compact
+      text.scan(re)
+    }.flatten
   end
-  def self.build_re(names, ignorecase=true)
+  def self.build_re_old(names, ignorecase=true)
     names.compact.select{|n| n != ""}.
       sort{|a,b| b.length <=> a.length}.
       collect{|n|
         re = Regexp.quote(n).gsub(/\\?\s/,'\s+')
-        /(?:^|[^\w])(#{ re })(?:$|[^\w])/i
       }
   end
+  def self.build_re(names, ignorecase=true)
+    res = names.compact.select{|n| n != ""}.
+      sort{|a,b| b.length <=> a.length}.
+      collect{|n|
+        Regexp.quote(n)
+      }
+    /\b(#{ res.join("|").gsub(/\\?\s/,'\s+') })\b/
+  end
   def initialize(lexicon, options = {})
-    options[:flatten] = true
+    options[:flatten]    = true
     options[:ignorecase] = true if options[:ignorecase].nil?
-    options[:stopwords] = true if options[:stopwords].nil?
+    options[:stopwords]  = true if options[:stopwords].nil?
     data = Open.to_hash(lexicon, options)
@@ -55,7 +61,7 @@ class RegExpNER
   end
   def match(text)
-    match_hash(text).values.flatten
+    match_hash(text)
   end
 end

data/lib/rbbt/sources/organism.rb CHANGED Viewed

@@ -2,13 +2,18 @@ require 'rbbt'
 require 'rbbt/util/open'
 require 'rbbt/util/index'
+# This module contains some Organism centric functionalities. Each organism is
+# identified by a keyword.
 module Organism
+  # Raised when trying to access information for an organism that has not been
+  # prepared already.
   class OrganismNotProcessedError < StandardError; end
-  def self.all(installed = true)
-    if installed
+  # Return the list of all supported organisms. The prepared flag is used to
+  # show only those that have been prepared.
+  def self.all(prepared = true)
+    if prepared
       Dir.glob(File.join(Rbbt.datadir,'/organisms/') + '/*/identifiers').collect{|f| File.basename(File.dirname(f))}
     else
       Dir.glob(File.join(Rbbt.datadir,'/organisms/') + '/*').select{|f| File.directory? f}.collect{|f| File.basename(f)}
@@ -16,72 +21,32 @@ module Organism
   end
+  # Return the complete name of an organism. The org parameter is the organism
+  # keyword
   def self.name(org)
     raise OrganismNotProcessedError, "Missing 'name' file" if ! File.exists? File.join(Rbbt.datadir,"organisms/#{ org }/name")
     Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/name"))
   end
+  # Hash linking all the organism long names with their keywords in Rbbt. It's
+  # the inverse of the name method.
   NAME2ORG = {}
   Organism::all.each{|org|
     name = Organism.name(org).strip.downcase
     NAME2ORG[name] = org
   }
+  # Return the keyword associated with an organism name.
   def self.name2org(name)
     NAME2ORG[name.strip.downcase]
   end
-  def self.id_formats(org)
-    id_types = {}
-    formats = supported_ids(org)
-    text = Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers"))
-    if text.respond_to? :collect
-      lines = text.collect
-    else
-      lines = text.lines
-    end
-    lines.each{|l|
-      ids_per_type = l.split(/\t/)
-      formats.zip(ids_per_type).each{|p|
-        format = p[0]
-        ids = p[1].split(/\|/)
-        ids.each{|id|
-          next if id.nil? || id == ""
-          id_types[id.downcase] ||= []
-          id_types[id.downcase] << format unless id_types[id.downcase].include? format
-        }
-      }
-    }
-    return id_types
-  end
-  def self.guessIdFormat(formats, query)
-    query = query.compact.collect{|gene| gene.downcase}.uniq
-    if String === formats
-      formats = id_formats(formats)
-    end
-    return nil if formats.values.empty?
-    values = formats.values_at(*query)
-    return nil if values.empty?
-    format_count = {}
-    values.compact.collect{|types| types.uniq}.flatten.each{|f|
-      format_count[f] ||= 0
-      format_count[f] += 1
-    }
-    return nil if format_count.values.empty?
-    format_count.select{|k,v| v > (query.length / 10)}.sort{|a,b| b[1] <=> a[1]}.first
-  end
   # FIXME: The NER related stuff is harder to install, that's why we hide the
   # requires next to where they are needed, next to options
+  # Return a NER object which could be of RNER, Abner or Banner class, this is
+  # selected using the type parameter.
   def self.ner(org, type=:rner, options = {})
     case type.to_sym
@@ -103,6 +68,7 @@ module Organism
   end
+  # Return a normalization object.
   def self.norm(org, to_entrez = nil)
     require 'rbbt/ner/rnorm'
     if to_entrez.nil?
@@ -117,11 +83,15 @@ module Organism
     Normalizer.new(File.join(Rbbt.datadir,"organisms/#{ org }/lexicon"), :to_entrez => to_entrez, :file => token_file, :max_candidates => 20)
   end
+  # Returns a hash with the names associated with each gene id. The ids are
+  # in Rbbt native format for that organism.
   def self.lexicon(org, options = {})
-    options[:sep] = "\t|\\|" unless options[:sep]
+    options = {:sep => "\t|\\|", :flatten => true}.merge(options)
     Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/lexicon"),options)
   end
+  # Returns a hash with the list of go terms for each gene id. Gene ids are in
+  # Rbbt native format for that organism.
   def self.goterms(org)
     goterms = {}
     Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/gene.go")).each_line{|l|
@@ -132,18 +102,29 @@ module Organism
     goterms
   end
+  # Return list of PubMed ids associated to the organism. Determined using a
+  # PubMed query with the name of the organism
   def self.literature(org)
     Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/all.pmid")).scan(/\d+/)
   end
+  # Return hash that associates genes to a list of PubMed ids.
   def self.gene_literature(org)
     Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/gene.pmid"), :flatten => true)
   end
+  # Return hash that associates genes to a list of PubMed ids. Includes only
+  # those found to support GO term associations.
   def self.gene_literature_go(org)
     Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/gene_go.pmid"), :flatten => true)
   end
+  # Returns a list with the names of the id formats supported for an organism.
+  # If examples are produced, the list is of [format, example] pairs.
+  #
+  # *Options:*
+  #
+  # *examples:* Include example ids for each format
   def self.supported_ids(org, options = {})
     formats  = []
     examples = [] if options[:examples]
@@ -166,6 +147,57 @@ module Organism
     formats.zip(examples)
   end
+  # Creates a hash where each possible id is associated with the names of the
+  # formats (it is possible for different formats to have the same
+  # id). This is used in the guessIdFormat method.
+  def self.id_formats(org)
+    id_types = {}
+    formats = supported_ids(org)
+    text = Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers"))
+    if text.respond_to? :collect
+      lines = text.collect
+    else
+      lines = text.lines
+    end
+    lines.each{|l|
+      ids_per_type = l.split(/\t/)
+      formats.zip(ids_per_type).each{|p|
+        format = p[0]
+        ids = p[1].split(/\|/)
+        ids.each{|id|
+          next if id.nil? || id == ""
+          id_types[id.downcase] ||= []
+          id_types[id.downcase] << format unless id_types[id.downcase].include? format
+        }
+      }
+    }
+    return id_types
+  end
+  def self.guessIdFormat(formats, query)
+    query = query.compact.collect{|gene| gene.downcase}.uniq
+    if String === formats
+      formats = id_formats(formats)
+    end
+    return nil if formats.values.empty?
+    values = formats.values_at(*query)
+    return nil if values.empty?
+    format_count = {}
+    values.compact.collect{|types| types.uniq}.flatten.each{|f|
+      format_count[f] ||= 0
+      format_count[f] += 1
+    }
+    return nil if format_count.values.empty?
+    format_count.select{|k,v| v > (query.length / 10)}.sort{|a,b| b[1] <=> a[1]}.first
+  end
   def self.id_position(supported_ids, id_name, options = {})
     pos = 0
     supported_ids.each_with_index{|id, i|

data/lib/rbbt/sources/polysearch.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 require 'rbbt'
 require 'rbbt/util/open'
 require 'rbbt/ner/regexpNER'
+require 'rbbt/ner/dictionaryNER'
 # Find terms in the Polysearch thesauri using simple regular expression
 # matching. Note that the first time the methods are used the correspondent
@@ -11,13 +12,14 @@ module Polysearch
   @@names = {}
   def self.type_names(type) #:nodoc:
-    @@names[type] ||= Open.to_hash(File.join(Rbbt.datadir,'dbs','polysearch',type + '.txt'), :single => true)
+    @@names[type] ||= Open.to_hash(File.join(Rbbt.datadir,'dbs','polysearch',type.to_s + '.txt'), :single => true)
   end
   @@indexes = {}
   def self.type_index(type) #:nodoc:
-    @@indexes[type] ||= RegExpNER.new(File.join(Rbbt.datadir,'dbs','polysearch',type + '.txt'))
+    @@indexes[type] ||= RegExpNER.new(File.join(Rbbt.datadir,'dbs','polysearch',type.to_s + '.txt'))
+    #@@indexes[type] ||= DictionaryNER.new(File.join(Rbbt.datadir,'dbs','polysearch',type + '.txt'))
   end
   # Find matches in a string of text, the types array specifies which thesauri
@@ -32,7 +34,7 @@ module Polysearch
     matches = {}
     types.collect{|type|
-      matches.merge!(type_index(type).match_hash(text))
+      matches.merge!(type_index(type).match(text))
     }
     matches
@@ -45,18 +47,17 @@ module Polysearch
 end
-if __FILE__ == $0
+if __FILE__ == $0
     text =<<-EOT
      Background  Microorganisms adapt their transcriptome by integrating
      multiple chemical and physical signals from their environment. Shake-flask
-    cultivation does not allow precise manipulation of individual culture
-    parameters and therefore precludes a quantitative analysis of the
-    (combinatorial) influence of these parameters on transcriptional
-    regulation. Steady-state chemostat cultures, which do enable accurate
-    control, measurement and manipulation of individual cultivation parameters
-    (e.g. specific growth rate, temperature, identity of the growth-limiting
+     cultivation does not allow precise manipulation of individual culture
+     parameters and therefore precludes a quantitative analysis of the
+     (combinatorial) influence of these parameters on transcriptional
+     regulation. Steady-state chemostat cultures, which do enable accurate
+     control, measurement and manipulation of individual cultivation parameters
+     (e.g. specific growth rate, temperature, identity of the growth-limiting
      nutrient) appear to provide a promising experimental platform for such a
      combinatorial analysis. Results  A microarray compendium of 170
      steady-state chemostat cultures of the yeast Saccharomyces cerevisiae is
@@ -76,13 +77,31 @@ if __FILE__ == $0
      combinatorial effects of environmental parameters on the transcriptome is
      crucial for understanding transcriptional regulation. Chemostat
      cultivation offers a powerful tool for such an approach. Keywords:
-       chemostat steady state samples
-    Cerebellar
-    stroke syndrome
+     chemostat steady state samples Cerebellar stroke syndrome
     EOT
-    p Polysearch.match(text,'disease').values.flatten
+    require 'benchmark'
+    require 'ruby-prof'
+    puts Benchmark.measure{
+      p Polysearch.match(text,'disease')
+    }
+    RubyProf.start
+    Polysearch.match(text,'disease')
+    result = RubyProf.stop
+    # Print a flat profile to text
+    printer = RubyProf::FlatPrinter.new(result)
+    printer.print(STDOUT, 0)
+    puts Benchmark.measure{
+      10.times{ p Polysearch.match(text,'disease') }
+    }
 end

data/lib/rbbt/util/filecache.rb CHANGED Viewed

@@ -20,7 +20,7 @@ module FileCache
       raise FileCache::BadPathError, "Character / not allowed in name: #{ filename }"
     end
     if filename !~ /.+\..+/
-      raise FileCache::BadPathError, "Filename must have name and extension: name.ext"
+      raise FileCache::BadPathError, "Filename '#{filename}' must have name and extension: name.ext"
     end
   end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: rbbt
 version: !ruby/object:Gem::Version
-  version: 1.0.3
+  version: 1.1.0
 platform: ruby
 authors:
 - Miguel Vazquez
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-11-10 00:00:00 +01:00
+date: 2009-12-02 00:00:00 +01:00
 default_executable: rbbt_config
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -99,7 +99,6 @@ files:
 - install_scripts/organisms/sgd.Rakefile
 - install_scripts/organisms/tair.Rakefile
 - install_scripts/organisms/worm.Rakefile
-- install_scripts/stopwords
 - install_scripts/wordlists/consonants
 - install_scripts/wordlists/stopwords
 - lib/rbbt.rb
@@ -108,6 +107,7 @@ files:
 - lib/rbbt/bow/dictionary.rb
 - lib/rbbt/ner/abner.rb
 - lib/rbbt/ner/banner.rb
+- lib/rbbt/ner/dictionaryNER.rb
 - lib/rbbt/ner/regexpNER.rb
 - lib/rbbt/ner/rner.rb
 - lib/rbbt/ner/rnorm.rb

data/install_scripts/stopwords DELETED Viewed

@@ -1 +0,0 @@

- a been get least our them whether about before getting left ourselves then which after being go less out there while again between goes let over these who ago but going like per they whoever all by gone make put this whom almost came got many putting those whose also can gotten may same through why always cannot had maybe saw till will am come has me see to with an could have mine seen too within and did having more shall two without another do he most she unless won't any does her much should until would anybody doing here my so up wouldn't anyhow done him myself some upon yet anyone down his never somebody us you anything each how no someone very your anyway else i none something was are even if not stand we as ever in now such went at every into of sure were away everyone is off take what back everything isn't on than whatever be for it one that what's became from just onto the when because front last or their where