RubyGems - rbbt - Versions diffs - 1.1.8 → 1.2.1 - Mend

rbbt 1.1.8 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

data/README.rdoc +12 -12
data/bin/rbbt_config +2 -3
data/install_scripts/norm/Rakefile +4 -4
data/install_scripts/organisms/{tair.Rakefile → Ath.Rakefile} +4 -3
data/install_scripts/organisms/{cgd.Rakefile → Cal.Rakefile} +0 -0
data/install_scripts/organisms/{worm.Rakefile → Cel.Rakefile} +0 -0
data/install_scripts/organisms/{human.Rakefile → Hsa.Rakefile} +4 -8
data/install_scripts/organisms/{mgi.Rakefile → Mmu.Rakefile} +0 -0
data/install_scripts/organisms/{rgd.Rakefile → Rno.Rakefile} +0 -0
data/install_scripts/organisms/{sgd.Rakefile → Sce.Rakefile} +0 -0
data/install_scripts/organisms/{pombe.Rakefile → Spo.Rakefile} +0 -0
data/install_scripts/organisms/rake-include.rb +15 -19
data/lib/rbbt.rb +0 -3
data/lib/rbbt/ner/rnorm.rb +2 -2
data/lib/rbbt/sources/go.rb +48 -3
data/lib/rbbt/sources/organism.rb +12 -17
data/lib/rbbt/util/open.rb +27 -27
data/lib/rbbt/util/tmpfile.rb +16 -0
data/tasks/install.rake +1 -1
data/test/rbbt/bow/test_bow.rb +33 -0
data/test/rbbt/bow/test_classifier.rb +72 -0
data/test/rbbt/bow/test_dictionary.rb +91 -0
data/test/rbbt/ner/rnorm/test_cue_index.rb +57 -0
data/test/rbbt/ner/rnorm/test_tokens.rb +70 -0
data/test/rbbt/ner/test_abner.rb +17 -0
data/test/rbbt/ner/test_banner.rb +17 -0
data/test/rbbt/ner/test_dictionaryNER.rb +122 -0
data/test/rbbt/ner/test_regexpNER.rb +33 -0
data/test/rbbt/ner/test_rner.rb +126 -0
data/test/rbbt/ner/test_rnorm.rb +47 -0
data/test/rbbt/sources/test_biocreative.rb +38 -0
data/test/rbbt/sources/test_biomart.rb +31 -0
data/test/rbbt/sources/test_entrez.rb +49 -0
data/test/rbbt/sources/test_go.rb +24 -0
data/test/rbbt/sources/test_organism.rb +59 -0
data/test/rbbt/sources/test_polysearch.rb +27 -0
data/test/rbbt/sources/test_pubmed.rb +29 -0
data/test/rbbt/util/test_arrayHash.rb +257 -0
data/test/rbbt/util/test_filecache.rb +37 -0
data/test/rbbt/util/test_index.rb +31 -0
data/test/rbbt/util/test_misc.rb +20 -0
data/test/rbbt/util/test_open.rb +97 -0
data/test/rbbt/util/test_simpleDSL.rb +57 -0
data/test/rbbt/util/test_tmpfile.rb +21 -0
data/test/test_helper.rb +4 -0
data/test/test_rbbt.rb +11 -0
metadata +39 -12

data/README.rdoc CHANGED Viewed

@@ -57,14 +57,14 @@ Identifiers translation:: Translates gene identifiers between formats.
 Organisms in rbbt are identified using a keyword. This is the list of organisms currently supported with their associated keywords:
-Candida albicans:: cgd
-Mus musculus:: mgi
-Rattus norvegicus:: rgd
-Saccharomyces cerevisiae:: sgd
-Arabidopsis thaliana:: tair
-Caenorhabditis elegans:: worm
-Homo sapiens:: human
-Schizosaccharomyces pombe:: pombe
+Candida albicans:: Cal
+Mus musculus:: Mmu
+Rattus norvegicus:: Rno
+Saccharomyces cerevisiae:: Sce
+Arabidopsis thaliana:: Ath
+Caenorhabditis elegans:: Cel
+Homo sapiens:: Hsa
+Schizosaccharomyces pombe:: Spo
 === Other
@@ -80,11 +80,11 @@ Install the gem normally <tt>gem install rbbt</tt>. The gem includes a configura
 === Using rbbt to translate identifiers
 1. Do <tt>rbbt_config prepare identifiers</tt> to deploy the configuration files and download Entrez data; this needs to be done just once.
-3. Now you may do <tt>rbbt_config install organisms</tt> to process all the organisms, or <tt>rbbt_config install organisms -o sgd</tt> to process only yeast (sgd).
+3. Now you may do <tt>rbbt_config install organisms</tt> to process all the organisms, or <tt>rbbt_config install organisms -o Sce</tt> to process only yeast (Sce).
 4. You may now use a script like this to translate gene identifiers from yeast fed from the standard input
   require 'rbbt/sources/organism'
-  index = Organism.id_index('sgd', :native => 'Entrez Gene Id')
+  index = Organism.id_index('Sce', :native => 'Entrez Gene Id')
   STDIN.each_line{|l| puts "#{l.chomp} => #{index[l.chomp]}"}
@@ -93,7 +93,7 @@ Install the gem normally <tt>gem install rbbt</tt>. The gem includes a configura
 First prepare the organisms as you did in the previous section. Next, if you want to use the default NER module:
 1. Install the Biocreative data used to train the model and compile the CRF++ plugin, <tt>rbbt_config prepare rner</tt>. You may need at this point to install ParseTree and ruby2ruby
-2. Build the module for a particular organism <tt>rbbt_config install ner -o sgd</tt>. You need to have the gems ParseTree and ruby2ruby for this to work. This process can take a long time.
+2. Build the module for a particular organism <tt>rbbt_config install ner -o Sce</tt>. You need to have the gems ParseTree and ruby2ruby for this to work. This process can take a long time.
 Or, if you want to use Abner or Banner:
@@ -108,7 +108,7 @@ You may now, for example, find mentions to genes in articles from a PubMed query
     # type = :banner
     type = :rner
-    ner = Organism.ner('sgd', type )
+    ner = Organism.ner('Sce', type )
     pmids = PubMed.query(ARGV[0], 500)
     PubMed.get_article(pmids).each{|pmid,article|

data/bin/rbbt_config CHANGED Viewed

@@ -1,5 +1,7 @@
 #!/usr/bin/ruby
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
 require 'rubygems'
 require 'rake'
@@ -67,9 +69,6 @@ $USAGE =<<EOT
         descriptions, is not cleaned, as these are not likely to change
     * organisms: Show a list of all organisms along with their identifier in the system
 EOT
 class Controller < SimpleConsole::Controller

data/install_scripts/norm/Rakefile CHANGED Viewed

@@ -14,10 +14,10 @@ $docs  = ENV['docs']
 $org2rbbt = {
-  'yeast' => 'sgd',
-  'mouse' => 'mgi',
-  'fly' => 'sgd',
-  'bc2gn' => 'human',
+  'yeast' => 'Sce',
+  'mouse' => 'Mmu',
+  'fly' => 'Sce',
+  'bc2gn' => 'Hsa',
 }
 def match(org, filedir, goldstandard,outfile)

data/install_scripts/organisms/{tair.Rakefile → Ath.Rakefile} RENAMED Viewed

@@ -21,9 +21,10 @@ $lexicon = {
 $identifiers = {
   :file => {
-    :url => "ftp://ftp.arabidopsis.org/home/tair/Genes/gene_aliases.20090313",
-    :native => 0,
-    :extra => [],
+    :url => "ftp://ftp.arabidopsis.org/home/tair/Microarrays/Affymetrix/affy_ATH1_array_elements-2009-7-29.txt",
+    :native => 4,
+    :extra => [0],
+    :fields => ["Affymetrix"],
   },
   :biomart => {
     :database => 'athaliana_eg_gene',

data/install_scripts/organisms/{cgd.Rakefile → Cal.Rakefile} RENAMED Viewed

File without changes

data/install_scripts/organisms/{worm.Rakefile → Cel.Rakefile} RENAMED Viewed

File without changes

data/install_scripts/organisms/{human.Rakefile → Hsa.Rakefile} RENAMED Viewed

@@ -86,7 +86,7 @@ Rake::Task['gene.go'].clear
 file 'gene.go' => ['identifiers'] do
   if File.exists? 'identifiers'
     require 'rbbt/sources/organism'
-    index = Organism.id_index('human', :other => ['Associated Gene Name'])
+    index = Organism.id_index('Hsa', :other => ['Associated Gene Name'])
     data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:go], :exclude => $go[:exclude])
     data = data.collect{|code, value_lists|
@@ -96,9 +96,7 @@ file 'gene.go' => ['identifiers'] do
     Open.write('gene.go',
                data.collect{|p|
-                 p[1].uniq.collect{|go|
-                   "#{p[0]}\t#{go}"
-                 }.join("\n")
+                 "#{p[0]}\t#{p[1].uniq.join("|")}"
                }.join("\n")
               )
   end
@@ -117,9 +115,7 @@ file 'gene_go.pmid' => ['identifiers'] do
     Open.write('gene_go.pmid',
                data.collect{|p|
-                 p[1].uniq.collect{|pmid|
-                   "#{p[0]}\t#{pmid}"
-                 }.join("\n")
+                 "#{p[0]}\t#{p[1].uniq.join("|")}"
                }.join("\n")
               )
   end
@@ -132,7 +128,7 @@ file 'lexicon' => ['identifiers'] do
     require 'rbbt/sources/organism'
     HGNC_URL = 'http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=HGNC+output+data&hgnc_dbtag=on&col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_acc_ids&status=Approved&status_opt=2&level=pri&=on&where=&order_by=gd_app_sym_sort&limit=&format=text&submit=submit&.cgifields=&.cgifields=level&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag'
     names = Open.to_hash(HGNC_URL, :exclude => proc{|l| l.match(/^HGNC ID/)}, :flatten => true)
-    translations = Organism.id_index('human', :native => 'Entrez Gene ID', :other => ['HGNC ID'])
+    translations = Organism.id_index('Hsa', :native => 'Entrez Gene ID', :other => ['HGNC ID'])
     Open.write('lexicon',
                names.collect{|code, names|

data/install_scripts/organisms/{mgi.Rakefile → Mmu.Rakefile} RENAMED Viewed

File without changes

data/install_scripts/organisms/{rgd.Rakefile → Rno.Rakefile} RENAMED Viewed

File without changes

data/install_scripts/organisms/{sgd.Rakefile → Sce.Rakefile} RENAMED Viewed

File without changes

data/install_scripts/organisms/{pombe.Rakefile → Spo.Rakefile} RENAMED Viewed

File without changes

data/install_scripts/organisms/rake-include.rb CHANGED Viewed

@@ -192,23 +192,18 @@ end
 file 'gene.go' do
-  data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:go], :exclude => $go[:exclude], :fix => $go[:fix])
+  data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:go], :exclude => $go[:exclude], :fix => $go[:fix], :flatten => true)
-  data = data.collect{|code, value_lists|
-    [code, value_lists.flatten.select{|ref| ref =~ /GO:\d+/}.collect{|ref| ref.match(/(GO:\d+)/)[1]}]
-  }.select{|p|  p[1].any?}
+  Open.write('gene.go', data.collect { |gene, values|
+    goterms = values.select{|v| v =~ /GO:/}.collect{|v| v.match(/(GO:\d+)/)[1]}
+    goterms.empty? ? nil : "%s\t%s" % [gene, values.uniq.join("|")]
+  }.compact.join("\n"))
-  Open.write('gene.go',
-              data.collect{|p|
-                p[1].uniq.collect{|go|
-                  "#{p[0]}\t#{go}"
-                }.join("\n")
-              }.join("\n")
-            )
 end
 file 'gene_go.pmid' do
-  data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:pmid], :exclude => $go[:exclude], :fix => $go[:fix])
+  data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:pmid], :exclude => $go[:exclude], :fix => $go[:fix], :flatten => true)
   data = data.collect{|code, value_lists|
     [code, value_lists.flatten.select{|ref| ref =~ /PMID:\d+/}.collect{|ref| ref.match(/PMID:(\d+)/)[1]}]
@@ -216,8 +211,9 @@ file 'gene_go.pmid' do
   Open.write('gene_go.pmid',
               data.collect{|p|
-                p[1].uniq.collect{|pmid| "#{p[0]}\t#{pmid}" }.join("\n")
-              }.join("\n")
+                next if p[1].empty?
+                "#{p[0]}\t#{p[1].uniq.join("|")}"
+              }.compact.join("\n")
             )
 end
@@ -230,11 +226,9 @@ file 'gene.pmid' do
     Open.write('gene.pmid',
                data.collect{|code,pmids|
-      next if translations && ! translations[code]
-      code = translations[code].first if translations
-      pmids.collect{|pmid|
-                 "#{ code }\t#{pmid}"
-      }.compact.join("\n")
+                 next if translations && ! translations[code]
+                 code = translations[code].first if translations
+                 "#{code}\t#{pmids.uniq.join("|")}"
     }.compact.join("\n")
               )
   rescue Entrez::NoFileError
@@ -256,3 +250,5 @@ task 'update' do
   Rake::Task['all'].invoke
 end
+task 'default' => 'all'

data/lib/rbbt.rb CHANGED Viewed

@@ -1,6 +1,3 @@
-$:.unshift(File.dirname(__FILE__)) unless
-  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
 require 'fileutils'
 require 'yaml'

data/lib/rbbt/ner/rnorm.rb CHANGED Viewed

@@ -60,9 +60,9 @@ class Normalizer
       }
       # Get all at once, better performance
       genes = Entrez.get_gene(code2entrez.values)
-      code2entrez_genes = code2entrez.collect{|p| [p[0], genes[p[1]]]}
+      code2entrez_genes = code2entrez.collect{|key, value| [key, genes[value]]}
       code2entrez_genes.collect{|p|
         [p[0], Entrez.gene_text_similarity(p[1], text)]

data/lib/rbbt/sources/go.rb CHANGED Viewed

@@ -4,7 +4,9 @@ require 'rbbt'
 # This module holds helper methods to deal with the Gene Ontology files. Right
 # now all it does is provide a translation from id to the actual names.
 module GO
   @@info = nil
+  MULTIPLE_VALUE_FIELDS = %w(is_a)
   # This method needs to be called before any translations can be made, it is
   # called automatically the first time the id2name method is called. It loads
@@ -20,10 +22,25 @@ module GO
           select{|l| l =~ /:/}.
           each{|l|
             key, value = l.chomp.match(/(.*?):(.*)/).values_at(1,2)
-            term_info[key.strip] = value.strip
+            if MULTIPLE_VALUE_FIELDS.include? key.strip
+              term_info[key.strip] ||= []
+              term_info[key.strip] << value.strip
+            else
+              term_info[key.strip] = value.strip
+            end
           }
         @@info[term_info["id"]] = term_info
-      }
+    }
+  end
+  def self.info
+    self.init unless @@info
+    @@info
+  end
+  def self.goterms
+    self.init unless @@info
+    @@info.keys
   end
   def self.id2name(id)
@@ -31,10 +48,38 @@ module GO
     if id.kind_of? Array
       @@info.values_at(*id).collect{|i| i['name'] if i}
     else
-      return "Name not found" unless @@info[id]
+      return nil if @@info[id].nil?
       @@info[id]['name']
     end
   end
+  def self.id2ancestors(id)
+    self.init unless @@info
+    if id.kind_of? Array
+      @@info.values_at(*id).
+        select{|i| ! i['is_a'].nil?}.
+        collect{|i| i['is_a'].collect{|id|
+          id.match(/(GO:\d+)/)[1] if id.match(/(GO:\d+)/)
+        }.compact
+      }
+    else
+      return [] if @@info[id].nil? || @@info[id]['is_a'].nil?
+      @@info[id]['is_a'].
+        collect{|id|
+        id.match(/(GO:\d+)/)[1] if id.match(/(GO:\d+)/)
+      }.compact
+    end
+  end
+  def self.id2namespace(id)
+    self.init unless @@info
+    if id.kind_of? Array
+      @@info.values_at(*id).collect{|i| i['namespace'] if i}
+    else
+      return nil if @@info[id].nil?
+      @@info[id]['namespace']
+    end
+  end
 end

data/lib/rbbt/sources/organism.rb CHANGED Viewed

@@ -93,13 +93,7 @@ module Organism
   # Returns a hash with the list of go terms for each gene id. Gene ids are in
   # Rbbt native format for that organism.
   def self.goterms(org)
-    goterms = {}
-    Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/gene.go")).each_line{|l|
-      gene, go = l.chomp.split(/\t/)
-      goterms[gene.strip] ||= []
-      goterms[gene.strip] << go.strip
-    }
-    goterms
+    Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/gene.go"), :flatten => true)
   end
   # Return list of PubMed ids associated to the organism. Determined using a
@@ -209,33 +203,34 @@ module Organism
     pos
   end
-  def self.id_index(org, option = {})
-    native = option[:native]
-    other  = option[:other]
-    option[:case_sensitive] = false if option[:case_sensitive].nil?
+  def self.id_index(org, options = {})
+    native = options[:native]
+    other  = options[:other]
+    options[:case_sensitive] = false if options[:case_sensitive].nil?
     if native.nil? and other.nil?
-      Index.index(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers"), option)
+      Index.index(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers"), options)
     else
       supported = Organism.supported_ids(org)
       first = nil
       if native
-        first = id_position(supported,native,option)
+        first = id_position(supported,native,options)
       else
         first = 0
       end
       rest = nil
       if other
-        rest = other.collect{|name| id_position(supported,name, option)}
+        rest = other.collect{|name| id_position(supported,name, options)}
       else
         rest = (0..supported.length - 1).to_a - [first]
       end
-      option[:native] = first
-      option[:extra] = rest
-      index = Index.index(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers"), option)
+      options[:native] = first
+      options[:extra] = rest
+      options[:sep] = "\t"
+      index = Index.index(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers"), options)
       index
     end

data/lib/rbbt/util/open.rb CHANGED Viewed

@@ -171,16 +171,18 @@ module Open
   # * :native => position of the elements that will constitute the keys. By default 0.
   # * :extra => positions of the rest of elements. By default all but :native. It can be an array of positions or a single position.
   # * :sep =>  pattern to use in splitting the lines into elements, by default "\t"
+  # * :sep2 =>  pattern to use in splitting the elements into subelements, by default "|"
   # * :flatten => flatten the array of arrays that hold the values for each key into a simple array.
   # * :single => for each key select only the first of the values, instead of the complete array.
   # * :fix  => A Proc that is called to pre-process the line
   # * :exclude => A Proc that is called to check if the line must be excluded from the process.
-  def self.to_hash(filename, options = {})
+  def self.to_hash(input, options = {})
     native  = options[:native]  || 0
     extra   = options[:extra]
     exclude = options[:exclude]
     fix     = options[:fix]
     sep     = options[:sep]     || "\t"
+    sep2    = options[:sep2]    || "|"
     single  = options[:single]
     single  = false if single.nil?
     flatten = options[:flatten] || single
@@ -188,8 +190,14 @@ module Open
     extra = [extra] if extra && ! extra.is_a?( Array)
+    if StringIO === input
+      content = input
+    else
+      content = Open.read(input)
+    end
     data = {}
-    Open.read(filename).each_line{|l|
+    content.each_line{|l|
       l = fix.call(l) if fix
       next if exclude and exclude.call(l)
@@ -198,37 +206,29 @@ module Open
       next if id.nil? || id == ""
       data[id] ||= []
       if extra
-        fields = extra
+        row_fields = row_fields.values_at(*extra)
       else
-        fields = (0..(row_fields.length - 1)).to_a - [native]
+        row_fields.delete_at(native)
       end
-      fields.each_with_index{|pos,i|
-        data[id][i] ||= []
-        data[id][i] << row_fields[pos]
-      }
-    }
-    if flatten
-      data.each{|key, values|
-        if values
-          values.flatten!
-          values.collect!{|v|
-            if v != ""
-              v
-            else
-              nil
-            end
-          }
-          values.compact!
-        else
-          nil
-        end
-      }
-    end
+      if flatten
+        data[id] += row_fields.compact.collect{|v|
+          v.split(sep2)
+        }.flatten
+      else
+        row_fields.each_with_index{|value, i|
+          next if value.nil?
+          data[id][i] ||= []
+          data[id][i] += value.split(sep2)
+        }
+      end
+    }
     data = Hash[*(data.collect{|key,values| [key, values.first]}).flatten] if single
     data
   end