rbbt 1.0.3 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/install_scripts/organisms/worm.Rakefile +1 -1
- data/lib/rbbt/ner/dictionaryNER.rb +98 -0
- data/lib/rbbt/ner/regexpNER.rb +17 -11
- data/lib/rbbt/sources/organism.rb +84 -52
- data/lib/rbbt/sources/polysearch.rb +34 -15
- data/lib/rbbt/util/filecache.rb +1 -1
- metadata +3 -3
- data/install_scripts/stopwords +0 -1
| @@ -0,0 +1,98 @@ | |
| 1 | 
            +
            # This class loads a dictionary of codes with associated names; it can then
         | 
| 2 | 
            +
            # find those names in a string of text. It works word-wise.
         | 
| 3 | 
            +
            class DictionaryNER
         | 
| 4 | 
            +
             | 
| 5 | 
            +
              A_INT   = "a"[0]
         | 
| 6 | 
            +
              DOWNCASE_OFFSET = "A"[0] - "a"[0]
         | 
| 7 | 
            +
             | 
| 8 | 
            +
              require 'rbbt/bow/bow'
         | 
| 9 | 
            +
              # Divides a string of text into words. A hyphen separates words only if the
         | 
| 10 | 
            +
              # second one begins with a letter.
         | 
| 11 | 
            +
              def self.chunk(text)
         | 
| 12 | 
            +
                text.split(/(?:[\s.,]|-(?=[a-zA-Z]))+/)
         | 
| 13 | 
            +
              end
         | 
| 14 | 
            +
             | 
| 15 | 
            +
              # Simplify the text to widen the matches. Currently only downcases a leading capital letter
         | 
| 16 | 
            +
              def self.simplify(text)
         | 
| 17 | 
            +
                if text.length > 2 && text[0] < A_INT && text[1] > A_INT
         | 
| 18 | 
            +
                  text = (text[0] - DOWNCASE_OFFSET).chr + text[1..-1] 
         | 
| 19 | 
            +
                else
         | 
| 20 | 
            +
                  return text
         | 
| 21 | 
            +
                end
         | 
| 22 | 
            +
              end
         | 
| 23 | 
            +
             | 
| 24 | 
            +
              # Given a dictionary structure, find the matches in the text.
         | 
| 25 | 
            +
              def self.match(dict, text) #:nodoc:
         | 
| 26 | 
            +
             | 
| 27 | 
            +
                if Array === text
         | 
| 28 | 
            +
                  words = text
         | 
| 29 | 
            +
                else
         | 
| 30 | 
            +
                  words = chunk(text) 
         | 
| 31 | 
            +
                end
         | 
| 32 | 
            +
             | 
| 33 | 
            +
                result = {}
         | 
| 34 | 
            +
                words.each_with_index{|word, pos|
         | 
| 35 | 
            +
                  key = simplify(word)
         | 
| 36 | 
            +
                  next if dict[key].nil?
         | 
| 37 | 
            +
                  dict[key].each{|entrie|
         | 
| 38 | 
            +
                    case
         | 
| 39 | 
            +
                    when String === entrie
         | 
| 40 | 
            +
                      result[word] ||= []
         | 
| 41 | 
            +
                      result[word] << entrie unless result[word].include? entrie
         | 
| 42 | 
            +
                    when Hash === entrie
         | 
| 43 | 
            +
                      rec_words  = words[(pos + 1)..-1]
         | 
| 44 | 
            +
                      rec_result = match(entrie, rec_words)
         | 
| 45 | 
            +
                      rec_result.each{|rec_key, rec_list|
         | 
| 46 | 
            +
                        composite_key = word + ' ' + rec_key
         | 
| 47 | 
            +
                        result[composite_key] ||= []
         | 
| 48 | 
            +
                        result[composite_key] += rec_list
         | 
| 49 | 
            +
                        result[composite_key].uniq!
         | 
| 50 | 
            +
                      }
         | 
| 51 | 
            +
                    end
         | 
| 52 | 
            +
                  }
         | 
| 53 | 
            +
                }
         | 
| 54 | 
            +
                result
         | 
| 55 | 
            +
              end
         | 
| 56 | 
            +
             | 
| 57 | 
            +
              # Add a name to a structure
         | 
| 58 | 
            +
              def self.add_name(dict, name, code)
         | 
| 59 | 
            +
                if Array === name
         | 
| 60 | 
            +
                  words = name
         | 
| 61 | 
            +
                else
         | 
| 62 | 
            +
                  words = chunk(name) 
         | 
| 63 | 
            +
                end
         | 
| 64 | 
            +
             | 
| 65 | 
            +
                key = simplify(words.shift)
         | 
| 66 | 
            +
                if words.empty?
         | 
| 67 | 
            +
                  dict[key] ||= []
         | 
| 68 | 
            +
                  dict[key] << code unless dict[key].include? code
         | 
| 69 | 
            +
                else
         | 
| 70 | 
            +
                  rec_dict = {}
         | 
| 71 | 
            +
                  add_name(rec_dict, words , code)
         | 
| 72 | 
            +
                  dict[key] ||= []
         | 
| 73 | 
            +
                  dict[key] << rec_dict
         | 
| 74 | 
            +
                end
         | 
| 75 | 
            +
              end
         | 
| 76 | 
            +
             | 
| 77 | 
            +
              def self.load(dictionary)
         | 
| 78 | 
            +
                dict = {}
         | 
| 79 | 
            +
             | 
| 80 | 
            +
                dictionary = File.open(dictionary).read if File.exists? dictionary
         | 
| 81 | 
            +
             | 
| 82 | 
            +
                dictionary.each_line{|l|
         | 
| 83 | 
            +
                  names = l.chomp.split(/\t/)
         | 
| 84 | 
            +
                  code  = names.shift
         | 
| 85 | 
            +
                  names.each{|name| add_name(dict, name, code) }
         | 
| 86 | 
            +
                }
         | 
| 87 | 
            +
                dict
         | 
| 88 | 
            +
              end
         | 
| 89 | 
            +
             | 
| 90 | 
            +
              def initialize(dictionary)
         | 
| 91 | 
            +
                @dict = DictionaryNER.load(dictionary)
         | 
| 92 | 
            +
              end
         | 
| 93 | 
            +
             | 
| 94 | 
            +
              def match(text)
         | 
| 95 | 
            +
                DictionaryNER.match(@dict, text)
         | 
| 96 | 
            +
              end
         | 
| 97 | 
            +
             | 
| 98 | 
            +
            end
         | 
    
        data/lib/rbbt/ner/regexpNER.rb
    CHANGED
    
    | @@ -7,27 +7,33 @@ class RegExpNER | |
| 7 7 | 
             
                res = [res] unless Array === res
         | 
| 8 8 |  | 
| 9 9 | 
             
                res.collect{|re|
         | 
| 10 | 
            -
                   | 
| 11 | 
            -
             | 
| 12 | 
            -
                  else
         | 
| 13 | 
            -
                    nil
         | 
| 14 | 
            -
                  end
         | 
| 15 | 
            -
                }.compact
         | 
| 10 | 
            +
                  text.scan(re) 
         | 
| 11 | 
            +
                }.flatten
         | 
| 16 12 | 
             
              end
         | 
| 17 13 |  | 
| 18 | 
            -
              def self. | 
| 14 | 
            +
              def self.build_re_old(names, ignorecase=true)
         | 
| 19 15 | 
             
                names.compact.select{|n| n != ""}.
         | 
| 20 16 | 
             
                  sort{|a,b| b.length <=> a.length}.
         | 
| 21 17 | 
             
                  collect{|n| 
         | 
| 22 18 | 
             
                    re = Regexp.quote(n).gsub(/\\?\s/,'\s+')
         | 
| 23 | 
            -
                    /(?:^|[^\w])(#{ re })(?:$|[^\w])/i
         | 
| 24 19 | 
             
                  }
         | 
| 25 20 | 
             
              end
         | 
| 26 21 |  | 
| 22 | 
            +
              def self.build_re(names, ignorecase=true)
         | 
| 23 | 
            +
                res = names.compact.select{|n| n != ""}.
         | 
| 24 | 
            +
                  sort{|a,b| b.length <=> a.length}.
         | 
| 25 | 
            +
                  collect{|n| 
         | 
| 26 | 
            +
                    Regexp.quote(n)
         | 
| 27 | 
            +
                  }
         | 
| 28 | 
            +
             | 
| 29 | 
            +
                /\b(#{ res.join("|").gsub(/\\?\s/,'\s+') })\b/
         | 
| 30 | 
            +
              end
         | 
| 31 | 
            +
             | 
| 32 | 
            +
             | 
| 27 33 | 
             
              def initialize(lexicon, options = {})
         | 
| 28 | 
            -
                options[:flatten] | 
| 34 | 
            +
                options[:flatten]    = true
         | 
| 29 35 | 
             
                options[:ignorecase] = true if options[:ignorecase].nil?
         | 
| 30 | 
            -
                options[:stopwords] | 
| 36 | 
            +
                options[:stopwords]  = true if options[:stopwords].nil?
         | 
| 31 37 |  | 
| 32 38 | 
             
                data = Open.to_hash(lexicon, options)
         | 
| 33 39 |  | 
| @@ -55,7 +61,7 @@ class RegExpNER | |
| 55 61 | 
             
              end
         | 
| 56 62 |  | 
| 57 63 | 
             
              def match(text)
         | 
| 58 | 
            -
                match_hash(text) | 
| 64 | 
            +
                match_hash(text)
         | 
| 59 65 | 
             
              end
         | 
| 60 66 |  | 
| 61 67 | 
             
            end
         | 
| @@ -2,13 +2,18 @@ require 'rbbt' | |
| 2 2 | 
             
            require 'rbbt/util/open'
         | 
| 3 3 | 
             
            require 'rbbt/util/index'
         | 
| 4 4 |  | 
| 5 | 
            -
             | 
| 5 | 
            +
            # This module contains some Organism centric functionalities. Each organism is
         | 
| 6 | 
            +
            # identified by a keyword.
         | 
| 6 7 | 
             
            module Organism
         | 
| 7 8 |  | 
| 9 | 
            +
              # Raised when trying to access information for an organism that has not been
         | 
| 10 | 
            +
              # prepared already.
         | 
| 8 11 | 
             
              class OrganismNotProcessedError < StandardError; end
         | 
| 9 12 |  | 
| 10 | 
            -
               | 
| 11 | 
            -
             | 
| 13 | 
            +
              # Return the list of all supported organisms. The prepared flag is used to
         | 
| 14 | 
            +
              # show only those that have been prepared.
         | 
| 15 | 
            +
              def self.all(prepared = true)
         | 
| 16 | 
            +
                if prepared
         | 
| 12 17 | 
             
                  Dir.glob(File.join(Rbbt.datadir,'/organisms/') + '/*/identifiers').collect{|f| File.basename(File.dirname(f))}
         | 
| 13 18 | 
             
                else
         | 
| 14 19 | 
             
                  Dir.glob(File.join(Rbbt.datadir,'/organisms/') + '/*').select{|f| File.directory? f}.collect{|f| File.basename(f)}
         | 
| @@ -16,72 +21,32 @@ module Organism | |
| 16 21 | 
             
              end
         | 
| 17 22 |  | 
| 18 23 |  | 
| 24 | 
            +
              # Return the complete name of an organism. The org parameter is the organism
         | 
| 25 | 
            +
              # keyword
         | 
| 19 26 | 
             
              def self.name(org)
         | 
| 20 27 | 
             
                raise OrganismNotProcessedError, "Missing 'name' file" if ! File.exists? File.join(Rbbt.datadir,"organisms/#{ org }/name")
         | 
| 21 28 | 
             
                Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/name"))
         | 
| 22 29 | 
             
              end
         | 
| 23 30 |  | 
| 31 | 
            +
              # Hash linking all the organism long names with their keywords in Rbbt. It's
         | 
| 32 | 
            +
              # the inverse of the name method.
         | 
| 24 33 | 
             
              NAME2ORG = {}
         | 
| 25 34 | 
             
              Organism::all.each{|org|  
         | 
| 26 35 | 
             
                name = Organism.name(org).strip.downcase
         | 
| 27 36 | 
             
                NAME2ORG[name] = org
         | 
| 28 37 | 
             
              }
         | 
| 29 38 |  | 
| 39 | 
            +
             | 
| 40 | 
            +
              # Return the keyword associated with an organism.
         | 
| 30 41 | 
             
              def self.name2org(name)  
         | 
| 31 42 | 
             
                NAME2ORG[name.strip.downcase]
         | 
| 32 43 | 
             
              end
         | 
| 33 44 |  | 
| 34 | 
            -
              def self.id_formats(org)
         | 
| 35 | 
            -
                id_types = {}
         | 
| 36 | 
            -
                formats = supported_ids(org)
         | 
| 37 | 
            -
             | 
| 38 | 
            -
                text = Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers"))
         | 
| 39 | 
            -
                
         | 
| 40 | 
            -
                if text.respond_to? :collect
         | 
| 41 | 
            -
                  lines = text.collect
         | 
| 42 | 
            -
                else
         | 
| 43 | 
            -
                  lines = text.lines
         | 
| 44 | 
            -
                end
         | 
| 45 | 
            -
             | 
| 46 | 
            -
                lines.each{|l|
         | 
| 47 | 
            -
                  ids_per_type = l.split(/\t/)
         | 
| 48 | 
            -
                  formats.zip(ids_per_type).each{|p|
         | 
| 49 | 
            -
                    format = p[0]
         | 
| 50 | 
            -
                    ids = p[1].split(/\|/)
         | 
| 51 | 
            -
                    ids.each{|id|
         | 
| 52 | 
            -
                      next if id.nil? || id == ""
         | 
| 53 | 
            -
                      id_types[id.downcase] ||= []
         | 
| 54 | 
            -
                      id_types[id.downcase] << format unless id_types[id.downcase].include? format
         | 
| 55 | 
            -
                    }
         | 
| 56 | 
            -
                  }
         | 
| 57 | 
            -
                }
         | 
| 58 | 
            -
             | 
| 59 | 
            -
                return id_types
         | 
| 60 | 
            -
              end
         | 
| 61 | 
            -
             | 
| 62 | 
            -
              def self.guessIdFormat(formats, query)
         | 
| 63 | 
            -
                query = query.compact.collect{|gene| gene.downcase}.uniq
         | 
| 64 | 
            -
                if String === formats
         | 
| 65 | 
            -
                  formats = id_formats(formats)
         | 
| 66 | 
            -
                end
         | 
| 67 | 
            -
             | 
| 68 | 
            -
                return nil if formats.values.empty?
         | 
| 69 | 
            -
                values = formats.values_at(*query)
         | 
| 70 | 
            -
                return nil if values.empty?
         | 
| 71 | 
            -
                
         | 
| 72 | 
            -
                format_count = {}
         | 
| 73 | 
            -
                values.compact.collect{|types| types.uniq}.flatten.each{|f| 
         | 
| 74 | 
            -
                  format_count[f] ||= 0
         | 
| 75 | 
            -
                  format_count[f] += 1
         | 
| 76 | 
            -
                }
         | 
| 77 | 
            -
                
         | 
| 78 | 
            -
                return nil if format_count.values.empty?
         | 
| 79 | 
            -
                format_count.select{|k,v| v > (query.length / 10)}.sort{|a,b| b[1] <=> a[1]}.first
         | 
| 80 | 
            -
              end
         | 
| 81 | 
            -
             | 
| 82 45 | 
             
              # FIXME: The NER-related stuff is harder to install, that's why we hide the
         | 
| 83 46 | 
             
              # requires next to where they are needed, next to options
         | 
| 84 47 |  | 
| 48 | 
            +
              # Return a NER object which could be of RNER, Abner or Banner class, this is
         | 
| 49 | 
            +
              # selected using the type parameter. 
         | 
| 85 50 | 
             
              def self.ner(org, type=:rner, options = {})
         | 
| 86 51 |  | 
| 87 52 | 
             
                case type.to_sym
         | 
| @@ -103,6 +68,7 @@ module Organism | |
| 103 68 |  | 
| 104 69 | 
             
              end
         | 
| 105 70 |  | 
| 71 | 
            +
              # Return a normalization object.
         | 
| 106 72 | 
             
              def self.norm(org, to_entrez = nil)
         | 
| 107 73 | 
             
                require 'rbbt/ner/rnorm'
         | 
| 108 74 | 
             
                if to_entrez.nil?
         | 
| @@ -117,11 +83,15 @@ module Organism | |
| 117 83 | 
             
                Normalizer.new(File.join(Rbbt.datadir,"organisms/#{ org }/lexicon"), :to_entrez => to_entrez, :file => token_file, :max_candidates => 20)
         | 
| 118 84 | 
             
              end
         | 
| 119 85 |  | 
| 86 | 
            +
              # Returns a hash with the names associated with each gene id. The ids are
         | 
| 87 | 
            +
              # in Rbbt native format for that organism.
         | 
| 120 88 | 
             
              def self.lexicon(org, options = {})
         | 
| 121 | 
            -
                options | 
| 89 | 
            +
                options = {:sep => "\t|\\|", :flatten => true}.merge(options)
         | 
| 122 90 | 
             
                Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/lexicon"),options)
         | 
| 123 91 | 
             
              end
         | 
| 124 92 |  | 
| 93 | 
            +
              # Returns a hash with the list of go terms for each gene id. Gene ids are in
         | 
| 94 | 
            +
              # Rbbt native format for that organism.
         | 
| 125 95 | 
             
              def self.goterms(org)
         | 
| 126 96 | 
             
                goterms = {}
         | 
| 127 97 | 
             
                Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/gene.go")).each_line{|l|
         | 
| @@ -132,18 +102,29 @@ module Organism | |
| 132 102 | 
             
                goterms
         | 
| 133 103 | 
             
              end
         | 
| 134 104 |  | 
| 105 | 
            +
              # Return list of PubMed ids associated to the organism. Determined using a
         | 
| 106 | 
            +
              # PubMed query with the name of the organism
         | 
| 135 107 | 
             
              def self.literature(org)
         | 
| 136 108 | 
             
                Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/all.pmid")).scan(/\d+/)
         | 
| 137 109 | 
             
              end
         | 
| 138 110 |  | 
| 111 | 
            +
              # Return hash that associates genes to a list of PubMed ids.
         | 
| 139 112 | 
             
              def self.gene_literature(org)
         | 
| 140 113 | 
             
                Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/gene.pmid"), :flatten => true)
         | 
| 141 114 | 
             
              end
         | 
| 142 115 |  | 
| 116 | 
            +
              # Return hash that associates genes to a list of PubMed ids. Includes only
         | 
| 117 | 
            +
              # those found to support GO term associations.
         | 
| 143 118 | 
             
              def self.gene_literature_go(org)
         | 
| 144 119 | 
             
                Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/gene_go.pmid"), :flatten => true)
         | 
| 145 120 | 
             
              end
         | 
| 146 121 |  | 
| 122 | 
            +
              # Returns a list with the names of the id formats supported for an organism.
         | 
| 123 | 
            +
              # If examples are produced, the list is of [format, example] pairs.
         | 
| 124 | 
            +
              # 
         | 
| 125 | 
            +
              # *Options:*
         | 
| 126 | 
            +
              #
         | 
| 127 | 
            +
              # *examples:* Include example ids for each format
         | 
| 147 128 | 
             
              def self.supported_ids(org, options = {})
         | 
| 148 129 | 
             
                formats  = []
         | 
| 149 130 | 
             
                examples = [] if options[:examples]
         | 
| @@ -166,6 +147,57 @@ module Organism | |
| 166 147 | 
             
                formats.zip(examples)
         | 
| 167 148 | 
             
              end
         | 
| 168 149 |  | 
| 150 | 
            +
              # Creates a hash where each possible id is associated with the names of the
         | 
| 151 | 
            +
              # formats (it is possible for different formats to have the same
         | 
| 152 | 
            +
              # id). This is used in the guessIdFormat method. 
         | 
| 153 | 
            +
              def self.id_formats(org) 
         | 
| 154 | 
            +
                id_types = {} 
         | 
| 155 | 
            +
                formats = supported_ids(org)
         | 
| 156 | 
            +
             | 
| 157 | 
            +
                text = Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers"))
         | 
| 158 | 
            +
                
         | 
| 159 | 
            +
                if text.respond_to? :collect
         | 
| 160 | 
            +
                  lines = text.collect
         | 
| 161 | 
            +
                else
         | 
| 162 | 
            +
                  lines = text.lines
         | 
| 163 | 
            +
                end
         | 
| 164 | 
            +
             | 
| 165 | 
            +
                lines.each{|l|
         | 
| 166 | 
            +
                  ids_per_type = l.split(/\t/)
         | 
| 167 | 
            +
                  formats.zip(ids_per_type).each{|p|
         | 
| 168 | 
            +
                    format = p[0]
         | 
| 169 | 
            +
                    ids = p[1].split(/\|/)
         | 
| 170 | 
            +
                    ids.each{|id|
         | 
| 171 | 
            +
                      next if id.nil? || id == ""
         | 
| 172 | 
            +
                      id_types[id.downcase] ||= []
         | 
| 173 | 
            +
                      id_types[id.downcase] << format unless id_types[id.downcase].include? format
         | 
| 174 | 
            +
                    }
         | 
| 175 | 
            +
                  }
         | 
| 176 | 
            +
                }
         | 
| 177 | 
            +
             | 
| 178 | 
            +
                return id_types
         | 
| 179 | 
            +
              end
         | 
| 180 | 
            +
             | 
| 181 | 
            +
              def self.guessIdFormat(formats, query)
         | 
| 182 | 
            +
                query = query.compact.collect{|gene| gene.downcase}.uniq
         | 
| 183 | 
            +
                if String === formats
         | 
| 184 | 
            +
                  formats = id_formats(formats)
         | 
| 185 | 
            +
                end
         | 
| 186 | 
            +
             | 
| 187 | 
            +
                return nil if formats.values.empty?
         | 
| 188 | 
            +
                values = formats.values_at(*query)
         | 
| 189 | 
            +
                return nil if values.empty?
         | 
| 190 | 
            +
                
         | 
| 191 | 
            +
                format_count = {}
         | 
| 192 | 
            +
                values.compact.collect{|types| types.uniq}.flatten.each{|f| 
         | 
| 193 | 
            +
                  format_count[f] ||= 0
         | 
| 194 | 
            +
                  format_count[f] += 1
         | 
| 195 | 
            +
                }
         | 
| 196 | 
            +
                
         | 
| 197 | 
            +
                return nil if format_count.values.empty?
         | 
| 198 | 
            +
                format_count.select{|k,v| v > (query.length / 10)}.sort{|a,b| b[1] <=> a[1]}.first
         | 
| 199 | 
            +
              end
         | 
| 200 | 
            +
             | 
| 169 201 | 
             
              def self.id_position(supported_ids, id_name, options = {})
         | 
| 170 202 | 
             
                pos = 0
         | 
| 171 203 | 
             
                supported_ids.each_with_index{|id, i| 
         | 
| @@ -1,6 +1,7 @@ | |
| 1 1 | 
             
            require 'rbbt'
         | 
| 2 2 | 
             
            require 'rbbt/util/open'
         | 
| 3 3 | 
             
            require 'rbbt/ner/regexpNER'
         | 
| 4 | 
            +
            require 'rbbt/ner/dictionaryNER'
         | 
| 4 5 |  | 
| 5 6 | 
             
            # Find terms in the Polysearch thesauri using simple regular expression
         | 
| 6 7 | 
             
            # matching. Note that the first time the methods are used the corresponding
         | 
| @@ -11,13 +12,14 @@ module Polysearch | |
| 11 12 |  | 
| 12 13 | 
             
              @@names = {}
         | 
| 13 14 | 
             
              def self.type_names(type) #:nodoc:
         | 
| 14 | 
            -
                @@names[type] ||= Open.to_hash(File.join(Rbbt.datadir,'dbs','polysearch',type + '.txt'), :single => true)
         | 
| 15 | 
            +
                @@names[type] ||= Open.to_hash(File.join(Rbbt.datadir,'dbs','polysearch',type.to_s + '.txt'), :single => true)
         | 
| 15 16 | 
             
              end
         | 
| 16 17 |  | 
| 17 18 |  | 
| 18 19 | 
             
              @@indexes = {}
         | 
| 19 20 | 
             
              def self.type_index(type) #:nodoc:
         | 
| 20 | 
            -
                @@indexes[type] ||= RegExpNER.new(File.join(Rbbt.datadir,'dbs','polysearch',type + '.txt'))
         | 
| 21 | 
            +
                @@indexes[type] ||= RegExpNER.new(File.join(Rbbt.datadir,'dbs','polysearch',type.to_s + '.txt'))
         | 
| 22 | 
            +
                #@@indexes[type] ||= DictionaryNER.new(File.join(Rbbt.datadir,'dbs','polysearch',type + '.txt'))
         | 
| 21 23 | 
             
              end
         | 
| 22 24 |  | 
| 23 25 | 
             
              # Find matches in a string of text, the types array specifies which thesauri
         | 
| @@ -32,7 +34,7 @@ module Polysearch | |
| 32 34 |  | 
| 33 35 | 
             
                matches = {}
         | 
| 34 36 | 
             
                types.collect{|type|
         | 
| 35 | 
            -
                  matches.merge!(type_index(type). | 
| 37 | 
            +
                  matches.merge!(type_index(type).match(text))
         | 
| 36 38 | 
             
                }
         | 
| 37 39 |  | 
| 38 40 | 
             
                matches
         | 
| @@ -45,18 +47,17 @@ module Polysearch | |
| 45 47 |  | 
| 46 48 | 
             
            end
         | 
| 47 49 |  | 
| 48 | 
            -
            if __FILE__ == $0 | 
| 49 | 
            -
             | 
| 50 | 
            +
            if __FILE__ == $0
         | 
| 50 51 | 
             
                text =<<-EOT
         | 
| 51 52 |  | 
| 52 53 | 
             
                 Background  Microorganisms adapt their transcriptome by integrating
         | 
| 53 54 | 
             
                 multiple chemical and physical signals from their environment. Shake-flask
         | 
| 54 | 
            -
             | 
| 55 | 
            -
             | 
| 56 | 
            -
             | 
| 57 | 
            -
             | 
| 58 | 
            -
             | 
| 59 | 
            -
             | 
| 55 | 
            +
                 cultivation does not allow precise manipulation of individual culture
         | 
| 56 | 
            +
                 parameters and therefore precludes a quantitative analysis of the
         | 
| 57 | 
            +
                 (combinatorial) influence of these parameters on transcriptional
         | 
| 58 | 
            +
                 regulation. Steady-state chemostat cultures, which do enable accurate
         | 
| 59 | 
            +
                 control, measurement and manipulation of individual cultivation parameters
         | 
| 60 | 
            +
                 (e.g. specific growth rate, temperature, identity of the growth-limiting
         | 
| 60 61 | 
             
                 nutrient) appear to provide a promising experimental platform for such a
         | 
| 61 62 | 
             
                 combinatorial analysis. Results  A microarray compendium of 170
         | 
| 62 63 | 
             
                 steady-state chemostat cultures of the yeast Saccharomyces cerevisiae is
         | 
| @@ -76,13 +77,31 @@ if __FILE__ == $0 | |
| 76 77 | 
             
                 combinatorial effects of environmental parameters on the transcriptome is
         | 
| 77 78 | 
             
                 crucial for understanding transcriptional regulation. Chemostat
         | 
| 78 79 | 
             
                 cultivation offers a powerful tool for such an approach. Keywords:
         | 
| 79 | 
            -
             | 
| 80 | 
            -
                Cerebellar 
         | 
| 81 | 
            -
                stroke syndrome
         | 
| 80 | 
            +
                 chemostat steady state samples Cerebellar stroke syndrome
         | 
| 82 81 |  | 
| 83 82 |  | 
| 84 83 | 
             
                EOT
         | 
| 85 84 |  | 
| 86 | 
            -
                 | 
| 85 | 
            +
                require 'benchmark'
         | 
| 86 | 
            +
                require 'ruby-prof'
         | 
| 87 | 
            +
             | 
| 88 | 
            +
                puts Benchmark.measure{
         | 
| 89 | 
            +
                  p Polysearch.match(text,'disease')
         | 
| 90 | 
            +
                }
         | 
| 91 | 
            +
             | 
| 92 | 
            +
             | 
| 93 | 
            +
                RubyProf.start
         | 
| 94 | 
            +
             | 
| 95 | 
            +
                Polysearch.match(text,'disease')
         | 
| 96 | 
            +
             | 
| 97 | 
            +
                result = RubyProf.stop
         | 
| 98 | 
            +
             | 
| 99 | 
            +
                # Print a flat profile to text
         | 
| 100 | 
            +
                printer = RubyProf::FlatPrinter.new(result)
         | 
| 101 | 
            +
                printer.print(STDOUT, 0)
         | 
| 102 | 
            +
             | 
| 103 | 
            +
                puts Benchmark.measure{
         | 
| 104 | 
            +
                  10.times{ p Polysearch.match(text,'disease') }
         | 
| 105 | 
            +
                }
         | 
| 87 106 |  | 
| 88 107 | 
             
            end
         | 
    
        data/lib/rbbt/util/filecache.rb
    CHANGED
    
    | @@ -20,7 +20,7 @@ module FileCache | |
| 20 20 | 
             
                  raise FileCache::BadPathError, "Character / not allowed in name: #{ filename }"
         | 
| 21 21 | 
             
                end
         | 
| 22 22 | 
             
                if filename !~ /.+\..+/
         | 
| 23 | 
            -
                  raise FileCache::BadPathError, "Filename must have name and extension: name.ext"
         | 
| 23 | 
            +
                  raise FileCache::BadPathError, "Filename '#{filename}' must have name and extension: name.ext"
         | 
| 24 24 | 
             
                end
         | 
| 25 25 | 
             
              end
         | 
| 26 26 |  | 
    
        metadata
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification 
         | 
| 2 2 | 
             
            name: rbbt
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version 
         | 
| 4 | 
            -
              version: 1.0.3
         | 
| 4 | 
            +
              version: 1.1.0
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors: 
         | 
| 7 7 | 
             
            - Miguel Vazquez
         | 
| @@ -9,7 +9,7 @@ autorequire: | |
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 11 |  | 
| 12 | 
            -
            date: 2009- | 
| 12 | 
            +
            date: 2009-12-02 00:00:00 +01:00
         | 
| 13 13 | 
             
            default_executable: rbbt_config
         | 
| 14 14 | 
             
            dependencies: 
         | 
| 15 15 | 
             
            - !ruby/object:Gem::Dependency 
         | 
| @@ -99,7 +99,6 @@ files: | |
| 99 99 | 
             
            - install_scripts/organisms/sgd.Rakefile
         | 
| 100 100 | 
             
            - install_scripts/organisms/tair.Rakefile
         | 
| 101 101 | 
             
            - install_scripts/organisms/worm.Rakefile
         | 
| 102 | 
            -
            - install_scripts/stopwords
         | 
| 103 102 | 
             
            - install_scripts/wordlists/consonants
         | 
| 104 103 | 
             
            - install_scripts/wordlists/stopwords
         | 
| 105 104 | 
             
            - lib/rbbt.rb
         | 
| @@ -108,6 +107,7 @@ files: | |
| 108 107 | 
             
            - lib/rbbt/bow/dictionary.rb
         | 
| 109 108 | 
             
            - lib/rbbt/ner/abner.rb
         | 
| 110 109 | 
             
            - lib/rbbt/ner/banner.rb
         | 
| 110 | 
            +
            - lib/rbbt/ner/dictionaryNER.rb
         | 
| 111 111 | 
             
            - lib/rbbt/ner/regexpNER.rb
         | 
| 112 112 | 
             
            - lib/rbbt/ner/rner.rb
         | 
| 113 113 | 
             
            - lib/rbbt/ner/rnorm.rb
         | 
    
        data/install_scripts/stopwords
    DELETED
    
    | @@ -1 +0,0 @@ | |
| 1 | 
            -
            a been get least our them whether about before getting left ourselves then which after being go less out there while again between goes let over these who ago but going like per they whoever all by gone make put this whom almost came got many putting those whose also can gotten may same through why always cannot had maybe saw till will am come has me see to with an could have mine seen too within and did having more shall two without another do he most she unless won't any does her much should until would anybody doing here my so up wouldn't anyhow done him myself some upon yet anyone down his never somebody us you anything each how no someone very your anyway else i none something was are even if not stand we as ever in now such went at every into of sure were away everyone is off take what back everything isn't on than whatever be for it one that what's became from just onto the when because front last or their where 
         |