rbbt 1.0.3 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -96,7 +96,7 @@ module Open
96
96
  else
97
97
  [id, name, extra].join("\t") + "\n"
98
98
  end
99
- }
99
+ }.join("\n")
100
100
  elsif url =~ /gene_ids/
101
101
  return content.gsub(/,/,"\t")
102
102
  else
@@ -0,0 +1,98 @@
1
+ # This class loads a dictionary of codes with associated names, it then can
2
+ # find those names in a string of text. It works word-wise.
3
+ class DictionaryNER
4
+
5
+ A_INT = "a"[0]
6
+ DOWNCASE_OFFSET = "A"[0] - "a"[0]
7
+
8
+ require 'rbbt/bow/bow'
9
+ # Divides a string of text into words. A hyphen separates words only if the
10
+ # word after it begins with a letter.
11
+ def self.chunk(text)
12
+ text.split(/(?:[\s.,]|-(?=[a-zA-Z]))+/)
13
+ end
14
+
15
+ # Simplify the text to widen the matches. Currently it only lowercases the initial capital of a word.
16
+ def self.simplify(text)
17
+ if text.length > 2 && text[0] < A_INT && text[1] > A_INT
18
+ text = (text[0] - DOWNCASE_OFFSET).chr + text[1..-1]
19
+ else
20
+ return text
21
+ end
22
+ end
23
+
24
+ # Given a dictionary structure, find the matches in the text.
25
+ def self.match(dict, text) #:nodoc:
26
+
27
+ if Array === text
28
+ words = text
29
+ else
30
+ words = chunk(text)
31
+ end
32
+
33
+ result = {}
34
+ words.each_with_index{|word, pos|
35
+ key = simplify(word)
36
+ next if dict[key].nil?
37
+ dict[key].each{|entrie|
38
+ case
39
+ when String === entrie
40
+ result[word] ||= []
41
+ result[word] << entrie unless result[word].include? entrie
42
+ when Hash === entrie
43
+ rec_words = words[(pos + 1)..-1]
44
+ rec_result = match(entrie, rec_words)
45
+ rec_result.each{|rec_key, rec_list|
46
+ composite_key = word + ' ' + rec_key
47
+ result[composite_key] ||= []
48
+ result[composite_key] += rec_list
49
+ result[composite_key].uniq!
50
+ }
51
+ end
52
+ }
53
+ }
54
+ result
55
+ end
56
+
57
+ # Add a name to a structure
58
+ def self.add_name(dict, name, code)
59
+ if Array === name
60
+ words = name
61
+ else
62
+ words = chunk(name)
63
+ end
64
+
65
+ key = simplify(words.shift)
66
+ if words.empty?
67
+ dict[key] ||= []
68
+ dict[key] << code unless dict[key].include? code
69
+ else
70
+ rec_dict = {}
71
+ add_name(rec_dict, words , code)
72
+ dict[key] ||= []
73
+ dict[key] << rec_dict
74
+ end
75
+ end
76
+
77
+ def self.load(dictionary)
78
+ dict = {}
79
+
80
+ dictionary = File.open(dictionary).read if File.exists? dictionary
81
+
82
+ dictionary.each_line{|l|
83
+ names = l.chomp.split(/\t/)
84
+ code = names.shift
85
+ names.each{|name| add_name(dict, name, code) }
86
+ }
87
+ dict
88
+ end
89
+
90
+ def initialize(dictionary)
91
+ @dict = DictionaryNER.load(dictionary)
92
+ end
93
+
94
+ def match(text)
95
+ DictionaryNER.match(@dict, text)
96
+ end
97
+
98
+ end
@@ -7,27 +7,33 @@ class RegExpNER
7
7
  res = [res] unless Array === res
8
8
 
9
9
  res.collect{|re|
10
- if text.match(re)
11
- $1
12
- else
13
- nil
14
- end
15
- }.compact
10
+ text.scan(re)
11
+ }.flatten
16
12
  end
17
13
 
18
- def self.build_re(names, ignorecase=true)
14
+ def self.build_re_old(names, ignorecase=true)
19
15
  names.compact.select{|n| n != ""}.
20
16
  sort{|a,b| b.length <=> a.length}.
21
17
  collect{|n|
22
18
  re = Regexp.quote(n).gsub(/\\?\s/,'\s+')
23
- /(?:^|[^\w])(#{ re })(?:$|[^\w])/i
24
19
  }
25
20
  end
26
21
 
22
+ def self.build_re(names, ignorecase=true)
23
+ res = names.compact.select{|n| n != ""}.
24
+ sort{|a,b| b.length <=> a.length}.
25
+ collect{|n|
26
+ Regexp.quote(n)
27
+ }
28
+
29
+ /\b(#{ res.join("|").gsub(/\\?\s/,'\s+') })\b/
30
+ end
31
+
32
+
27
33
  def initialize(lexicon, options = {})
28
- options[:flatten] = true
34
+ options[:flatten] = true
29
35
  options[:ignorecase] = true if options[:ignorecase].nil?
30
- options[:stopwords] = true if options[:stopwords].nil?
36
+ options[:stopwords] = true if options[:stopwords].nil?
31
37
 
32
38
  data = Open.to_hash(lexicon, options)
33
39
 
@@ -55,7 +61,7 @@ class RegExpNER
55
61
  end
56
62
 
57
63
  def match(text)
58
- match_hash(text).values.flatten
64
+ match_hash(text)
59
65
  end
60
66
 
61
67
  end
@@ -2,13 +2,18 @@ require 'rbbt'
2
2
  require 'rbbt/util/open'
3
3
  require 'rbbt/util/index'
4
4
 
5
-
5
+ # This module contains some Organism centric functionalities. Each organism is
6
+ # identified by a keyword.
6
7
  module Organism
7
8
 
9
+ # Raised when trying to access information for an organism that has not been
10
+ # prepared already.
8
11
  class OrganismNotProcessedError < StandardError; end
9
12
 
10
- def self.all(installed = true)
11
- if installed
13
+ # Return the list of all supported organisms. The prepared flag is used to
14
+ # show only those that have been prepared.
15
+ def self.all(prepared = true)
16
+ if prepared
12
17
  Dir.glob(File.join(Rbbt.datadir,'/organisms/') + '/*/identifiers').collect{|f| File.basename(File.dirname(f))}
13
18
  else
14
19
  Dir.glob(File.join(Rbbt.datadir,'/organisms/') + '/*').select{|f| File.directory? f}.collect{|f| File.basename(f)}
@@ -16,72 +21,32 @@ module Organism
16
21
  end
17
22
 
18
23
 
24
+ # Return the complete name of an organism. The org parameter is the organism
25
+ # keyword
19
26
  def self.name(org)
20
27
  raise OrganismNotProcessedError, "Missing 'name' file" if ! File.exists? File.join(Rbbt.datadir,"organisms/#{ org }/name")
21
28
  Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/name"))
22
29
  end
23
30
 
31
+ # Hash linking the complete names of all organisms with their keywords in Rbbt.
32
+ # It is the inverse of the name method.
24
33
  NAME2ORG = {}
25
34
  Organism::all.each{|org|
26
35
  name = Organism.name(org).strip.downcase
27
36
  NAME2ORG[name] = org
28
37
  }
29
38
 
39
+
40
+ # Return the keyword associated with an organism name.
30
41
  def self.name2org(name)
31
42
  NAME2ORG[name.strip.downcase]
32
43
  end
33
44
 
34
- def self.id_formats(org)
35
- id_types = {}
36
- formats = supported_ids(org)
37
-
38
- text = Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers"))
39
-
40
- if text.respond_to? :collect
41
- lines = text.collect
42
- else
43
- lines = text.lines
44
- end
45
-
46
- lines.each{|l|
47
- ids_per_type = l.split(/\t/)
48
- formats.zip(ids_per_type).each{|p|
49
- format = p[0]
50
- ids = p[1].split(/\|/)
51
- ids.each{|id|
52
- next if id.nil? || id == ""
53
- id_types[id.downcase] ||= []
54
- id_types[id.downcase] << format unless id_types[id.downcase].include? format
55
- }
56
- }
57
- }
58
-
59
- return id_types
60
- end
61
-
62
- def self.guessIdFormat(formats, query)
63
- query = query.compact.collect{|gene| gene.downcase}.uniq
64
- if String === formats
65
- formats = id_formats(formats)
66
- end
67
-
68
- return nil if formats.values.empty?
69
- values = formats.values_at(*query)
70
- return nil if values.empty?
71
-
72
- format_count = {}
73
- values.compact.collect{|types| types.uniq}.flatten.each{|f|
74
- format_count[f] ||= 0
75
- format_count[f] += 1
76
- }
77
-
78
- return nil if format_count.values.empty?
79
- format_count.select{|k,v| v > (query.length / 10)}.sort{|a,b| b[1] <=> a[1]}.first
80
- end
81
-
82
45
  # FIXME: The NER related stuff is harder to install, that's why we hide the
83
46
  # requires next to where they are needed, next to options
84
47
 
48
+ # Return a NER object, which can be of the RNER, Abner or Banner class; the
49
+ # class is selected using the type parameter.
85
50
  def self.ner(org, type=:rner, options = {})
86
51
 
87
52
  case type.to_sym
@@ -103,6 +68,7 @@ module Organism
103
68
 
104
69
  end
105
70
 
71
+ # Return a normalization object.
106
72
  def self.norm(org, to_entrez = nil)
107
73
  require 'rbbt/ner/rnorm'
108
74
  if to_entrez.nil?
@@ -117,11 +83,15 @@ module Organism
117
83
  Normalizer.new(File.join(Rbbt.datadir,"organisms/#{ org }/lexicon"), :to_entrez => to_entrez, :file => token_file, :max_candidates => 20)
118
84
  end
119
85
 
86
+ # Returns a hash with the names associated with each gene id. The ids are
87
+ # in Rbbt native format for that organism.
120
88
  def self.lexicon(org, options = {})
121
- options[:sep] = "\t|\\|" unless options[:sep]
89
+ options = {:sep => "\t|\\|", :flatten => true}.merge(options)
122
90
  Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/lexicon"),options)
123
91
  end
124
92
 
93
+ # Returns a hash with the list of go terms for each gene id. Gene ids are in
94
+ # Rbbt native format for that organism.
125
95
  def self.goterms(org)
126
96
  goterms = {}
127
97
  Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/gene.go")).each_line{|l|
@@ -132,18 +102,29 @@ module Organism
132
102
  goterms
133
103
  end
134
104
 
105
+ # Return list of PubMed ids associated to the organism. Determined using a
106
+ # PubMed query with the name of the organism
135
107
  def self.literature(org)
136
108
  Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/all.pmid")).scan(/\d+/)
137
109
  end
138
110
 
111
+ # Return hash that associates genes to a list of PubMed ids.
139
112
  def self.gene_literature(org)
140
113
  Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/gene.pmid"), :flatten => true)
141
114
  end
142
115
 
116
+ # Return hash that associates genes to a list of PubMed ids. Includes only
117
+ # those found to support GO term associations.
143
118
  def self.gene_literature_go(org)
144
119
  Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/gene_go.pmid"), :flatten => true)
145
120
  end
146
121
 
122
+ # Returns a list with the names of the id formats supported for an organism.
123
+ # If examples are produced, the list is of [format, example] pairs.
124
+ #
125
+ # *Options:*
126
+ #
127
+ # *examples:* Include example ids for each format
147
128
  def self.supported_ids(org, options = {})
148
129
  formats = []
149
130
  examples = [] if options[:examples]
@@ -166,6 +147,57 @@ module Organism
166
147
  formats.zip(examples)
167
148
  end
168
149
 
150
+ # Creates a hash where each possible id is associated with the names of the
151
+ # formats (it is possible for different formats to have the same
152
+ # id). This is used in the guessIdFormat method.
153
+ def self.id_formats(org)
154
+ id_types = {}
155
+ formats = supported_ids(org)
156
+
157
+ text = Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers"))
158
+
159
+ if text.respond_to? :collect
160
+ lines = text.collect
161
+ else
162
+ lines = text.lines
163
+ end
164
+
165
+ lines.each{|l|
166
+ ids_per_type = l.split(/\t/)
167
+ formats.zip(ids_per_type).each{|p|
168
+ format = p[0]
169
+ ids = p[1].split(/\|/)
170
+ ids.each{|id|
171
+ next if id.nil? || id == ""
172
+ id_types[id.downcase] ||= []
173
+ id_types[id.downcase] << format unless id_types[id.downcase].include? format
174
+ }
175
+ }
176
+ }
177
+
178
+ return id_types
179
+ end
180
+
181
+ def self.guessIdFormat(formats, query)
182
+ query = query.compact.collect{|gene| gene.downcase}.uniq
183
+ if String === formats
184
+ formats = id_formats(formats)
185
+ end
186
+
187
+ return nil if formats.values.empty?
188
+ values = formats.values_at(*query)
189
+ return nil if values.empty?
190
+
191
+ format_count = {}
192
+ values.compact.collect{|types| types.uniq}.flatten.each{|f|
193
+ format_count[f] ||= 0
194
+ format_count[f] += 1
195
+ }
196
+
197
+ return nil if format_count.values.empty?
198
+ format_count.select{|k,v| v > (query.length / 10)}.sort{|a,b| b[1] <=> a[1]}.first
199
+ end
200
+
169
201
  def self.id_position(supported_ids, id_name, options = {})
170
202
  pos = 0
171
203
  supported_ids.each_with_index{|id, i|
@@ -1,6 +1,7 @@
1
1
  require 'rbbt'
2
2
  require 'rbbt/util/open'
3
3
  require 'rbbt/ner/regexpNER'
4
+ require 'rbbt/ner/dictionaryNER'
4
5
 
5
6
  # Find terms in the Polysearch thesauri using simple regular expression
6
7
  # matching. Note that the first time the methods are used the correspondent
@@ -11,13 +12,14 @@ module Polysearch
11
12
 
12
13
  @@names = {}
13
14
  def self.type_names(type) #:nodoc:
14
- @@names[type] ||= Open.to_hash(File.join(Rbbt.datadir,'dbs','polysearch',type + '.txt'), :single => true)
15
+ @@names[type] ||= Open.to_hash(File.join(Rbbt.datadir,'dbs','polysearch',type.to_s + '.txt'), :single => true)
15
16
  end
16
17
 
17
18
 
18
19
  @@indexes = {}
19
20
  def self.type_index(type) #:nodoc:
20
- @@indexes[type] ||= RegExpNER.new(File.join(Rbbt.datadir,'dbs','polysearch',type + '.txt'))
21
+ @@indexes[type] ||= RegExpNER.new(File.join(Rbbt.datadir,'dbs','polysearch',type.to_s + '.txt'))
22
+ #@@indexes[type] ||= DictionaryNER.new(File.join(Rbbt.datadir,'dbs','polysearch',type + '.txt'))
21
23
  end
22
24
 
23
25
  # Find matches in a string of text, the types array specifies which thesauri
@@ -32,7 +34,7 @@ module Polysearch
32
34
 
33
35
  matches = {}
34
36
  types.collect{|type|
35
- matches.merge!(type_index(type).match_hash(text))
37
+ matches.merge!(type_index(type).match(text))
36
38
  }
37
39
 
38
40
  matches
@@ -45,18 +47,17 @@ module Polysearch
45
47
 
46
48
  end
47
49
 
48
- if __FILE__ == $0
49
-
50
+ if __FILE__ == $0
50
51
  text =<<-EOT
51
52
 
52
53
  Background Microorganisms adapt their transcriptome by integrating
53
54
  multiple chemical and physical signals from their environment. Shake-flask
54
- cultivation does not allow precise manipulation of individual culture
55
- parameters and therefore precludes a quantitative analysis of the
56
- (combinatorial) influence of these parameters on transcriptional
57
- regulation. Steady-state chemostat cultures, which do enable accurate
58
- control, measurement and manipulation of individual cultivation parameters
59
- (e.g. specific growth rate, temperature, identity of the growth-limiting
55
+ cultivation does not allow precise manipulation of individual culture
56
+ parameters and therefore precludes a quantitative analysis of the
57
+ (combinatorial) influence of these parameters on transcriptional
58
+ regulation. Steady-state chemostat cultures, which do enable accurate
59
+ control, measurement and manipulation of individual cultivation parameters
60
+ (e.g. specific growth rate, temperature, identity of the growth-limiting
60
61
  nutrient) appear to provide a promising experimental platform for such a
61
62
  combinatorial analysis. Results A microarray compendium of 170
62
63
  steady-state chemostat cultures of the yeast Saccharomyces cerevisiae is
@@ -76,13 +77,31 @@ if __FILE__ == $0
76
77
  combinatorial effects of environmental parameters on the transcriptome is
77
78
  crucial for understanding transcriptional regulation. Chemostat
78
79
  cultivation offers a powerful tool for such an approach. Keywords:
79
- chemostat steady state samples
80
- Cerebellar
81
- stroke syndrome
80
+ chemostat steady state samples Cerebellar stroke syndrome
82
81
 
83
82
 
84
83
  EOT
85
84
 
86
- p Polysearch.match(text,'disease').values.flatten
85
+ require 'benchmark'
86
+ require 'ruby-prof'
87
+
88
+ puts Benchmark.measure{
89
+ p Polysearch.match(text,'disease')
90
+ }
91
+
92
+
93
+ RubyProf.start
94
+
95
+ Polysearch.match(text,'disease')
96
+
97
+ result = RubyProf.stop
98
+
99
+ # Print a flat profile to text
100
+ printer = RubyProf::FlatPrinter.new(result)
101
+ printer.print(STDOUT, 0)
102
+
103
+ puts Benchmark.measure{
104
+ 10.times{ p Polysearch.match(text,'disease') }
105
+ }
87
106
 
88
107
  end
@@ -20,7 +20,7 @@ module FileCache
20
20
  raise FileCache::BadPathError, "Character / not allowed in name: #{ filename }"
21
21
  end
22
22
  if filename !~ /.+\..+/
23
- raise FileCache::BadPathError, "Filename must have name and extension: name.ext"
23
+ raise FileCache::BadPathError, "Filename '#{filename}' must have name and extension: name.ext"
24
24
  end
25
25
  end
26
26
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.3
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-11-10 00:00:00 +01:00
12
+ date: 2009-12-02 00:00:00 +01:00
13
13
  default_executable: rbbt_config
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -99,7 +99,6 @@ files:
99
99
  - install_scripts/organisms/sgd.Rakefile
100
100
  - install_scripts/organisms/tair.Rakefile
101
101
  - install_scripts/organisms/worm.Rakefile
102
- - install_scripts/stopwords
103
102
  - install_scripts/wordlists/consonants
104
103
  - install_scripts/wordlists/stopwords
105
104
  - lib/rbbt.rb
@@ -108,6 +107,7 @@ files:
108
107
  - lib/rbbt/bow/dictionary.rb
109
108
  - lib/rbbt/ner/abner.rb
110
109
  - lib/rbbt/ner/banner.rb
110
+ - lib/rbbt/ner/dictionaryNER.rb
111
111
  - lib/rbbt/ner/regexpNER.rb
112
112
  - lib/rbbt/ner/rner.rb
113
113
  - lib/rbbt/ner/rnorm.rb
@@ -1 +0,0 @@
1
- a been get least our them whether about before getting left ourselves then which after being go less out there while again between goes let over these who ago but going like per they whoever all by gone make put this whom almost came got many putting those whose also can gotten may same through why always cannot had maybe saw till will am come has me see to with an could have mine seen too within and did having more shall two without another do he most she unless won't any does her much should until would anybody doing here my so up wouldn't anyhow done him myself some upon yet anyone down his never somebody us you anything each how no someone very your anyway else i none something was are even if not stand we as ever in now such went at every into of sure were away everyone is off take what back everything isn't on than whatever be for it one that what's became from just onto the when because front last or their where