rbbt 1.1.7 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +72 -136
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -246
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -145
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -79
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Rakefile +0 -43
  22. data/install_scripts/organisms/cgd.Rakefile +0 -84
  23. data/install_scripts/organisms/human.Rakefile +0 -145
  24. data/install_scripts/organisms/mgi.Rakefile +0 -77
  25. data/install_scripts/organisms/pombe.Rakefile +0 -40
  26. data/install_scripts/organisms/rake-include.rb +0 -258
  27. data/install_scripts/organisms/rgd.Rakefile +0 -88
  28. data/install_scripts/organisms/sgd.Rakefile +0 -66
  29. data/install_scripts/organisms/tair.Rakefile +0 -54
  30. data/install_scripts/organisms/worm.Rakefile +0 -109
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -86
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -213
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -40
  49. data/lib/rbbt/sources/organism.rb +0 -245
  50. data/lib/rbbt/sources/polysearch.rb +0 -117
  51. data/lib/rbbt/sources/pubmed.rb +0 -111
  52. data/lib/rbbt/util/arrayHash.rb +0 -255
  53. data/lib/rbbt/util/filecache.rb +0 -72
  54. data/lib/rbbt/util/index.rb +0 -47
  55. data/lib/rbbt/util/misc.rb +0 -106
  56. data/lib/rbbt/util/open.rb +0 -235
  57. data/lib/rbbt/util/rake.rb +0 -183
  58. data/lib/rbbt/util/simpleDSL.rb +0 -87
  59. data/lib/rbbt/util/tmpfile.rb +0 -19
  60. data/tasks/install.rake +0 -124
data/lib/rbbt/sources/polysearch.rb
@@ -1,117 +0,0 @@
- require 'rbbt'
- require 'rbbt/util/open'
- require 'rbbt/ner/regexpNER'
- require 'rbbt/ner/dictionaryNER'
-
- # Find terms in the Polysearch thesauri using simple regular expression
- # matching. Note that the first time the methods are used the correspondent
- # thesaurus are loaded into memory. The available thesauri are: disease, drug,
- # metabolite, organ, subcellular (subcellular localization) and tissue.
- module Polysearch
-
-
-   @@names = {}
-   def self.type_names(type) #:nodoc:
-     @@names[type] ||= Open.to_hash(File.join(Rbbt.datadir,'dbs','polysearch',type.to_s + '.txt'), :single => true)
-   end
-
-
-   @@indexes = {}
-   def self.type_index(type) #:nodoc:
-     if $stopwords
-       stopwords = $stopwords
-     else
-       stopwords = []
-     end
-
-     case type.to_sym
-     when :disease
-       stopwords << 'use'
-     end
-
-     @@indexes[type] ||= RegExpNER.new(File.join(Rbbt.datadir,'dbs','polysearch',type.to_s + '.txt'), :stopwords => stopwords)
-   end
-
-   # Find matches in a string of text, the types array specifies which thesauri
-   # to use, if if nil it will use all.
-   def self.match(text, types = nil)
-     if types.nil?
-       types = %w(disease drug metabolite organ subcellular tissue)
-     end
-
-     types = [types] unless Array === types
-     types = types.sort
-
-     matches = {}
-     types.collect{|type|
-       matches.merge!(type_index(type).match(text))
-     }
-
-     matches
-   end
-
-   # Transform the code into a name, type is the thesaurus to use
-   def self.name(type, code)
-     type_names(type)[code]
-   end
-
- end
-
- if __FILE__ == $0
-   text =<<-EOT
-
-   Background Microorganisms adapt their transcriptome by integrating
-   multiple chemical and physical signals from their environment. Shake-flask
-   cultivation does not allow precise manipulation of individual culture
-   parameters and therefore precludes a quantitative analysis of the
-   (combinatorial) influence of these parameters on transcriptional
-   regulation. Steady-state chemostat cultures, which do enable accurate
-   control, measurement and manipulation of individual cultivation parameters
-   (e.g. specific growth rate, temperature, identity of the growth-limiting
-   nutrient) appear to provide a promising experimental platform for such a
-   combinatorial analysis. Results A microarray compendium of 170
-   steady-state chemostat cultures of the yeast Saccharomyces cerevisiae is
-   presented and analyzed. The 170 microarrays encompass 55 unique
-   conditions, which can be characterized by the combined settings of 10
-   different cultivation parameters. By applying a regression model to assess
-   the impact of (combinations of) cultivation parameters on the
-   transcriptome, most S. cerevisiae genes were shown to be influenced by
-   multiple cultivation parameters, and in many cases by combinatorial
-   effects of cultivation parameters. The inclusion of these combinatorial
-   effects in the regression model led to higher explained variance of the
-   gene expression patterns and resulted in higher function enrichment in
-   subsequent analysis. We further demonstrate the usefulness of the
-   compendium and regression analysis for interpretation of shake-flask-based
-   transcriptome studies and for guiding functional analysis of
-   (uncharacterized) genes and pathways. Conclusions Modeling the
-   combinatorial effects of environmental parameters on the transcriptome is
-   crucial for understanding transcriptional regulation. Chemostat
-   cultivation offers a powerful tool for such an approach. Keywords:
-   chemostat steady state samples Cerebellar stroke syndrome
-
-
-   EOT
-
-   require 'benchmark'
-   require 'ruby-prof'
-
-   puts Benchmark.measure{
-     p Polysearch.match(text,'disease')
-   }
-
-
-   RubyProf.start
-
-   Polysearch.match(text,'disease')
-
-   result = RubyProf.stop
-
-   # Print a flat profile to text
-   printer = RubyProf::FlatPrinter.new(result)
-   printer.print(STDOUT, 0)
-
-   puts Benchmark.measure{
-     10.times{ p Polysearch.match(text,'disease') }
-   }
-
- end
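For reference, a minimal sketch of how the Polysearch interface removed above was typically driven under rbbt 1.1.x. The sample sentence is invented, the thesauri are assumed to be already installed under Rbbt.datadir/dbs/polysearch, and the exact shape of the hash returned through RegExpNER is an assumption:

    require 'rbbt/sources/polysearch'

    text = "Patients with type 2 diabetes were treated with metformin."

    # Restrict matching to one thesaurus; passing nil (the default) searches all six.
    matches = Polysearch.match(text, 'disease')
    p matches  # assumed to map thesaurus codes to the strings matched in the text

    # Translate a matched code back into its readable name.
    puts Polysearch.name(:disease, matches.keys.first) unless matches.empty?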
data/lib/rbbt/sources/pubmed.rb
@@ -1,111 +0,0 @@
- require 'rbbt/util/filecache'
- require 'rbbt/util/open'
- require 'rbbt'
-
- # This module offers an interface with PubMed, to perform queries, and
- # retrieve simple information from articles. It uses the caching
- # services of Rbbt.
- module PubMed
-
-   private
-   @@last = Time.now
-   @@pubmed_lag = 1
-   def self.get_online(pmids)
-
-     pmid_list = ( pmids.is_a?(Array) ? pmids.join(',') : pmids.to_s )
-     url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list}"
-
-     diff = Time.now - @@last
-     sleep @@pubmed_lag - diff unless diff > @@pubmed_lag
-
-     xml = Open.read(url, :quiet => true, :nocache => true)
-
-     @@last = Time.now
-
-     articles = xml.scan(/(<PubmedArticle>.*?<\/PubmedArticle>)/sm).flatten
-
-     if pmids.is_a? Array
-       list = {}
-       articles.each{|article|
-         pmid = article.scan(/<PMID>(.*?)<\/PMID>/).flatten.first
-         list[pmid] = article
-       }
-       return list
-     else
-       return articles.first
-     end
-
-   end
-
-   public
-
-   # Processes the xml with an articles as served by MedLine and extracts
-   # the abstract, title and journal information
-   class Article
-     attr_reader :title, :abstract, :journal
-     def initialize(xml)
-       xml ||= ""
-       @abstract = $1 if xml.match(/<AbstractText>(.*)<\/AbstractText>/sm)
-       @title = $1 if xml.match(/<ArticleTitle>(.*)<\/ArticleTitle>/sm)
-       @journal = $1 if xml.match(/<Title>(.*)<\/Title>/sm)
-     end
-
-     # Join the text from title and abstract
-     def text
-       [@title, @abstract].join("\n")
-     end
-   end
-
-   # Returns the Article object containing the information for the PubMed
-   # ID specified as an argument. If +pmid+ is an array instead of a single
-   # identifier it returns an hash with the Article object for each id.
-   # It uses the Rbbt cache to save the articles xml.
-   def self.get_article(pmid)
-
-     if pmid.is_a? Array
-       missing = []
-       list = {}
-
-       pmid.each{|p|
-         filename = p.to_s + '.xml'
-         if File.exists? FileCache.path(filename)
-           list[p] = Article.new(Open.read(FileCache.path(filename)))
-         else
-           missing << p
-         end
-       }
-
-       return list unless missing.any?
-       articles = get_online(missing)
-
-       articles.each{|p, xml|
-         filename = p + '.xml'
-         FileCache.add_file(filename,xml, :force => true)
-         list[p] = Article.new(xml)
-       }
-
-       return list
-
-     else
-       filename = pmid.to_s + '.xml'
-
-       if File.exists? FileCache.path(filename)
-         return Article.new(Open.read(FileCache.path(filename)))
-       else
-         xml = get_online(pmid)
-         FileCache.add_file(filename,xml)
-
-         return Article.new(xml)
-       end
-     end
-   end
-
-   # Performs the specified query and returns an array with the PubMed
-   # Ids returned. +retmax+ can be used to limit the number of ids
-   # returned, if is not specified 30000 is used.
-   def self.query(query, retmax=nil)
-     retmax ||= 30000
-
-     Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?retmax=#{retmax}&db=pubmed&term=#{query}",:quiet => true, :nocache => true).scan(/<Id>(\d+)<\/Id>/).flatten
-   end
- end
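Likewise, a small usage sketch for the PubMed module removed above (rbbt 1.1.x only). The query string is an arbitrary example, and live access to the NCBI eutils endpoints is assumed:

    require 'rbbt/sources/pubmed'

    # Fetch up to 10 PubMed ids matching the query (retmax defaults to 30000).
    pmids = PubMed.query('chemostat transcriptome Saccharomyces cerevisiae', 10)

    # With an array argument, get_article returns a hash of id => Article and
    # caches each article's XML through FileCache.
    articles = PubMed.get_article(pmids)
    articles.each do |pmid, article|
      puts "#{pmid}: #{article.title}"
      puts article.text   # title and abstract joined by a newline
    end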
data/lib/rbbt/util/arrayHash.rb
@@ -1,255 +0,0 @@
-
- class ArrayHash
-
-   # Take two strings of elements separated by the character sep_char and join them
-   # into one, removing repetitions.
-   def self.merge_values_string(list1, list2, sep_char ='|')
-     elem1 = list1.to_s.split(sep_char)
-     elem2 = list2.to_s.split(sep_char)
-     (elem1 + elem2).select{|e| e.to_s != ""}.uniq.join(sep_char)
-   end
-
-   # Merge two lists of elements. Elements could be strings of elements
-   # separated by the character sep_char, or arrays of lists of such strings.
-   def self.merge_values(list1, list2, sep_char = "|")
-     if String === list1 || String === list2
-       return merge_values_string(list1, list2)
-     end
-
-     if list1.nil?
-       list1 = [''] * list2.length
-     end
-
-     if list2.nil?
-       list2 = [''] * list1.length
-     end
-
-     new = []
-     list1.each_with_index{|elem, i|
-       new << merge_values_string(elem, list2[i], sep_char)
-     }
-     new
-   end
-
-
-   # Take an hash of arrays and a position and use the value at that position
-   # of the arrays and build a new hash with that value as key, and the original
-   # key prepended to the arrays. The options hash appcepts the following keys
-   # :case_insensitive, which defaults to true, and :index, which indicates that
-   # the original key should be the value of the hash entry, instead of the
-   # complete array of values.
-   def self.pullout(hash, pos, options = {})
-     index = options[:index]; index = false if index.nil?
-     case_insensitive = options[:case_insensitive]; case_insensitive = true if case_insensitive.nil?
-
-     new = {}
-     hash.each{|key,values|
-       code = values[pos].to_s
-       next if code == ""
-
-       if index
-         list = key
-       else
-         list = [key] + values
-         list.delete_at(pos + 1)
-       end
-
-       code.split("|").each{|c|
-         c = c.downcase if case_insensitive
-         new[c] = merge_values(new[c], list)
-       }
-     }
-
-     if case_insensitive
-       class << new; self; end.instance_eval{
-         alias_method :old_get, :[]
-         define_method(:[], proc{|key| old_get(key.to_s.downcase)})
-       }
-     end
-
-     new
-   end
-
-   # Merge to hashes of arrays. Each hash contains a number of fields for each
-   # entry. The pos1 and pos2 indicate what fields should be used to match
-   # entries, the values for pos1 and pos2 can be an integer indicating the
-   # position in the array or the symbol :main to refer to the key of the hash.
-   # The options hash accepts the key :case_insensitive, which defaults to true.
-   def self.merge(hash1, hash2, pos1 = :main, pos2 = :main, options = {})
-
-     case_insensitive = options[:case_insensitive]; case_insensitive = true if case_insensitive.nil?
-     if pos1.to_s.downcase != 'main'
-       index1 = pullout(hash1, pos1, options.merge(:index => true))
-     elsif options[:case_insensitive]
-       new = {}
-       hash1.each{|k,v|
-         new[k.to_s.downcase] = v
-       }
-       class << new; self; end.instance_eval{
-         alias_method :old_get, :[]
-         define_method(:[], proc{|key| old_get(key.to_s.downcase)})
-       }
-       hash1 = new
-     end
-
-     length1 = hash1.values.first.length
-     length2 = hash2.values.first.length
-
-     new = {}
-     hash2.each{|key, values|
-       case
-       when pos2.to_s.downcase == 'main'
-         k = key
-         v = values
-       when Fixnum === pos2
-         k = values[pos2]
-         v = values
-         v.delete_at(pos2)
-         v.unshift(key)
-       else
-         raise "Format of second index not understood"
-       end
-
-       code = (index1.nil? ? k : index1[k])
-       if code
-         code.split('|').each{|c|
-           c = c.to_s.downcase if options[:case_insensitive]
-           new[c] = hash1[c] || [''] * length1
-           new[c] += v
-         }
-       end
-     }
-
-     hash1.each{|key, values|
-       new[key] ||= values + [''] * length2
-     }
-
-     new
-   end
-
-   # For a given hash of arrays, filter the position pos of each array with the
-   # block of code.
-   def self.process(hash, pos, &block)
-     new = {}
-     hash.each{|key, values|
-       v = values
-       v[pos] = v[pos].to_s.split("|").collect{|n| block.call(n)}.join("|")
-       new[key] = v
-     }
-     new
-   end
-
-   # Clean structure for repeated values. If the same value apear two times use
-   # eliminate the one that appears latter on the values list (columns of the
-   # ArrayHash are assumed to be sorted for importance) if the appear on the
-   # same position, remove the one with the smaller vale of the code after
-   # turning it into integer.
-   def self.clean(hash, options = {})
-     case_sensitive = options[:case_sensitive]
-
-     found = {}
-
-     hash.each{|k, list|
-       list.each_with_index{|values,i|
-         (String === values ? values.split("|") : values).each{|v|
-           v = v.downcase if case_sensitive
-           if found[v].nil?
-             found[v] = [k,i]
-           else
-             last_k, last_i = found[v].values_at(0,1)
-             if last_i > i || (last_i == i && last_k.to_i > k.to_i)
-               found[v] = [k,i]
-             end
-           end
-         }
-       }
-     }
-
-     new_hash = {}
-     hash.each{|k,list|
-       new_list = []
-       list.each_with_index{|values,i|
-         new_values = []
-         (String === values ? values.split("|") : values).each{|v|
-           found_k, found_i = found[(case_sensitive ? v.downcase : v )].values_at(0,1)
-           if found_i == i && found_k == k
-             new_values << v
-           end
-         }
-         new_list << (String === values ? new_values.join("|") : values)
-       }
-       new_hash[k] = new_list
-     }
-     new_hash
-   end
-
-   attr_reader :main, :fields, :data
-   def initialize(hash, main, fields = nil)
-     @data = hash
-     @main = main.to_s
-
-     if fields.nil?
-       l = hash.values.first.length
-       fields = []
-       l.times{|i| fields << "F#{i}"}
-     end
-
-     @fields = fields.collect{|f| f.to_s}
-   end
-
-   # Wrapper
-   def process(field, &block)
-     pos = self.field_pos(field)
-     @data = ArrayHash.process(self.data, pos, &block)
-     self
-   end
-
-   # Returns the position of a given field in the value arrays
-   def field_pos(field)
-     return :main if field == :main
-     if field.downcase == self.main.downcase
-       return :main
-     else
-       @fields.collect{|f| f.downcase}.index(field.to_s.downcase)
-     end
-   end
-
-
-   # Merge two ArrayHashes using the specified field
-   def merge(other, field = :main, options = {} )
-     field = self.main if field == :main
-
-     pos1 = self.field_pos(field)
-     pos2 = other.field_pos(field)
-
-     new = ArrayHash.merge(self.data, other.data, pos1, pos2, options)
-     @data = new
-     if pos2 == :main
-       new_fields = other.fields
-     else
-       new_fields = other.fields
-       new_fields.delete_at(pos2)
-       new_fields.unshift(other.main)
-     end
-     @fields += new_fields
-     self
-   end
-
-   # Remove a field from the ArrayHash
-   def remove(field)
-     pos = self.field_pos(field)
-     return if pos.nil?
-     @data = self.data.each{|key,values| values.delete_at(pos)}
-     @fields.delete_at(pos)
-     self
-   end
-
-   def clean
-     @data = ArrayHash.clean(@data)
-     self
-   end
- end
-
-
-
-
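Finally, a sketch of the ArrayHash API removed above (rbbt 1.1.x only); the toy identifier table below is invented purely for illustration:

    require 'rbbt/util/arrayHash'

    # Each ArrayHash maps a main key to an array of fields; multiple values in a
    # single field are joined with '|'.
    names = ArrayHash.new({ 'YPL250C' => ['ICY2', 'hypothetical protein'] },
                          'ORF', %w(Name Description))
    ids   = ArrayHash.new({ 'YPL250C' => ['999999'] }, 'ORF', %w(Entrez))

    # Merge the second table into the first on the shared main key.
    names.merge(ids)

    puts names.fields.inspect          # => ["Name", "Description", "Entrez"]
    puts names.data['YPL250C'].inspect # => ["ICY2", "hypothetical protein", "999999"]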