rbbt 1.1.7 → 2.0.0

Files changed (60)
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +72 -136
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -246
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -145
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -79
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Rakefile +0 -43
  22. data/install_scripts/organisms/cgd.Rakefile +0 -84
  23. data/install_scripts/organisms/human.Rakefile +0 -145
  24. data/install_scripts/organisms/mgi.Rakefile +0 -77
  25. data/install_scripts/organisms/pombe.Rakefile +0 -40
  26. data/install_scripts/organisms/rake-include.rb +0 -258
  27. data/install_scripts/organisms/rgd.Rakefile +0 -88
  28. data/install_scripts/organisms/sgd.Rakefile +0 -66
  29. data/install_scripts/organisms/tair.Rakefile +0 -54
  30. data/install_scripts/organisms/worm.Rakefile +0 -109
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -86
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -213
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -40
  49. data/lib/rbbt/sources/organism.rb +0 -245
  50. data/lib/rbbt/sources/polysearch.rb +0 -117
  51. data/lib/rbbt/sources/pubmed.rb +0 -111
  52. data/lib/rbbt/util/arrayHash.rb +0 -255
  53. data/lib/rbbt/util/filecache.rb +0 -72
  54. data/lib/rbbt/util/index.rb +0 -47
  55. data/lib/rbbt/util/misc.rb +0 -106
  56. data/lib/rbbt/util/open.rb +0 -235
  57. data/lib/rbbt/util/rake.rb +0 -183
  58. data/lib/rbbt/util/simpleDSL.rb +0 -87
  59. data/lib/rbbt/util/tmpfile.rb +0 -19
  60. data/tasks/install.rake +0 -124
data/lib/rbbt/sources/polysearch.rb
@@ -1,117 +0,0 @@
- require 'rbbt'
- require 'rbbt/util/open'
- require 'rbbt/ner/regexpNER'
- require 'rbbt/ner/dictionaryNER'
-
- # Find terms in the Polysearch thesauri using simple regular expression
- # matching. Note that the first time the methods are used the correspondent
- # thesaurus are loaded into memory. The available thesauri are: disease, drug,
- # metabolite, organ, subcellular (subcellular localization) and tissue.
- module Polysearch
-
-
- @@names = {}
- def self.type_names(type) #:nodoc:
- @@names[type] ||= Open.to_hash(File.join(Rbbt.datadir,'dbs','polysearch',type.to_s + '.txt'), :single => true)
- end
-
-
- @@indexes = {}
- def self.type_index(type) #:nodoc:
- if $stopwords
- stopwords = $stopwords
- else
- stopwords = []
- end
-
- case type.to_sym
- when :disease
- stopwords << 'use'
- end
-
- @@indexes[type] ||= RegExpNER.new(File.join(Rbbt.datadir,'dbs','polysearch',type.to_s + '.txt'), :stopwords => stopwords)
- end
-
- # Find matches in a string of text, the types array specifies which thesauri
- # to use, if if nil it will use all.
- def self.match(text, types = nil)
- if types.nil?
- types = %w(disease drug metabolite organ subcellular tissue)
- end
-
- types = [types] unless Array === types
- types = types.sort
-
- matches = {}
- types.collect{|type|
- matches.merge!(type_index(type).match(text))
- }
-
- matches
- end
-
- # Transform the code into a name, type is the thesaurus to use
- def self.name(type, code)
- type_names(type)[code]
- end
-
- end
-
- if __FILE__ == $0
- text =<<-EOT
-
- Background Microorganisms adapt their transcriptome by integrating
- multiple chemical and physical signals from their environment. Shake-flask
- cultivation does not allow precise manipulation of individual culture
- parameters and therefore precludes a quantitative analysis of the
- (combinatorial) influence of these parameters on transcriptional
- regulation. Steady-state chemostat cultures, which do enable accurate
- control, measurement and manipulation of individual cultivation parameters
- (e.g. specific growth rate, temperature, identity of the growth-limiting
- nutrient) appear to provide a promising experimental platform for such a
- combinatorial analysis. Results A microarray compendium of 170
- steady-state chemostat cultures of the yeast Saccharomyces cerevisiae is
- presented and analyzed. The 170 microarrays encompass 55 unique
- conditions, which can be characterized by the combined settings of 10
- different cultivation parameters. By applying a regression model to assess
- the impact of (combinations of) cultivation parameters on the
- transcriptome, most S. cerevisiae genes were shown to be influenced by
- multiple cultivation parameters, and in many cases by combinatorial
- effects of cultivation parameters. The inclusion of these combinatorial
- effects in the regression model led to higher explained variance of the
- gene expression patterns and resulted in higher function enrichment in
- subsequent analysis. We further demonstrate the usefulness of the
- compendium and regression analysis for interpretation of shake-flask-based
- transcriptome studies and for guiding functional analysis of
- (uncharacterized) genes and pathways. Conclusions Modeling the
- combinatorial effects of environmental parameters on the transcriptome is
- crucial for understanding transcriptional regulation. Chemostat
- cultivation offers a powerful tool for such an approach. Keywords:
- chemostat steady state samples Cerebellar stroke syndrome
-
-
- EOT
-
- require 'benchmark'
- require 'ruby-prof'
-
- puts Benchmark.measure{
- p Polysearch.match(text,'disease')
- }
-
-
- RubyProf.start
-
- Polysearch.match(text,'disease')
-
- result = RubyProf.stop
-
- # Print a flat profile to text
- printer = RubyProf::FlatPrinter.new(result)
- printer.print(STDOUT, 0)
-
- puts Benchmark.measure{
- 10.times{ p Polysearch.match(text,'disease') }
- }
-
- end
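The removed polysearch.rb above was the whole of the 1.x Polysearch interface. For reference, here is a minimal usage sketch of that API, assuming the Polysearch thesauri had been installed under Rbbt.datadir (e.g. via the 1.x install scripts); the sample sentence and the disease code are made up for illustration:

  require 'rbbt'
  require 'rbbt/sources/polysearch'

  text = "Chronic hepatitis is often monitored with repeated liver biopsies."

  # Match against the disease thesaurus only; passing nil (the default)
  # searches all six thesauri listed in the module comment above.
  p Polysearch.match(text, 'disease')

  # A matched code can be translated back into a readable name per thesaurus,
  # e.g. for a hypothetical disease code:
  # Polysearch.name(:disease, 'DID00123')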
data/lib/rbbt/sources/pubmed.rb
@@ -1,111 +0,0 @@
- require 'rbbt/util/filecache'
- require 'rbbt/util/open'
- require 'rbbt'
-
- # This module offers an interface with PubMed, to perform queries, and
- # retrieve simple information from articles. It uses the caching
- # services of Rbbt.
- module PubMed
-
- private
- @@last = Time.now
- @@pubmed_lag = 1
- def self.get_online(pmids)
-
- pmid_list = ( pmids.is_a?(Array) ? pmids.join(',') : pmids.to_s )
- url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list}"
-
- diff = Time.now - @@last
- sleep @@pubmed_lag - diff unless diff > @@pubmed_lag
-
- xml = Open.read(url, :quiet => true, :nocache => true)
-
- @@last = Time.now
-
- articles = xml.scan(/(<PubmedArticle>.*?<\/PubmedArticle>)/sm).flatten
-
- if pmids.is_a? Array
- list = {}
- articles.each{|article|
- pmid = article.scan(/<PMID>(.*?)<\/PMID>/).flatten.first
- list[pmid] = article
- }
- return list
- else
- return articles.first
- end
-
- end
-
- public
-
- # Processes the xml with an articles as served by MedLine and extracts
- # the abstract, title and journal information
- class Article
- attr_reader :title, :abstract, :journal
- def initialize(xml)
- xml ||= ""
- @abstract = $1 if xml.match(/<AbstractText>(.*)<\/AbstractText>/sm)
- @title = $1 if xml.match(/<ArticleTitle>(.*)<\/ArticleTitle>/sm)
- @journal = $1 if xml.match(/<Title>(.*)<\/Title>/sm)
- end
-
- # Join the text from title and abstract
- def text
- [@title, @abstract].join("\n")
- end
- end
-
- # Returns the Article object containing the information for the PubMed
- # ID specified as an argument. If +pmid+ is an array instead of a single
- # identifier it returns an hash with the Article object for each id.
- # It uses the Rbbt cache to save the articles xml.
- def self.get_article(pmid)
-
- if pmid.is_a? Array
- missing = []
- list = {}
-
- pmid.each{|p|
- filename = p.to_s + '.xml'
- if File.exists? FileCache.path(filename)
- list[p] = Article.new(Open.read(FileCache.path(filename)))
- else
- missing << p
- end
- }
-
- return list unless missing.any?
- articles = get_online(missing)
-
- articles.each{|p, xml|
- filename = p + '.xml'
- FileCache.add_file(filename,xml, :force => true)
- list[p] = Article.new(xml)
- }
-
- return list
-
- else
- filename = pmid.to_s + '.xml'
-
- if File.exists? FileCache.path(filename)
- return Article.new(Open.read(FileCache.path(filename)))
- else
- xml = get_online(pmid)
- FileCache.add_file(filename,xml)
-
- return Article.new(xml)
- end
- end
- end
-
- # Performs the specified query and returns an array with the PubMed
- # Ids returned. +retmax+ can be used to limit the number of ids
- # returned, if is not specified 30000 is used.
- def self.query(query, retmax=nil)
- retmax ||= 30000
-
- Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?retmax=#{retmax}&db=pubmed&term=#{query}",:quiet => true, :nocache => true).scan(/<Id>(\d+)<\/Id>/).flatten
- end
- end
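As with the previous file, a short sketch of how the removed 1.x PubMed module was typically driven. The query string is invented for illustration, and its terms are joined with '+' because the module interpolates the query into the esearch URL without escaping:

  require 'rbbt/sources/pubmed'

  # Ask for at most five matching PubMed ids (retmax defaults to 30000).
  pmids = PubMed.query('chemostat+transcriptome+yeast', 5)

  # Given an array of ids, get_article returns a hash of pmid => Article,
  # caching each article's XML through Rbbt's FileCache.
  articles = PubMed.get_article(pmids)

  articles.each do |pmid, article|
    puts "#{pmid}\t#{article.journal}\t#{article.title}"
    puts article.text # title and abstract joined with a newline
  end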
data/lib/rbbt/util/arrayHash.rb
@@ -1,255 +0,0 @@
-
- class ArrayHash
-
- # Take two strings of elements separated by the character sep_char and join them
- # into one, removing repetitions.
- def self.merge_values_string(list1, list2, sep_char ='|')
- elem1 = list1.to_s.split(sep_char)
- elem2 = list2.to_s.split(sep_char)
- (elem1 + elem2).select{|e| e.to_s != ""}.uniq.join(sep_char)
- end
-
- # Merge two lists of elements. Elements could be strings of elements
- # separated by the character sep_char, or arrays of lists of such strings.
- def self.merge_values(list1, list2, sep_char = "|")
- if String === list1 || String === list2
- return merge_values_string(list1, list2)
- end
-
- if list1.nil?
- list1 = [''] * list2.length
- end
-
- if list2.nil?
- list2 = [''] * list1.length
- end
-
- new = []
- list1.each_with_index{|elem, i|
- new << merge_values_string(elem, list2[i], sep_char)
- }
- new
- end
-
-
- # Take an hash of arrays and a position and use the value at that position
- # of the arrays and build a new hash with that value as key, and the original
- # key prepended to the arrays. The options hash appcepts the following keys
- # :case_insensitive, which defaults to true, and :index, which indicates that
- # the original key should be the value of the hash entry, instead of the
- # complete array of values.
- def self.pullout(hash, pos, options = {})
- index = options[:index]; index = false if index.nil?
- case_insensitive = options[:case_insensitive]; case_insensitive = true if case_insensitive.nil?
-
- new = {}
- hash.each{|key,values|
- code = values[pos].to_s
- next if code == ""
-
- if index
- list = key
- else
- list = [key] + values
- list.delete_at(pos + 1)
- end
-
- code.split("|").each{|c|
- c = c.downcase if case_insensitive
- new[c] = merge_values(new[c], list)
- }
- }
-
- if case_insensitive
- class << new; self; end.instance_eval{
- alias_method :old_get, :[]
- define_method(:[], proc{|key| old_get(key.to_s.downcase)})
- }
- end
-
- new
- end
-
- # Merge to hashes of arrays. Each hash contains a number of fields for each
- # entry. The pos1 and pos2 indicate what fields should be used to match
- # entries, the values for pos1 and pos2 can be an integer indicating the
- # position in the array or the symbol :main to refer to the key of the hash.
- # The options hash accepts the key :case_insensitive, which defaults to true.
- def self.merge(hash1, hash2, pos1 = :main, pos2 = :main, options = {})
-
- case_insensitive = options[:case_insensitive]; case_insensitive = true if case_insensitive.nil?
- if pos1.to_s.downcase != 'main'
- index1 = pullout(hash1, pos1, options.merge(:index => true))
- elsif options[:case_insensitive]
- new = {}
- hash1.each{|k,v|
- new[k.to_s.downcase] = v
- }
- class << new; self; end.instance_eval{
- alias_method :old_get, :[]
- define_method(:[], proc{|key| old_get(key.to_s.downcase)})
- }
- hash1 = new
- end
-
- length1 = hash1.values.first.length
- length2 = hash2.values.first.length
-
- new = {}
- hash2.each{|key, values|
- case
- when pos2.to_s.downcase == 'main'
- k = key
- v = values
- when Fixnum === pos2
- k = values[pos2]
- v = values
- v.delete_at(pos2)
- v.unshift(key)
- else
- raise "Format of second index not understood"
- end
-
- code = (index1.nil? ? k : index1[k])
- if code
- code.split('|').each{|c|
- c = c.to_s.downcase if options[:case_insensitive]
- new[c] = hash1[c] || [''] * length1
- new[c] += v
- }
- end
- }
-
- hash1.each{|key, values|
- new[key] ||= values + [''] * length2
- }
-
- new
- end
-
- # For a given hash of arrays, filter the position pos of each array with the
- # block of code.
- def self.process(hash, pos, &block)
- new = {}
- hash.each{|key, values|
- v = values
- v[pos] = v[pos].to_s.split("|").collect{|n| block.call(n)}.join("|")
- new[key] = v
- }
- new
- end
-
- # Clean structure for repeated values. If the same value apear two times use
- # eliminate the one that appears latter on the values list (columns of the
- # ArrayHash are assumed to be sorted for importance) if the appear on the
- # same position, remove the one with the smaller vale of the code after
- # turning it into integer.
- def self.clean(hash, options = {})
- case_sensitive = options[:case_sensitive]
-
- found = {}
-
- hash.each{|k, list|
- list.each_with_index{|values,i|
- (String === values ? values.split("|") : values).each{|v|
- v = v.downcase if case_sensitive
- if found[v].nil?
- found[v] = [k,i]
- else
- last_k, last_i = found[v].values_at(0,1)
- if last_i > i || (last_i == i && last_k.to_i > k.to_i)
- found[v] = [k,i]
- end
- end
- }
- }
- }
-
- new_hash = {}
- hash.each{|k,list|
- new_list = []
- list.each_with_index{|values,i|
- new_values = []
- (String === values ? values.split("|") : values).each{|v|
- found_k, found_i = found[(case_sensitive ? v.downcase : v )].values_at(0,1)
- if found_i == i && found_k == k
- new_values << v
- end
- }
- new_list << (String === values ? new_values.join("|") : values)
- }
- new_hash[k] = new_list
- }
- new_hash
- end
-
- attr_reader :main, :fields, :data
- def initialize(hash, main, fields = nil)
- @data = hash
- @main = main.to_s
-
- if fields.nil?
- l = hash.values.first.length
- fields = []
- l.times{|i| fields << "F#{i}"}
- end
-
- @fields = fields.collect{|f| f.to_s}
- end
-
- # Wrapper
- def process(field, &block)
- pos = self.field_pos(field)
- @data = ArrayHash.process(self.data, pos, &block)
- self
- end
-
- # Returns the position of a given field in the value arrays
- def field_pos(field)
- return :main if field == :main
- if field.downcase == self.main.downcase
- return :main
- else
- @fields.collect{|f| f.downcase}.index(field.to_s.downcase)
- end
- end
-
-
- # Merge two ArrayHashes using the specified field
- def merge(other, field = :main, options = {} )
- field = self.main if field == :main
-
- pos1 = self.field_pos(field)
- pos2 = other.field_pos(field)
-
- new = ArrayHash.merge(self.data, other.data, pos1, pos2, options)
- @data = new
- if pos2 == :main
- new_fields = other.fields
- else
- new_fields = other.fields
- new_fields.delete_at(pos2)
- new_fields.unshift(other.main)
- end
- @fields += new_fields
- self
- end
-
- # Remove a field from the ArrayHash
- def remove(field)
- pos = self.field_pos(field)
- return if pos.nil?
- @data = self.data.each{|key,values| values.delete_at(pos)}
- @fields.delete_at(pos)
- self
- end
-
- def clean
- @data = ArrayHash.clean(@data)
- self
- end
- end
-
-
-
-
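Finally, a sketch of the removed ArrayHash in use, merging two column tables on a shared field. The identifiers and values are made up, and the expected-output comments reflect a reading of the code above rather than a tested run:

  require 'rbbt/util/arrayHash'

  # A table keyed by a made-up gene id, with symbol and description columns.
  ids = ArrayHash.new({ 'id1' => ['geneA', 'a protein kinase'] },
                      'ID', ['Symbol', 'Description'])

  # A second table keyed by gene symbol, with one expression column.
  expr = ArrayHash.new({ 'geneA' => ['12.4'] }, 'Symbol', ['Expression'])

  # Merge expr into ids on the shared Symbol field; rows missing from either
  # side are padded with empty strings.
  ids.merge(expr, 'Symbol')

  p ids.fields   # expected: ["Symbol", "Description", "Expression"]
  p ids.data     # expected: {"id1"=>["geneA", "a protein kinase", "12.4"]}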