rbbt-marq 1.1.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/R/GEO.R +6 -4
- data/R/MA.R +1 -0
- data/bin/marq_config +4 -3
- data/install_scripts/CustomDS/Rakefile +27 -215
- data/install_scripts/GEO/Rakefile +34 -275
- data/install_scripts/rake_includes.rb +236 -0
- data/lib/MARQ/CustomDS.rb +63 -32
- data/lib/MARQ/GEO.rb +99 -63
- data/lib/MARQ/MADB.rb +107 -202
- data/lib/MARQ/annotations.rb +124 -38
- data/lib/MARQ/main.rb +152 -160
- data/lib/MARQ/rankproduct.rb +20 -34
- data/tasks/install.rake +7 -2
- metadata +3 -2
data/lib/MARQ/MADB.rb
CHANGED
@@ -1,229 +1,133 @@
|
|
1
1
|
require 'MARQ'
|
2
2
|
require 'MARQ/GEO'
|
3
3
|
require 'MARQ/CustomDS'
|
4
|
+
require 'MARQ/main'
|
4
5
|
|
5
6
|
module MADB
|
6
|
-
module CustomDS
|
7
|
-
|
8
|
-
def self.save(dataset)
|
9
|
-
prefix = Object::CustomDS.path(dataset)
|
10
|
-
|
11
|
-
codes = File.open(prefix + '.codes').collect{|l| l.chomp.downcase}
|
12
|
-
|
13
|
-
DBcache.save(dataset + '_codes', codes)
|
14
|
-
|
15
|
-
experiments = File.open(prefix + '.experiments').collect{|l| l.chomp}
|
16
|
-
orders = File.open(prefix + '.orders').collect{|l| values = l.chomp.split(/\t/).collect{|v| v == "NA" ? nil : v.to_i };}
|
17
|
-
|
18
|
-
data = {}
|
19
|
-
codes.each_with_index{|code,i|
|
20
|
-
data[code.to_sym] = orders[i]
|
21
|
-
}
|
22
|
-
case
|
23
|
-
when codes.length < 65535
|
24
|
-
type = "SMALLINT UNSIGNED"
|
25
|
-
when codes.length < 16777215
|
26
|
-
type = "MEDIUMIN UNSIGNED"
|
27
|
-
else
|
28
|
-
type = "INT UNSIGNED"
|
29
|
-
end
|
30
|
-
|
31
|
-
DBcache.save(dataset + '_experiments', experiments)
|
32
|
-
DBcache.save(dataset, data, [type] * orders.first.length)
|
33
|
-
|
34
|
-
return unless Object::CustomDS::has_cross_platform?(dataset)
|
35
|
-
dataset = dataset + '_cross_platform'
|
36
|
-
prefix = Object::CustomDS.path(dataset)
|
37
|
-
|
38
|
-
codes = File.open(prefix + '.codes').collect{|l| l.chomp.downcase}
|
39
|
-
|
40
|
-
DBcache.save(dataset + '_codes', codes)
|
41
|
-
|
42
|
-
experiments = File.open(prefix + '.experiments').collect{|l| l.chomp}
|
43
|
-
orders = File.open(prefix + '.orders').collect{|l| values = l.chomp.split(/\t/).collect{|v| v == "NA" ? nil : v.to_i };}
|
44
|
-
|
45
|
-
data = {}
|
46
|
-
codes.each_with_index{|code,i|
|
47
|
-
data[code.to_sym] = orders[i]
|
48
|
-
}
|
49
|
-
case
|
50
|
-
when codes.length < 65535
|
51
|
-
type = "SMALLINT UNSIGNED"
|
52
|
-
when codes.length < 16777215
|
53
|
-
type = "MEDIUMIN UNSIGNED"
|
54
|
-
else
|
55
|
-
type = "INT UNSIGNED"
|
56
|
-
end
|
57
|
-
|
58
|
-
DBcache.save(dataset + '_experiments', experiments)
|
59
|
-
DBcache.save(dataset, data, [type] * orders.first.length)
|
60
|
-
nil
|
61
|
-
end
|
62
|
-
|
63
|
-
def self.positions(dataset, genes)
|
64
|
-
return [{},[]] if genes.empty?
|
65
|
-
genes = genes.collect{|gene| gene.downcase.strip}
|
66
|
-
|
67
|
-
platform_entries = platform_entries(dataset + '_codes').to_f
|
68
|
-
|
69
|
-
data = {}
|
70
|
-
matched = []
|
71
|
-
|
72
|
-
gene_positions = DBcache.load(dataset, genes)
|
73
|
-
matched ||= gene_positions.keys
|
74
|
-
|
75
|
-
experiments = DBcache.load(dataset + '_experiments').sort{|a,b|
|
76
|
-
a[0].to_i <=> b[0].to_i
|
77
|
-
}.collect{|p|
|
78
|
-
Object::GEO::clean(dataset) + ": " + p[1].first
|
79
|
-
}
|
80
|
-
|
81
|
-
|
82
|
-
matched = (matched + gene_positions.keys).uniq
|
83
|
-
scale = (0..experiments.length - 1).collect{|i|
|
84
|
-
rows = DBcache.num_rows(dataset, "C#{i}");
|
85
|
-
if rows > 0
|
86
|
-
platform_entries / rows
|
87
|
-
else
|
88
|
-
nil
|
89
|
-
end
|
90
|
-
}
|
91
|
-
|
92
|
-
gene_x_experiment = gene_positions.values
|
93
|
-
|
94
|
-
experiment_x_gene = gene_x_experiment.transpose
|
95
|
-
|
96
|
-
experiments.each_with_index{|experiment, i|
|
97
|
-
next if scale[i].nil? || experiment_x_gene[i].nil?
|
98
|
-
values = experiment_x_gene[i].collect{|v| v.nil? ? nil : (v.to_f * scale[i]).to_i}
|
99
|
-
data[experiment] = values
|
100
|
-
}
|
101
|
-
|
102
|
-
[data, matched]
|
103
|
-
end
|
104
7
|
|
105
|
-
|
106
|
-
|
8
|
+
# {{{ Saving Positions
|
9
|
+
|
10
|
+
# Save the actual data, cross_platform or not
|
11
|
+
def self.save_dataset_instance(dataset, cross_platform)
|
12
|
+
dataset += '_cross_platform' if cross_platform
|
13
|
+
prefix = MARQ::Dataset.path(dataset)
|
14
|
+
|
15
|
+
# Save codes
|
16
|
+
codes = File.open(prefix + '.codes').collect{|l| l.chomp.downcase}
|
17
|
+
experiments = File.open(prefix + '.experiments').collect{|l| l.chomp}
|
18
|
+
orders = File.open(prefix + '.orders').collect{|l| values = l.chomp.split(/\t/).collect{|v| v == "NA" ? nil : v.to_i };}
|
19
|
+
|
20
|
+
# Save codes and experiments
|
21
|
+
DBcache.save(dataset + '_codes', codes)
|
22
|
+
DBcache.save(dataset + '_experiments', experiments)
|
23
|
+
|
24
|
+
# Save orders
|
25
|
+
data = {}
|
26
|
+
codes.each_with_index{|code,i|
|
27
|
+
data[code.to_sym] = orders[i]
|
28
|
+
}
|
29
|
+
case
|
30
|
+
when codes.length < 65535
|
31
|
+
type = "SMALLINT UNSIGNED"
|
32
|
+
when codes.length < 16777215
|
33
|
+
type = "MEDIUMIN UNSIGNED"
|
34
|
+
else
|
35
|
+
type = "INT UNSIGNED"
|
107
36
|
end
|
108
|
-
end
|
109
|
-
|
110
37
|
|
38
|
+
DBcache.save(dataset, data, [type] * orders.first.length)
|
39
|
+
end
|
111
40
|
|
41
|
+
# Save dataset, all instances, cross_platform if available.
|
42
|
+
def self.save_dataset(dataset)
|
43
|
+
save_dataset_instance(dataset, false)
|
44
|
+
save_dataset_instance(dataset, true) if MARQ::Dataset.has_cross_platform?(dataset)
|
45
|
+
nil
|
46
|
+
end
|
47
|
+
|
48
|
+
def self.save_platform(platform)
|
49
|
+
datasets = MARQ::Platform.datasets(platform).sort
|
50
|
+
return if datasets.empty?
|
112
51
|
|
52
|
+
DBcache.save(platform + '_codes', MARQ::Platform.codes(platform))
|
53
|
+
DBcache.save(platform + '_codes', MARQ::Platform.cross_platform(platform)) if MARQ::Platform.has_cross_platform? platform
|
113
54
|
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
return if datasets.empty?
|
119
|
-
|
120
|
-
|
121
|
-
codes = File.open(File.join(Object::GEO.platform_path(platform),'codes')).collect{|l| l.chomp.downcase}
|
122
|
-
|
123
|
-
DBcache.save(platform, codes)
|
55
|
+
datasets.sort.each do |dataset|
|
56
|
+
save_dataset(dataset)
|
57
|
+
end
|
58
|
+
end
|
124
59
|
|
125
|
-
|
126
|
-
path = Object::GEO.dataset_path(dataset)
|
127
|
-
experiments = File.open(path + '.experiments').collect{|l| l.chomp}
|
128
|
-
orders = File.open(path + '.orders').collect{|l| values = l.chomp.split(/\t/).collect{|v| v == "NA" ? nil : v.to_i };}
|
60
|
+
# {{{ Loading Positions
|
129
61
|
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
}
|
134
|
-
case
|
135
|
-
when codes.length < 65535
|
136
|
-
type = "SMALLINT UNSIGNED"
|
137
|
-
when codes.length < 16777215
|
138
|
-
type = "MEDIUMINT UNSIGNED"
|
139
|
-
else
|
140
|
-
type = "INT UNSIGNED"
|
141
|
-
end
|
62
|
+
def self.platform_entries(platform)
|
63
|
+
DBcache.num_rows(platform).to_i
|
64
|
+
end
|
142
65
|
|
143
|
-
DBcache.save(dataset + '_experiments', experiments)
|
144
|
-
DBcache.save(dataset, data, [type] * orders.first.length)
|
145
|
-
}
|
146
66
|
|
67
|
+
def self.load_positions(dataset, genes, platform_entries)
|
68
|
+
gene_positions = DBcache.load(dataset, genes)
|
69
|
+
data = {}
|
70
|
+
matched = gene_positions.keys
|
71
|
+
|
72
|
+
# Get experiments
|
73
|
+
experiments = DBcache.load(dataset + '_experiments').sort{|a,b|
|
74
|
+
a[0].to_i <=> b[0].to_i
|
75
|
+
}.collect{|p|
|
76
|
+
MARQ::Dataset.clean(dataset) + ": " + p[1].first
|
77
|
+
}
|
78
|
+
|
79
|
+
# Get scale factors (to account for genes missing in the dataset)
|
80
|
+
scale = (0..experiments.length - 1).collect{|i|
|
81
|
+
rows = DBcache.num_rows(dataset, "C#{i}");
|
82
|
+
if rows > 0
|
83
|
+
platform_entries / rows
|
84
|
+
else
|
85
|
+
nil
|
86
|
+
end
|
87
|
+
}
|
88
|
+
|
89
|
+
# Get experiment positions and scale them
|
90
|
+
experiment_x_gene = gene_positions.values.transpose
|
91
|
+
experiments.each_with_index{|experiment, i|
|
92
|
+
next if scale[i].nil? || experiment_x_gene[i].nil?
|
93
|
+
values = experiment_x_gene[i].collect{|v| v.nil? ? nil : (v.to_f * scale[i]).to_i}
|
94
|
+
data[experiment] = values
|
95
|
+
}
|
96
|
+
|
97
|
+
[data, matched, platform_entries]
|
98
|
+
end
|
99
|
+
|
100
|
+
def self.dataset_positions(dataset, genes)
|
101
|
+
return [{},[],0] if genes.empty?
|
147
102
|
|
148
|
-
|
149
|
-
|
103
|
+
genes = genes.collect{|gene| gene.downcase.strip}
|
104
|
+
platform_entries = platform_entries(dataset + '_codes').to_f
|
150
105
|
|
151
|
-
|
106
|
+
load_positions(dataset, genes, platform_entries)
|
107
|
+
end
|
152
108
|
|
153
|
-
Progress.monitor("Saving #{ platform }")
|
154
|
-
datasets.sort.each{|dataset|
|
155
|
-
path = Object::GEO.dataset_path(dataset)
|
156
|
-
next unless File.exists?(path + '_cross_platform.experiments')
|
157
|
-
experiments = File.open(path + '_cross_platform.experiments').collect{|l| l.chomp}
|
158
|
-
orders = File.open(path + '_cross_platform.orders').collect{|l| values = l.chomp.split(/\t/).collect{|v| v == "NA" ? nil : v.to_i };}
|
159
109
|
|
160
|
-
|
161
|
-
|
162
|
-
data[code.to_sym] = orders[i]
|
163
|
-
}
|
110
|
+
def self.platform_positions(platform, genes)
|
111
|
+
return [{},[],0] if genes.empty?
|
164
112
|
|
165
|
-
|
166
|
-
|
167
|
-
type = "SMALLINT UNSIGNED"
|
168
|
-
when codes.length < 16777215
|
169
|
-
type = "MEDIUMIN UNSIGNED"
|
170
|
-
else
|
171
|
-
type = "INT UNSIGNED"
|
172
|
-
end
|
113
|
+
genes = genes.collect {|gene| gene.downcase.strip }
|
114
|
+
platform_entries = platform_entries(platform).to_f
|
173
115
|
|
116
|
+
cross_platform = MARQ::Platform.is_cross_platform? platform
|
117
|
+
datasets = MARQ::Platform.datasets(platform).sort
|
174
118
|
|
175
|
-
|
176
|
-
|
177
|
-
}
|
178
|
-
end
|
119
|
+
total_data = {}
|
120
|
+
total_matched = []
|
179
121
|
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
platform_entries = platform_entries(platform).to_f
|
186
|
-
|
187
|
-
data = {}
|
188
|
-
matched = nil
|
189
|
-
|
190
|
-
datasets.each{|dataset|
|
191
|
-
dataset += '_cross_platform' if Object::GEO::is_cross_platform?(platform)
|
192
|
-
gene_positions = DBcache.load(dataset, genes)
|
193
|
-
matched ||= gene_positions.keys
|
194
|
-
|
195
|
-
experiments = DBcache.load(dataset + '_experiments').sort{|a,b|
|
196
|
-
a[0].to_i <=> b[0].to_i
|
197
|
-
}.collect{|p|
|
198
|
-
Object::GEO::clean(dataset) + ": " + p[1].first
|
199
|
-
}
|
200
|
-
|
201
|
-
scale = (0..experiments.length - 1).collect{|i|
|
202
|
-
rows = DBcache.num_rows(dataset, "C#{i}");
|
203
|
-
if rows > 0
|
204
|
-
platform_entries / rows
|
205
|
-
else
|
206
|
-
nil
|
207
|
-
end
|
208
|
-
}
|
209
|
-
|
210
|
-
gene_x_experiment = gene_positions.values
|
211
|
-
|
212
|
-
experiment_x_gene = gene_x_experiment.transpose
|
213
|
-
|
214
|
-
experiments.each_with_index{|experiment, i|
|
215
|
-
next if scale[i].nil? || experiment_x_gene[i].nil?
|
216
|
-
values = experiment_x_gene[i].collect{|v| v.nil? ? nil : (v.to_f * scale[i]).to_i}
|
217
|
-
data[experiment] = values
|
218
|
-
}
|
219
|
-
}
|
220
|
-
|
221
|
-
[data, matched]
|
122
|
+
datasets.each do |dataset|
|
123
|
+
dataset << '_cross_platform' if cross_platform
|
124
|
+
data, matched = load_positions(dataset, genes, platform_entries)
|
125
|
+
total_data = total_data.merge(data)
|
126
|
+
total_matched += matched
|
222
127
|
end
|
128
|
+
total_matched.uniq!
|
223
129
|
|
224
|
-
|
225
|
-
DBcache.num_rows(platform)
|
226
|
-
end
|
130
|
+
[total_data, total_matched, platform_entries]
|
227
131
|
end
|
228
132
|
|
229
133
|
end
|
@@ -232,7 +136,8 @@ if __FILE__ == $0
|
|
232
136
|
#CustomDS::datasets('sgd').each{|d| MADB::CustomDS::save(d)}
|
233
137
|
|
234
138
|
require 'pp'
|
235
|
-
pp MADB::
|
139
|
+
pp MADB::dataset_positions('GDS113', %w(2778))[0]
|
140
|
+
pp MADB::platform_positions('GPL54', %w(2778))[0]
|
236
141
|
#p MADB::CustomDS::positions("HaploidData",%w( YMR261c YDL140c YIL122w YPL093w YHR211w YDL142c YHR106w YOR103c YDR233c YLR181c yomeman))
|
237
142
|
#p MADB::CustomDS::positions("HaploidData_cross_platform",%w( S000002685 S000001149 S000003068 S000003153 S000003355 S000000127 S000004444 S000004875 S000001702 S000005843 S000000862))
|
238
143
|
end
|
data/lib/MARQ/annotations.rb
CHANGED
@@ -2,8 +2,12 @@ require 'inline'
|
|
2
2
|
require 'net/http'
|
3
3
|
require 'uri'
|
4
4
|
require 'MARQ'
|
5
|
-
require 'rbbt/bow/dictionary'
|
6
5
|
require 'MARQ/fdr'
|
6
|
+
require 'digest/md5'
|
7
|
+
require 'base64'
|
8
|
+
require 'rbbt/util/open'
|
9
|
+
require 'MARQ/main'
|
10
|
+
require 'rbbt/bow/dictionary'
|
7
11
|
|
8
12
|
module Annotations
|
9
13
|
class << self
|
@@ -291,18 +295,25 @@ double hypergeometric(double total, double support, double list, double found)
|
|
291
295
|
end
|
292
296
|
|
293
297
|
|
298
|
+
@@terms_cache = {}
|
299
|
+
def self.dataset_annotations(dataset, type, experiment)
|
300
|
+
annotation_dir = File.join(MARQ.datadir, (MARQ::Dataset.is_GEO?(dataset) ? 'GEO' : 'CustomDS'), 'annotations')
|
301
|
+
|
302
|
+
term_file = File.join(annotation_dir, type, MARQ::Dataset.clean(dataset))
|
303
|
+
|
304
|
+
if File.exist? term_file
|
305
|
+
@@terms_cache[term_file] ||= YAML::load(File.open(term_file))
|
306
|
+
terms = @@terms_cache[term_file]
|
307
|
+
{:dataset => (terms[:dataset] || []), :signature => (terms[experiment] || [])}
|
308
|
+
else
|
309
|
+
{:dataset => [], :signature => []}
|
310
|
+
end
|
311
|
+
end
|
294
312
|
|
295
313
|
def self.annotations(scores, type, pvalue = 0.05, algorithm = :rank)
|
296
314
|
annot = {}
|
297
315
|
relevant = []
|
298
316
|
|
299
|
-
dict_options = {}
|
300
|
-
if type == "Words"
|
301
|
-
dict_options = {:low => 0, :hi => 0.05, :limit => 100000}
|
302
|
-
else
|
303
|
-
dict_options = {:low => 0, :hi => 0.5, :limit => 100000}
|
304
|
-
end
|
305
|
-
|
306
317
|
case
|
307
318
|
when type =~ /^(.*)_direct$/
|
308
319
|
side = :direct
|
@@ -312,31 +323,31 @@ double hypergeometric(double total, double support, double list, double found)
|
|
312
323
|
type = $1
|
313
324
|
end
|
314
325
|
|
315
|
-
|
316
|
-
terms_cache = {}
|
317
326
|
scores.each{|experiment, info|
|
318
327
|
dataset = experiment.match(/^(.*?): /)[1]
|
319
328
|
name = $'.strip
|
329
|
+
|
320
330
|
case
|
321
331
|
when side.nil?
|
322
|
-
|
323
|
-
when side == :direct && info[:score]
|
324
|
-
|
332
|
+
experiment_type = type
|
333
|
+
when side == :direct && info[:score] >= 0 || side == :inverse && info[:score] < 0
|
334
|
+
experiment_type += '_up'
|
325
335
|
else
|
326
|
-
|
336
|
+
experiment_type += '_down'
|
327
337
|
end
|
328
338
|
|
329
|
-
|
330
|
-
terms_cache[term_file] ||= YAML::load(File.open(term_file))
|
331
|
-
terms = terms_cache[term_file]
|
332
|
-
annot[experiment] = {:dataset => (terms[:dataset] || []), :signature => (terms[name] || [])}
|
333
|
-
else
|
334
|
-
annot[experiment] = {:dataset => [], :signature => []}
|
335
|
-
end
|
339
|
+
annot[experiment] = dataset_annotations(dataset, experiment_type, name)
|
336
340
|
|
337
341
|
relevant << experiment if info[:pvalue] <= pvalue
|
338
342
|
}
|
339
343
|
|
344
|
+
dict_options = {}
|
345
|
+
if type == "Words"
|
346
|
+
dict_options = {:low => 0, :hi => 0.05, :limit => 100000}
|
347
|
+
else
|
348
|
+
dict_options = {:low => 0, :hi => 0.5, :limit => 100000}
|
349
|
+
end
|
350
|
+
|
340
351
|
if algorithm == :rank
|
341
352
|
ranks = scores.sort{|a,b| compare(a[1],b[1]) }.collect{|p| p[0]}
|
342
353
|
terms = enrichment_rank(annot, ranks, dict_options)
|
@@ -348,10 +359,11 @@ double hypergeometric(double total, double support, double list, double found)
|
|
348
359
|
annot.each{|key, info|
|
349
360
|
merged_annotations[key] = info[:dataset] + info[:signature]
|
350
361
|
}
|
362
|
+
|
351
363
|
[merged_annotations, terms]
|
352
364
|
end
|
353
365
|
|
354
|
-
module
|
366
|
+
module Genes
|
355
367
|
module Genecodis
|
356
368
|
ORGS = {
|
357
369
|
'sgd' => 'Sc' ,
|
@@ -435,8 +447,82 @@ double hypergeometric(double total, double support, double list, double found)
|
|
435
447
|
end
|
436
448
|
end
|
437
449
|
|
450
|
+
module SENT
|
451
|
+
|
452
|
+
class SENTError < StandardError; end
|
453
|
+
|
454
|
+
|
455
|
+
WSDL="http://sent.dacya.ucm.es/wsdl/SentWS.wsdl"
|
456
|
+
def self.driver
|
457
|
+
require 'soap/wsdlDriver'
|
458
|
+
driver = SOAP::WSDLDriverFactory.new(WSDL).create_rpc_driver
|
459
|
+
driver
|
460
|
+
end
|
461
|
+
|
462
|
+
def self.process_results(job)
|
463
|
+
result_ids = driver.results(job)
|
464
|
+
|
465
|
+
summary = YAML::load(Base64.decode64(driver.result(result_ids[0])))
|
466
|
+
ccc = Base64.decode64(driver.result(result_ids[1])).to_f
|
467
|
+
associations = Open.to_hash(StringIO.new(driver.associations(job)), :flatten => true)
|
468
|
+
|
469
|
+
summary.each do |group|
|
470
|
+
group[:articles] = group[:genes].inject(0) {|acc, gene| acc += associations[gene].length}
|
471
|
+
end
|
472
|
+
|
473
|
+
[summary, ccc]
|
474
|
+
end
|
475
|
+
|
476
|
+
@@jobs = {}
|
477
|
+
def self.analyze(organism, genes, factors)
|
478
|
+
hash = Digest::MD5.hexdigest([organism, genes.sort].inspect)
|
479
|
+
|
480
|
+
if @@jobs[hash]
|
481
|
+
orig_job = @@jobs[hash]
|
482
|
+
job = driver.refactor(orig_job, factors, 'MARQ')
|
483
|
+
else
|
484
|
+
job = driver.analyze(organism, genes, factors, 'MARQ')
|
485
|
+
orig_job = job
|
486
|
+
end
|
487
|
+
|
488
|
+
puts "#{ job }: #{ factors }"
|
489
|
+
|
490
|
+
while ! driver.done(job)
|
491
|
+
sleep 5
|
492
|
+
end
|
493
|
+
|
494
|
+
raise SENT::SENTError, "Job failed with error #{driver.messages(job).last}" if driver.error(job)
|
495
|
+
@@jobs[hash] = job
|
496
|
+
|
497
|
+
summary, ccc = process_results(orig_job)
|
498
|
+
end
|
499
|
+
|
500
|
+
def self.terms(organism, genes, num = 20)
|
501
|
+
factor_list = [2,4,8,10]
|
502
|
+
|
503
|
+
terms = {}
|
504
|
+
cccs = {}
|
505
|
+
factor_list.each do |factors|
|
506
|
+
summary, ccc = analyze(organism, genes, factors)
|
507
|
+
articles = summary.inject(0) {|acc, group| acc += group[:articles] }
|
508
|
+
terms_per_article = num.to_f / articles
|
509
|
+
summary.each{|group|
|
510
|
+
num_terms = [terms_per_article * group[:articles], group[:words].length].min
|
511
|
+
terms[factors] ||= []
|
512
|
+
terms[factors] += group[:words][0..(num_terms - 1)]
|
513
|
+
p terms
|
514
|
+
}
|
515
|
+
cccs[factors] = ccc
|
516
|
+
end
|
517
|
+
|
518
|
+
best_k = cccs.sort_by{|p| p[1]}.first[1]
|
519
|
+
|
520
|
+
terms[k]
|
521
|
+
end
|
522
|
+
end
|
523
|
+
|
438
524
|
def self.get_genes_nth(dataset, num_genes)
|
439
|
-
path = MARQ.
|
525
|
+
path = MARQ::Dataset.path(dataset)
|
440
526
|
|
441
527
|
experiments = File.open(path + '.experiments').collect{|l| l.chomp.strip}
|
442
528
|
genes = File.open(path + '.codes').collect{|l| l.chomp.strip}
|
@@ -472,7 +558,7 @@ double hypergeometric(double total, double support, double list, double found)
|
|
472
558
|
if nth_genes > 0
|
473
559
|
return get_genes_nth(dataset, nth_genes)
|
474
560
|
end
|
475
|
-
|
561
|
+
|
476
562
|
|
477
563
|
path = MARQ.dataset_path(dataset)
|
478
564
|
|
@@ -509,7 +595,7 @@ double hypergeometric(double total, double support, double list, double found)
|
|
509
595
|
values_down[name] << (p.last < 0 ? - value : 0)
|
510
596
|
}
|
511
597
|
}
|
512
|
-
|
598
|
+
|
513
599
|
genes_up = {}
|
514
600
|
genes_down = {}
|
515
601
|
|
@@ -545,7 +631,7 @@ double hypergeometric(double total, double support, double list, double found)
|
|
545
631
|
|
546
632
|
def self.get_genes_old(dataset, cut_off = 0.1, fdr = false)
|
547
633
|
|
548
|
-
path = MARQ.
|
634
|
+
path = MARQ::Dataset.path(dataset)
|
549
635
|
|
550
636
|
experiments = File.open(path + '.experiments').collect{|l| l.chomp.strip}.select{|name| !name.match(/\[ratio\]/)}
|
551
637
|
genes = File.open(path + '.codes').collect{|l| l.chomp.strip}
|
@@ -598,7 +684,7 @@ double hypergeometric(double total, double support, double list, double found)
|
|
598
684
|
def self.OBA(text)
|
599
685
|
|
600
686
|
res = Net::HTTP.post_form(URI.parse('http://rest.bioontology.org/obs_hibernate/annotator'),
|
601
|
-
|
687
|
+
{
|
602
688
|
'longestOnly'=> true,
|
603
689
|
'wholeWordOnly'=> true,
|
604
690
|
'withDefaultStopWords' => true,
|
@@ -626,11 +712,11 @@ end
|
|
626
712
|
if __FILE__ == $0
|
627
713
|
require 'pp'
|
628
714
|
|
629
|
-
|
630
|
-
|
715
|
+
|
716
|
+
|
631
717
|
exit
|
632
718
|
#Annotations::GO::Genecodis::Local.init
|
633
|
-
|
719
|
+
|
634
720
|
#genes = Annotations::GO::get_genes('GDS1916')
|
635
721
|
#genes[:up].each{|exp, genes|
|
636
722
|
# puts exp
|
@@ -647,15 +733,15 @@ if __FILE__ == $0
|
|
647
733
|
and 24 hours following treatment with 4 mg/kg body weight GH. Results provide
|
648
734
|
insight into the insulin-like growth factor-I dependent and independent
|
649
735
|
pathways that mediate the action of GH in bone.
|
650
|
-
|
736
|
+
|
651
737
|
EOT
|
652
738
|
texts << <<-EOT
|
653
|
-
|
739
|
+
|
654
740
|
Comparison of total transcription profiles for temperature-sensitive TOR2
|
655
741
|
mutant strain SH121 to its isogenic wild type counterpart SH100. Results
|
656
742
|
indicate that TOR2 inactivation leads to enhanced transcription of
|
657
743
|
Gcn4-controlled target genes.
|
658
|
-
|
744
|
+
|
659
745
|
|
660
746
|
EOT
|
661
747
|
texts << <<-EOT
|
@@ -664,8 +750,8 @@ if __FILE__ == $0
|
|
664
750
|
melanoma in situ, vertical growth phase (VGP) melanoma, and metastatic growth
|
665
751
|
phase (MGP) melanoma. Results identify expression signatures that distinguish
|
666
752
|
benign and atypical nevi and melanomas in situ from VGPs and MGPs.
|
667
|
-
|
668
|
-
|
753
|
+
|
754
|
+
|
669
755
|
EOT
|
670
756
|
texts << <<-EOT
|
671
757
|
|
@@ -678,7 +764,7 @@ if __FILE__ == $0
|
|
678
764
|
EOT
|
679
765
|
texts << <<-EOT
|
680
766
|
|
681
|
-
|
767
|
+
|
682
768
|
Analysis of anaerobic chemostat cultures of Saccharomyces cerevisae exposed
|
683
769
|
to one of several weak organic acids. Weak organic acids are used as
|
684
770
|
preservatives in food and beverages. Yeasts are able to proliferate at the
|
@@ -721,7 +807,7 @@ if __FILE__ == $0
|
|
721
807
|
key component of host defense. Results suggest most of the host response to
|
722
808
|
endotoxin or live bacteria is actually regulated independently of MyD88.
|
723
809
|
|
724
|
-
|
810
|
+
|
725
811
|
EOT
|
726
812
|
texts.reverse.each{|text|
|
727
813
|
puts "\n\n--------------\n"
|
@@ -732,7 +818,7 @@ if __FILE__ == $0
|
|
732
818
|
puts Annotations::UMLS::OBA(text).join(", ")
|
733
819
|
}
|
734
820
|
|
735
|
-
#
|
821
|
+
#
|
736
822
|
|
737
823
|
#puts Annotations.hypergeometric(2000,100,100,2)
|
738
824
|
#p Annotations::GO::annotate(MARQ.platform_organism('GDS1365'),genes[:up].collect.first.last[1..100])
|