rbbt-marq 1.1.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/R/GEO.R +6 -4
- data/R/MA.R +1 -0
- data/bin/marq_config +4 -3
- data/install_scripts/CustomDS/Rakefile +27 -215
- data/install_scripts/GEO/Rakefile +34 -275
- data/install_scripts/rake_includes.rb +236 -0
- data/lib/MARQ/CustomDS.rb +63 -32
- data/lib/MARQ/GEO.rb +99 -63
- data/lib/MARQ/MADB.rb +107 -202
- data/lib/MARQ/annotations.rb +124 -38
- data/lib/MARQ/main.rb +152 -160
- data/lib/MARQ/rankproduct.rb +20 -34
- data/tasks/install.rake +7 -2
- metadata +3 -2
data/lib/MARQ/MADB.rb
CHANGED
@@ -1,229 +1,133 @@
|
|
1
1
|
require 'MARQ'
|
2
2
|
require 'MARQ/GEO'
|
3
3
|
require 'MARQ/CustomDS'
|
4
|
+
require 'MARQ/main'
|
4
5
|
|
5
6
|
module MADB
|
6
|
-
module CustomDS
|
7
|
-
|
8
|
-
def self.save(dataset)
|
9
|
-
prefix = Object::CustomDS.path(dataset)
|
10
|
-
|
11
|
-
codes = File.open(prefix + '.codes').collect{|l| l.chomp.downcase}
|
12
|
-
|
13
|
-
DBcache.save(dataset + '_codes', codes)
|
14
|
-
|
15
|
-
experiments = File.open(prefix + '.experiments').collect{|l| l.chomp}
|
16
|
-
orders = File.open(prefix + '.orders').collect{|l| values = l.chomp.split(/\t/).collect{|v| v == "NA" ? nil : v.to_i };}
|
17
|
-
|
18
|
-
data = {}
|
19
|
-
codes.each_with_index{|code,i|
|
20
|
-
data[code.to_sym] = orders[i]
|
21
|
-
}
|
22
|
-
case
|
23
|
-
when codes.length < 65535
|
24
|
-
type = "SMALLINT UNSIGNED"
|
25
|
-
when codes.length < 16777215
|
26
|
-
type = "MEDIUMIN UNSIGNED"
|
27
|
-
else
|
28
|
-
type = "INT UNSIGNED"
|
29
|
-
end
|
30
|
-
|
31
|
-
DBcache.save(dataset + '_experiments', experiments)
|
32
|
-
DBcache.save(dataset, data, [type] * orders.first.length)
|
33
|
-
|
34
|
-
return unless Object::CustomDS::has_cross_platform?(dataset)
|
35
|
-
dataset = dataset + '_cross_platform'
|
36
|
-
prefix = Object::CustomDS.path(dataset)
|
37
|
-
|
38
|
-
codes = File.open(prefix + '.codes').collect{|l| l.chomp.downcase}
|
39
|
-
|
40
|
-
DBcache.save(dataset + '_codes', codes)
|
41
|
-
|
42
|
-
experiments = File.open(prefix + '.experiments').collect{|l| l.chomp}
|
43
|
-
orders = File.open(prefix + '.orders').collect{|l| values = l.chomp.split(/\t/).collect{|v| v == "NA" ? nil : v.to_i };}
|
44
|
-
|
45
|
-
data = {}
|
46
|
-
codes.each_with_index{|code,i|
|
47
|
-
data[code.to_sym] = orders[i]
|
48
|
-
}
|
49
|
-
case
|
50
|
-
when codes.length < 65535
|
51
|
-
type = "SMALLINT UNSIGNED"
|
52
|
-
when codes.length < 16777215
|
53
|
-
type = "MEDIUMIN UNSIGNED"
|
54
|
-
else
|
55
|
-
type = "INT UNSIGNED"
|
56
|
-
end
|
57
|
-
|
58
|
-
DBcache.save(dataset + '_experiments', experiments)
|
59
|
-
DBcache.save(dataset, data, [type] * orders.first.length)
|
60
|
-
nil
|
61
|
-
end
|
62
|
-
|
63
|
-
def self.positions(dataset, genes)
|
64
|
-
return [{},[]] if genes.empty?
|
65
|
-
genes = genes.collect{|gene| gene.downcase.strip}
|
66
|
-
|
67
|
-
platform_entries = platform_entries(dataset + '_codes').to_f
|
68
|
-
|
69
|
-
data = {}
|
70
|
-
matched = []
|
71
|
-
|
72
|
-
gene_positions = DBcache.load(dataset, genes)
|
73
|
-
matched ||= gene_positions.keys
|
74
|
-
|
75
|
-
experiments = DBcache.load(dataset + '_experiments').sort{|a,b|
|
76
|
-
a[0].to_i <=> b[0].to_i
|
77
|
-
}.collect{|p|
|
78
|
-
Object::GEO::clean(dataset) + ": " + p[1].first
|
79
|
-
}
|
80
|
-
|
81
|
-
|
82
|
-
matched = (matched + gene_positions.keys).uniq
|
83
|
-
scale = (0..experiments.length - 1).collect{|i|
|
84
|
-
rows = DBcache.num_rows(dataset, "C#{i}");
|
85
|
-
if rows > 0
|
86
|
-
platform_entries / rows
|
87
|
-
else
|
88
|
-
nil
|
89
|
-
end
|
90
|
-
}
|
91
|
-
|
92
|
-
gene_x_experiment = gene_positions.values
|
93
|
-
|
94
|
-
experiment_x_gene = gene_x_experiment.transpose
|
95
|
-
|
96
|
-
experiments.each_with_index{|experiment, i|
|
97
|
-
next if scale[i].nil? || experiment_x_gene[i].nil?
|
98
|
-
values = experiment_x_gene[i].collect{|v| v.nil? ? nil : (v.to_f * scale[i]).to_i}
|
99
|
-
data[experiment] = values
|
100
|
-
}
|
101
|
-
|
102
|
-
[data, matched]
|
103
|
-
end
|
104
7
|
|
105
|
-
|
106
|
-
|
8
|
+
# {{{ Saving Positions
|
9
|
+
|
10
|
+
# Save the actual data, cross_platform or not.
#
# Reads the '.codes', '.experiments' and '.orders' files located at
# MARQ::Dataset.path(dataset) and stores them through DBcache in three
# tables: "<dataset>_codes", "<dataset>_experiments" and "<dataset>".
#
# dataset::        dataset name, without the '_cross_platform' suffix
# cross_platform:: when true, the '_cross_platform' instance of the dataset
#                  is saved instead of the native one
def self.save_dataset_instance(dataset, cross_platform)
  dataset += '_cross_platform' if cross_platform
  prefix = MARQ::Dataset.path(dataset)

  # File.readlines closes each file once it has been read
  codes       = File.readlines(prefix + '.codes').collect{|l| l.chomp.downcase}
  experiments = File.readlines(prefix + '.experiments').collect{|l| l.chomp}
  orders      = File.readlines(prefix + '.orders').collect{|l|
    # one tab separated row per code; "NA" marks a missing position
    l.chomp.split(/\t/).collect{|v| v == "NA" ? nil : v.to_i }
  }

  # Save codes and experiments
  DBcache.save(dataset + '_codes', codes)
  DBcache.save(dataset + '_experiments', experiments)

  # Save orders: row i of the orders file belongs to code i
  data = {}
  codes.each_with_index{|code,i|
    data[code.to_sym] = orders[i]
  }

  # Column type is chosen from the number of codes in the dataset
  type = case
         when codes.length < 65535
           "SMALLINT UNSIGNED"
         when codes.length < 16777215
           "MEDIUMINT UNSIGNED"
         else
           "INT UNSIGNED"
         end

  # One column per value in an orders row
  DBcache.save(dataset, data, [type] * orders.first.length)
end
|
111
40
|
|
41
|
+
# Save every instance of a dataset: the native one always, and the
# cross_platform one when MARQ::Dataset reports that it exists.
# Always returns nil.
def self.save_dataset(dataset)
  save_dataset_instance(dataset, false)

  if MARQ::Dataset.has_cross_platform?(dataset)
    save_dataset_instance(dataset, true)
  end

  nil
end
|
47
|
+
|
48
|
+
# Save the codes of a platform and then every dataset that belongs to it.
# Does nothing when the platform has no datasets.
#
# The native codes go to "<platform>_codes". The cross-platform codes go to
# "<platform>_cross_platform_codes", following the '<name>_cross_platform'
# + '_codes' naming used when saving dataset instances, so that they do not
# overwrite the native codes.
# NOTE(review): confirm that readers of the cross-platform platform codes
# look the table up under this name.
def self.save_platform(platform)
  datasets = MARQ::Platform.datasets(platform).sort
  return if datasets.empty?

  DBcache.save(platform + '_codes', MARQ::Platform.codes(platform))

  if MARQ::Platform.has_cross_platform? platform
    DBcache.save(platform + '_cross_platform_codes', MARQ::Platform.cross_platform(platform))
  end

  # datasets is already sorted above
  datasets.each do |dataset|
    save_dataset(dataset)
  end
end
|
124
59
|
|
125
|
-
|
126
|
-
path = Object::GEO.dataset_path(dataset)
|
127
|
-
experiments = File.open(path + '.experiments').collect{|l| l.chomp}
|
128
|
-
orders = File.open(path + '.orders').collect{|l| values = l.chomp.split(/\t/).collect{|v| v == "NA" ? nil : v.to_i };}
|
60
|
+
# {{{ Loading Positions
|
129
61
|
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
}
|
134
|
-
case
|
135
|
-
when codes.length < 65535
|
136
|
-
type = "SMALLINT UNSIGNED"
|
137
|
-
when codes.length < 16777215
|
138
|
-
type = "MEDIUMINT UNSIGNED"
|
139
|
-
else
|
140
|
-
type = "INT UNSIGNED"
|
141
|
-
end
|
62
|
+
# Number of rows DBcache reports for the given table name, as an integer.
def self.platform_entries(platform)
  row_count = DBcache.num_rows(platform)
  row_count.to_i
end
|
142
65
|
|
143
|
-
DBcache.save(dataset + '_experiments', experiments)
|
144
|
-
DBcache.save(dataset, data, [type] * orders.first.length)
|
145
|
-
}
|
146
66
|
|
67
|
+
# Load the positions of +genes+ in every experiment of +dataset+ from
# DBcache and rescale them by platform_entries / rows-in-that-experiment.
#
# Returns [data, matched, platform_entries]: data maps
# "<clean dataset name>: <experiment>" to the scaled positions (nil where a
# position is missing), and matched holds the keys DBcache returned for the
# requested genes.
def self.load_positions(dataset, genes, platform_entries)
  gene_positions = DBcache.load(dataset, genes)
  matched = gene_positions.keys

  # Get experiments, ordered by their numeric key
  stored = DBcache.load(dataset + '_experiments').sort{|a,b| a[0].to_i <=> b[0].to_i }
  experiments = stored.collect{|pair| MARQ::Dataset.clean(dataset) + ": " + pair[1].first }

  # Get scale factors (to account for genes missing in the dataset);
  # nil for an experiment column that has no rows
  scale = []
  experiments.each_index do |i|
    rows = DBcache.num_rows(dataset, "C#{i}")
    scale << (rows > 0 ? platform_entries / rows : nil)
  end

  # Get experiment positions and scale them
  columns = gene_positions.values.transpose
  data = {}
  experiments.each_with_index do |experiment, i|
    factor = scale[i]
    column = columns[i]
    next if factor.nil? || column.nil?

    data[experiment] = column.collect{|v| v.nil? ? nil : (v.to_f * factor).to_i }
  end

  [data, matched, platform_entries]
end
|
99
|
+
|
100
|
+
# Positions of +genes+ in a single dataset. Gene names are stripped and
# lowercased before the lookup; the scaling total is the row count of the
# "<dataset>_codes" table. An empty gene list gives [{}, [], 0].
def self.dataset_positions(dataset, genes)
  if genes.empty?
    [{},[],0]
  else
    normalized    = genes.collect{|gene| gene.downcase.strip }
    total_entries = platform_entries(dataset + '_codes').to_f

    load_positions(dataset, normalized, total_entries)
  end
end
|
152
108
|
|
153
|
-
Progress.monitor("Saving #{ platform }")
|
154
|
-
datasets.sort.each{|dataset|
|
155
|
-
path = Object::GEO.dataset_path(dataset)
|
156
|
-
next unless File.exists?(path + '_cross_platform.experiments')
|
157
|
-
experiments = File.open(path + '_cross_platform.experiments').collect{|l| l.chomp}
|
158
|
-
orders = File.open(path + '_cross_platform.orders').collect{|l| values = l.chomp.split(/\t/).collect{|v| v == "NA" ? nil : v.to_i };}
|
159
109
|
|
160
|
-
|
161
|
-
|
162
|
-
data[code.to_sym] = orders[i]
|
163
|
-
}
|
110
|
+
# Positions of +genes+ across every dataset of a platform.
#
# Gene names are stripped and lowercased before the lookup. When
# MARQ::Platform.is_cross_platform? holds for +platform+, the
# '_cross_platform' instance of each dataset is queried.
#
# Returns [data, matched, platform_entries]: the merged per-experiment data
# of all datasets, the distinct matched keys, and the platform row count
# used for scaling. An empty gene list gives [{}, [], 0].
def self.platform_positions(platform, genes)
  return [{},[],0] if genes.empty?

  genes         = genes.collect {|gene| gene.downcase.strip }
  total_entries = platform_entries(platform).to_f

  cross_platform = MARQ::Platform.is_cross_platform? platform
  datasets       = MARQ::Platform.datasets(platform).sort

  total_data    = {}
  total_matched = []

  datasets.each do |dataset|
    # Build a new string: appending in place would modify the names owned
    # by MARQ::Platform.datasets (and fail on frozen strings)
    dataset += '_cross_platform' if cross_platform

    data, matched = load_positions(dataset, genes, total_entries)
    total_data.merge!(data)
    total_matched.concat(matched)
  end

  [total_data, total_matched.uniq, total_entries]
end
|
228
132
|
|
229
133
|
end
|
@@ -232,7 +136,8 @@ if __FILE__ == $0
|
|
232
136
|
#CustomDS::datasets('sgd').each{|d| MADB::CustomDS::save(d)}
|
233
137
|
|
234
138
|
require 'pp'
|
235
|
-
pp MADB::
|
139
|
+
pp MADB::dataset_positions('GDS113', %w(2778))[0]
|
140
|
+
pp MADB::platform_positions('GPL54', %w(2778))[0]
|
236
141
|
#p MADB::CustomDS::positions("HaploidData",%w( YMR261c YDL140c YIL122w YPL093w YHR211w YDL142c YHR106w YOR103c YDR233c YLR181c yomeman))
|
237
142
|
#p MADB::CustomDS::positions("HaploidData_cross_platform",%w( S000002685 S000001149 S000003068 S000003153 S000003355 S000000127 S000004444 S000004875 S000001702 S000005843 S000000862))
|
238
143
|
end
|
data/lib/MARQ/annotations.rb
CHANGED
@@ -2,8 +2,12 @@ require 'inline'
|
|
2
2
|
require 'net/http'
|
3
3
|
require 'uri'
|
4
4
|
require 'MARQ'
|
5
|
-
require 'rbbt/bow/dictionary'
|
6
5
|
require 'MARQ/fdr'
|
6
|
+
require 'digest/md5'
|
7
|
+
require 'base64'
|
8
|
+
require 'rbbt/util/open'
|
9
|
+
require 'MARQ/main'
|
10
|
+
require 'rbbt/bow/dictionary'
|
7
11
|
|
8
12
|
module Annotations
|
9
13
|
class << self
|
@@ -291,18 +295,25 @@ double hypergeometric(double total, double support, double list, double found)
|
|
291
295
|
end
|
292
296
|
|
293
297
|
|
298
|
+
# Parsed annotation files, keyed by file path, so each file is read from
# disk at most once.
@@terms_cache = {}

# Annotation terms stored for one experiment of a dataset.
#
# The terms are read from the YAML file
# <MARQ.datadir>/<GEO|CustomDS>/annotations/<type>/<clean dataset name>.
#
# dataset::    dataset name
# type::       annotation type, used as the directory name
# experiment:: key of the experiment inside the annotation file
#
# Returns {:dataset => [...], :signature => [...]}: the terms stored under
# :dataset and under +experiment+. Both lists are empty when the file does
# not exist, and each one is empty when its key is missing.
def self.dataset_annotations(dataset, type, experiment)
  repository     = MARQ::Dataset.is_GEO?(dataset) ? 'GEO' : 'CustomDS'
  annotation_dir = File.join(MARQ.datadir, repository, 'annotations')

  term_file = File.join(annotation_dir, type, MARQ::Dataset.clean(dataset))

  return {:dataset => [], :signature => []} unless File.exist? term_file

  # The block form of File.open closes the file after parsing.
  # NOTE(review): YAML::load builds arbitrary objects; fine for locally
  # generated annotation files, not for files from untrusted sources.
  @@terms_cache[term_file] ||= File.open(term_file) {|file| YAML::load(file) }

  # An empty YAML file loads as false; treat it as having no terms
  terms = @@terms_cache[term_file] || {}

  {:dataset => (terms[:dataset] || []), :signature => (terms[experiment] || [])}
end
|
294
312
|
|
295
313
|
def self.annotations(scores, type, pvalue = 0.05, algorithm = :rank)
|
296
314
|
annot = {}
|
297
315
|
relevant = []
|
298
316
|
|
299
|
-
dict_options = {}
|
300
|
-
if type == "Words"
|
301
|
-
dict_options = {:low => 0, :hi => 0.05, :limit => 100000}
|
302
|
-
else
|
303
|
-
dict_options = {:low => 0, :hi => 0.5, :limit => 100000}
|
304
|
-
end
|
305
|
-
|
306
317
|
case
|
307
318
|
when type =~ /^(.*)_direct$/
|
308
319
|
side = :direct
|
@@ -312,31 +323,31 @@ double hypergeometric(double total, double support, double list, double found)
|
|
312
323
|
type = $1
|
313
324
|
end
|
314
325
|
|
315
|
-
|
316
|
-
terms_cache = {}
|
317
326
|
scores.each{|experiment, info|
|
318
327
|
dataset = experiment.match(/^(.*?): /)[1]
|
319
328
|
name = $'.strip
|
329
|
+
|
320
330
|
case
|
321
331
|
when side.nil?
|
322
|
-
|
323
|
-
when side == :direct && info[:score]
|
324
|
-
|
332
|
+
experiment_type = type
|
333
|
+
when side == :direct && info[:score] >= 0 || side == :inverse && info[:score] < 0
|
334
|
+
experiment_type += '_up'
|
325
335
|
else
|
326
|
-
|
336
|
+
experiment_type += '_down'
|
327
337
|
end
|
328
338
|
|
329
|
-
|
330
|
-
terms_cache[term_file] ||= YAML::load(File.open(term_file))
|
331
|
-
terms = terms_cache[term_file]
|
332
|
-
annot[experiment] = {:dataset => (terms[:dataset] || []), :signature => (terms[name] || [])}
|
333
|
-
else
|
334
|
-
annot[experiment] = {:dataset => [], :signature => []}
|
335
|
-
end
|
339
|
+
annot[experiment] = dataset_annotations(dataset, experiment_type, name)
|
336
340
|
|
337
341
|
relevant << experiment if info[:pvalue] <= pvalue
|
338
342
|
}
|
339
343
|
|
344
|
+
dict_options = {}
|
345
|
+
if type == "Words"
|
346
|
+
dict_options = {:low => 0, :hi => 0.05, :limit => 100000}
|
347
|
+
else
|
348
|
+
dict_options = {:low => 0, :hi => 0.5, :limit => 100000}
|
349
|
+
end
|
350
|
+
|
340
351
|
if algorithm == :rank
|
341
352
|
ranks = scores.sort{|a,b| compare(a[1],b[1]) }.collect{|p| p[0]}
|
342
353
|
terms = enrichment_rank(annot, ranks, dict_options)
|
@@ -348,10 +359,11 @@ double hypergeometric(double total, double support, double list, double found)
|
|
348
359
|
annot.each{|key, info|
|
349
360
|
merged_annotations[key] = info[:dataset] + info[:signature]
|
350
361
|
}
|
362
|
+
|
351
363
|
[merged_annotations, terms]
|
352
364
|
end
|
353
365
|
|
354
|
-
module
|
366
|
+
module Genes
|
355
367
|
module Genecodis
|
356
368
|
ORGS = {
|
357
369
|
'sgd' => 'Sc' ,
|
@@ -435,8 +447,82 @@ double hypergeometric(double total, double support, double list, double found)
|
|
435
447
|
end
|
436
448
|
end
|
437
449
|
|
450
|
+
# Client for the remote SENT service described by the WSDL below: submits
# gene lists for analysis, waits for the job and turns its results into
# lists of terms.
module SENT

  # Raised when the service reports an error for a job.
  class SENTError < StandardError; end

  WSDL="http://sent.dacya.ucm.es/wsdl/SentWS.wsdl"

  # Build a new SOAP RPC driver from the WSDL. soap/wsdlDriver is required
  # here, on first use, rather than when the file is loaded.
  def self.driver
    require 'soap/wsdlDriver'
    SOAP::WSDLDriverFactory.new(WSDL).create_rpc_driver
  end

  # Fetch and decode the results of a job.
  #
  # Returns [summary, ccc]. summary is the YAML decoded first result, a
  # list of groups; each group gets an :articles entry with the number of
  # associations summed over its :genes. ccc is the second result as a
  # float.
  def self.process_results(job)
    # One driver for all the calls below instead of a new one per call
    sent = driver
    result_ids = sent.results(job)

    # NOTE(review): YAML::load on data received from the remote service;
    # it can build arbitrary objects, so the service has to be trusted.
    summary = YAML::load(Base64.decode64(sent.result(result_ids[0])))
    ccc = Base64.decode64(sent.result(result_ids[1])).to_f
    associations = Open.to_hash(StringIO.new(sent.associations(job)), :flatten => true)

    summary.each do |group|
      # Genes without an entry in the associations count as zero articles
      group[:articles] = group[:genes].inject(0) {|acc, gene| acc + (associations[gene] || []).length }
    end

    [summary, ccc]
  end

  # Last job seen for each (organism, genes) pair, keyed by an MD5 of both.
  @@jobs = {}

  # Run an analysis with the given number of factors and wait for it.
  #
  # The first request for an (organism, genes) pair starts a new job; later
  # requests ask the service to refactor the job stored for that pair.
  # Returns [summary, ccc] as produced by process_results.
  # Raises SENT::SENTError when the service reports an error for the job.
  def self.analyze(organism, genes, factors)
    hash = Digest::MD5.hexdigest([organism, genes.sort].inspect)
    sent = driver

    if @@jobs[hash]
      orig_job = @@jobs[hash]
      job = sent.refactor(orig_job, factors, 'MARQ')
    else
      job = sent.analyze(organism, genes, factors, 'MARQ')
      orig_job = job
    end

    puts "#{ job }: #{ factors }"

    # Poll every 5 seconds until the service reports the job as done
    sleep 5 until sent.done(job)

    raise SENT::SENTError, "Job failed with error #{sent.messages(job).last}" if sent.error(job)
    @@jobs[hash] = job

    # NOTE(review): after a refactor this reads the results of the job that
    # was refactored (orig_job), not of the new job -- confirm this is what
    # the service expects.
    process_results(orig_job)
  end

  # Terms describing a list of genes.
  #
  # Runs the analysis with 2, 4, 8 and 10 factors. For each run, about
  # +num+ terms are taken in total, shared among the groups in proportion
  # to their :articles count (rounded down, never more than the group has).
  # Returns the term list of the run with the lowest ccc.
  # NOTE(review): the selection keeps the original ascending sort + first;
  # confirm that the lowest ccc, not the highest, marks the best run.
  def self.terms(organism, genes, num = 20)
    factor_list = [2,4,8,10]

    terms = {}
    cccs = {}
    factor_list.each do |factors|
      summary, ccc = analyze(organism, genes, factors)
      articles = summary.inject(0) {|acc, group| acc + group[:articles] }

      terms[factors] = []
      summary.each{|group|
        # No articles at all means no group gets a share
        share = articles > 0 ? num.to_f * group[:articles] / articles : 0
        num_terms = [share.floor, group[:words].length].min
        terms[factors] += group[:words].first(num_terms)
      }
      cccs[factors] = ccc
    end

    # min_by yields [factors, ccc] pairs; keep the factors of the winner
    best_k = cccs.min_by{|_factors, value| value }.first

    terms[best_k]
  end
end
|
523
|
+
|
438
524
|
def self.get_genes_nth(dataset, num_genes)
|
439
|
-
path = MARQ.
|
525
|
+
path = MARQ::Dataset.path(dataset)
|
440
526
|
|
441
527
|
experiments = File.open(path + '.experiments').collect{|l| l.chomp.strip}
|
442
528
|
genes = File.open(path + '.codes').collect{|l| l.chomp.strip}
|
@@ -472,7 +558,7 @@ double hypergeometric(double total, double support, double list, double found)
|
|
472
558
|
if nth_genes > 0
|
473
559
|
return get_genes_nth(dataset, nth_genes)
|
474
560
|
end
|
475
|
-
|
561
|
+
|
476
562
|
|
477
563
|
path = MARQ.dataset_path(dataset)
|
478
564
|
|
@@ -509,7 +595,7 @@ double hypergeometric(double total, double support, double list, double found)
|
|
509
595
|
values_down[name] << (p.last < 0 ? - value : 0)
|
510
596
|
}
|
511
597
|
}
|
512
|
-
|
598
|
+
|
513
599
|
genes_up = {}
|
514
600
|
genes_down = {}
|
515
601
|
|
@@ -545,7 +631,7 @@ double hypergeometric(double total, double support, double list, double found)
|
|
545
631
|
|
546
632
|
def self.get_genes_old(dataset, cut_off = 0.1, fdr = false)
|
547
633
|
|
548
|
-
path = MARQ.
|
634
|
+
path = MARQ::Dataset.path(dataset)
|
549
635
|
|
550
636
|
experiments = File.open(path + '.experiments').collect{|l| l.chomp.strip}.select{|name| !name.match(/\[ratio\]/)}
|
551
637
|
genes = File.open(path + '.codes').collect{|l| l.chomp.strip}
|
@@ -598,7 +684,7 @@ double hypergeometric(double total, double support, double list, double found)
|
|
598
684
|
def self.OBA(text)
|
599
685
|
|
600
686
|
res = Net::HTTP.post_form(URI.parse('http://rest.bioontology.org/obs_hibernate/annotator'),
|
601
|
-
|
687
|
+
{
|
602
688
|
'longestOnly'=> true,
|
603
689
|
'wholeWordOnly'=> true,
|
604
690
|
'withDefaultStopWords' => true,
|
@@ -626,11 +712,11 @@ end
|
|
626
712
|
if __FILE__ == $0
|
627
713
|
require 'pp'
|
628
714
|
|
629
|
-
|
630
|
-
|
715
|
+
|
716
|
+
|
631
717
|
exit
|
632
718
|
#Annotations::GO::Genecodis::Local.init
|
633
|
-
|
719
|
+
|
634
720
|
#genes = Annotations::GO::get_genes('GDS1916')
|
635
721
|
#genes[:up].each{|exp, genes|
|
636
722
|
# puts exp
|
@@ -647,15 +733,15 @@ if __FILE__ == $0
|
|
647
733
|
and 24 hours following treatment with 4 mg/kg body weight GH. Results provide
|
648
734
|
insight into the insulin-like growth factor-I dependent and independent
|
649
735
|
pathways that mediate the action of GH in bone.
|
650
|
-
|
736
|
+
|
651
737
|
EOT
|
652
738
|
texts << <<-EOT
|
653
|
-
|
739
|
+
|
654
740
|
Comparison of total transcription profiles for temperature-sensitive TOR2
|
655
741
|
mutant strain SH121 to its isogenic wild type counterpart SH100. Results
|
656
742
|
indicate that TOR2 inactivation leads to enhanced transcription of
|
657
743
|
Gcn4-controlled target genes.
|
658
|
-
|
744
|
+
|
659
745
|
|
660
746
|
EOT
|
661
747
|
texts << <<-EOT
|
@@ -664,8 +750,8 @@ if __FILE__ == $0
|
|
664
750
|
melanoma in situ, vertical growth phase (VGP) melanoma, and metastatic growth
|
665
751
|
phase (MGP) melanoma. Results identify expression signatures that distinguish
|
666
752
|
benign and atypical nevi and melanomas in situ from VGPs and MGPs.
|
667
|
-
|
668
|
-
|
753
|
+
|
754
|
+
|
669
755
|
EOT
|
670
756
|
texts << <<-EOT
|
671
757
|
|
@@ -678,7 +764,7 @@ if __FILE__ == $0
|
|
678
764
|
EOT
|
679
765
|
texts << <<-EOT
|
680
766
|
|
681
|
-
|
767
|
+
|
682
768
|
Analysis of anaerobic chemostat cultures of Saccharomyces cerevisae exposed
|
683
769
|
to one of several weak organic acids. Weak organic acids are used as
|
684
770
|
preservatives in food and beverages. Yeasts are able to proliferate at the
|
@@ -721,7 +807,7 @@ if __FILE__ == $0
|
|
721
807
|
key component of host defense. Results suggest most of the host response to
|
722
808
|
endotoxin or live bacteria is actually regulated independently of MyD88.
|
723
809
|
|
724
|
-
|
810
|
+
|
725
811
|
EOT
|
726
812
|
texts.reverse.each{|text|
|
727
813
|
puts "\n\n--------------\n"
|
@@ -732,7 +818,7 @@ if __FILE__ == $0
|
|
732
818
|
puts Annotations::UMLS::OBA(text).join(", ")
|
733
819
|
}
|
734
820
|
|
735
|
-
#
|
821
|
+
#
|
736
822
|
|
737
823
|
#puts Annotations.hypergeometric(2000,100,100,2)
|
738
824
|
#p Annotations::GO::annotate(MARQ.platform_organism('GDS1365'),genes[:up].collect.first.last[1..100])
|