rbbt-marq 2.0.0 → 2.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/R/CustomDS.R +7 -24
- data/R/GEO.R +1 -21
- data/R/MA.R +253 -223
- data/bin/marq_config +14 -10
- data/install_scripts/CustomDS/Rakefile +1 -1
- data/install_scripts/GEO/Rakefile +2 -1
- data/install_scripts/GEO/series/GSE1814.yaml +44 -0
- data/install_scripts/GEO/series/GSE21.yaml +44 -0
- data/install_scripts/GEO/series/GSE27.yaml +22 -0
- data/install_scripts/GEO/series/GSE5470.yaml +19 -0
- data/install_scripts/rake_includes.rb +22 -5
- data/lib/MARQ/CustomDS.rb +28 -32
- data/lib/MARQ/GEO.rb +77 -91
- data/lib/MARQ/ID.rb +1 -2
- data/lib/MARQ/MADB.rb +31 -25
- data/lib/MARQ/annotations.rb +3 -3
- data/lib/MARQ/main.rb +85 -26
- data/lib/MARQ/rankproduct.rb +14 -8
- metadata +6 -2
data/lib/MARQ/ID.rb
CHANGED
@@ -36,12 +36,11 @@ module ID
|
|
36
36
|
|
37
37
|
other = codes[field]
|
38
38
|
next if other.nil? || other == ""
|
39
|
-
|
40
39
|
|
41
40
|
#codes.collect{|c| c.split("|")}.flatten.compact.select{|c| c != ""}.uniq.each{|code|
|
42
41
|
other.split("|").each{|code|
|
43
42
|
begin
|
44
|
-
DBcache.fast_add(tablename, code.downcase, [native])
|
43
|
+
DBcache.fast_add(tablename, code.strip.downcase, [native])
|
45
44
|
rescue
|
46
45
|
puts $!.message
|
47
46
|
end
|
data/lib/MARQ/MADB.rb
CHANGED
@@ -8,50 +8,57 @@ module MADB
|
|
8
8
|
# {{{ Saving Positions
|
9
9
|
|
10
10
|
# Save the actual data, cross_platform or not
|
11
|
-
def self.save_dataset_instance(dataset
|
12
|
-
dataset += '_cross_platform' if cross_platform
|
13
|
-
prefix = MARQ::Dataset.path(dataset)
|
11
|
+
def self.save_dataset_instance(dataset)
|
14
12
|
|
15
|
-
#
|
16
|
-
codes
|
17
|
-
experiments
|
18
|
-
orders
|
13
|
+
# Get info
|
14
|
+
codes = MARQ::Dataset.codes(dataset);
|
15
|
+
experiments = MARQ::Dataset.experiments(dataset);
|
16
|
+
orders = MARQ::Dataset.orders(dataset).values_at(*experiments).transpose;
|
19
17
|
|
20
18
|
# Save codes and experiments
|
21
19
|
DBcache.save(dataset + '_codes', codes)
|
22
20
|
DBcache.save(dataset + '_experiments', experiments)
|
23
21
|
|
24
|
-
#
|
22
|
+
# Assign orders to codes
|
25
23
|
data = {}
|
26
24
|
codes.each_with_index{|code,i|
|
27
|
-
data[code
|
25
|
+
data[code] = orders[i]
|
28
26
|
}
|
27
|
+
|
28
|
+
# Save orders
|
29
29
|
case
|
30
30
|
when codes.length < 65535
|
31
31
|
type = "SMALLINT UNSIGNED"
|
32
32
|
when codes.length < 16777215
|
33
|
-
type = "
|
33
|
+
type = "MEDIUMINT UNSIGNED"
|
34
34
|
else
|
35
35
|
type = "INT UNSIGNED"
|
36
36
|
end
|
37
|
-
|
38
37
|
DBcache.save(dataset, data, [type] * orders.first.length)
|
38
|
+
|
39
39
|
end
|
40
40
|
|
41
41
|
# Save dataset, all instances, cross_platform if available.
|
42
42
|
def self.save_dataset(dataset)
|
43
|
-
save_dataset_instance(dataset
|
44
|
-
save_dataset_instance(dataset
|
43
|
+
save_dataset_instance(dataset)
|
44
|
+
save_dataset_instance(MARQ::Name.cross_platform(dataset)) if MARQ::Dataset.has_cross_platform?(dataset)
|
45
45
|
nil
|
46
46
|
end
|
47
|
+
|
48
|
+
def self.save_platform_instance(platform)
|
49
|
+
DBcache.save(platform + '_codes',
|
50
|
+
MARQ::Platform.is_cross_platform?(platform) ?
|
51
|
+
MARQ::Platform.cross_platform(platform) :
|
52
|
+
MARQ::Platform.codes(platform))
|
53
|
+
end
|
47
54
|
|
48
55
|
def self.save_platform(platform)
|
49
56
|
datasets = MARQ::Platform.datasets(platform).sort
|
50
57
|
return if datasets.empty?
|
51
58
|
|
52
|
-
|
53
|
-
|
54
|
-
|
59
|
+
save_platform_instance(platform)
|
60
|
+
save_platform_instance(MARQ::Name.cross_platform(platform)) if MARQ::Platform.has_cross_platform?(platform)
|
61
|
+
|
55
62
|
datasets.sort.each do |dataset|
|
56
63
|
save_dataset(dataset)
|
57
64
|
end
|
@@ -60,10 +67,9 @@ module MADB
|
|
60
67
|
# {{{ Loading Positions
|
61
68
|
|
62
69
|
def self.platform_entries(platform)
|
63
|
-
DBcache.num_rows(platform)
|
70
|
+
DBcache.num_rows(platform + '_codes')
|
64
71
|
end
|
65
72
|
|
66
|
-
|
67
73
|
def self.load_positions(dataset, genes, platform_entries)
|
68
74
|
gene_positions = DBcache.load(dataset, genes)
|
69
75
|
data = {}
|
@@ -73,21 +79,21 @@ module MADB
|
|
73
79
|
experiments = DBcache.load(dataset + '_experiments').sort{|a,b|
|
74
80
|
a[0].to_i <=> b[0].to_i
|
75
81
|
}.collect{|p|
|
76
|
-
MARQ::
|
82
|
+
MARQ::Name.clean(dataset) + ": " + p[1].first
|
77
83
|
}
|
78
84
|
|
79
85
|
# Get scale factors (to account for genes missing in the dataset)
|
80
86
|
scale = (0..experiments.length - 1).collect{|i|
|
81
87
|
rows = DBcache.num_rows(dataset, "C#{i}");
|
82
88
|
if rows > 0
|
83
|
-
platform_entries / rows
|
89
|
+
platform_entries.to_f / rows
|
84
90
|
else
|
85
91
|
nil
|
86
92
|
end
|
87
93
|
}
|
88
94
|
|
89
95
|
# Get experiment positions and scale them
|
90
|
-
experiment_x_gene = gene_positions.
|
96
|
+
experiment_x_gene = gene_positions.values_at(*matched).transpose
|
91
97
|
experiments.each_with_index{|experiment, i|
|
92
98
|
next if scale[i].nil? || experiment_x_gene[i].nil?
|
93
99
|
values = experiment_x_gene[i].collect{|v| v.nil? ? nil : (v.to_f * scale[i]).to_i}
|
@@ -100,8 +106,8 @@ module MADB
|
|
100
106
|
def self.dataset_positions(dataset, genes)
|
101
107
|
return [{},[],0] if genes.empty?
|
102
108
|
|
103
|
-
genes = genes.collect{|gene| gene.downcase.strip}
|
104
|
-
platform_entries = platform_entries(dataset
|
109
|
+
genes = genes.collect{|gene| gene.to_s.downcase.strip}
|
110
|
+
platform_entries = platform_entries(dataset)
|
105
111
|
|
106
112
|
load_positions(dataset, genes, platform_entries)
|
107
113
|
end
|
@@ -111,7 +117,7 @@ module MADB
|
|
111
117
|
return [{},[],0] if genes.empty?
|
112
118
|
|
113
119
|
genes = genes.collect {|gene| gene.downcase.strip }
|
114
|
-
platform_entries = platform_entries(platform)
|
120
|
+
platform_entries = platform_entries(platform)
|
115
121
|
|
116
122
|
cross_platform = MARQ::Platform.is_cross_platform? platform
|
117
123
|
datasets = MARQ::Platform.datasets(platform).sort
|
@@ -120,7 +126,7 @@ module MADB
|
|
120
126
|
total_matched = []
|
121
127
|
|
122
128
|
datasets.each do |dataset|
|
123
|
-
dataset
|
129
|
+
dataset = MARQ::Name.cross_platform dataset if cross_platform
|
124
130
|
data, matched = load_positions(dataset, genes, platform_entries)
|
125
131
|
total_data = total_data.merge(data)
|
126
132
|
total_matched += matched
|
data/lib/MARQ/annotations.rb
CHANGED
@@ -299,7 +299,7 @@ double hypergeometric(double total, double support, double list, double found)
|
|
299
299
|
def self.dataset_annotations(dataset, type, experiment)
|
300
300
|
annotation_dir = File.join(MARQ.datadir, (MARQ::Dataset.is_GEO?(dataset) ? 'GEO' : 'CustomDS'), 'annotations')
|
301
301
|
|
302
|
-
term_file = File.join(annotation_dir, type, MARQ::
|
302
|
+
term_file = File.join(annotation_dir, type, MARQ::Name.clean(dataset))
|
303
303
|
|
304
304
|
if File.exist? term_file
|
305
305
|
@@terms_cache[term_file] ||= YAML::load(File.open(term_file))
|
@@ -331,9 +331,9 @@ double hypergeometric(double total, double support, double list, double found)
|
|
331
331
|
when side.nil?
|
332
332
|
experiment_type = type
|
333
333
|
when side == :direct && info[:score] >= 0 || side == :inverse && info[:score] < 0
|
334
|
-
experiment_type
|
334
|
+
experiment_type = type + '_up'
|
335
335
|
else
|
336
|
-
experiment_type
|
336
|
+
experiment_type = type + '_down'
|
337
337
|
end
|
338
338
|
|
339
339
|
annot[experiment] = dataset_annotations(dataset, experiment_type, name)
|
data/lib/MARQ/main.rb
CHANGED
@@ -3,21 +3,39 @@ require 'MARQ/MADB'
|
|
3
3
|
require 'MARQ/score'
|
4
4
|
|
5
5
|
module MARQ
|
6
|
+
module Name
|
7
|
+
def self.clean(name)
|
8
|
+
name.sub(/_cross_platform/,'') unless name.nil?
|
9
|
+
end
|
10
|
+
|
11
|
+
def self.cross_platform(name)
|
12
|
+
if name =~ /_cross_platform/
|
13
|
+
name
|
14
|
+
else
|
15
|
+
name + "_cross_platform"
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def self.is_cross_platform?(name)
|
20
|
+
! name.match(/_cross_platform$/).nil?
|
21
|
+
end
|
22
|
+
|
23
|
+
def self.is_ratio?(name)
|
24
|
+
! name.match(/\[ratio\]$/).nil?
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
6
28
|
module Platform
|
7
29
|
def self.is_GEO?(platform)
|
8
30
|
! platform.match(/^GPL/).nil?
|
9
31
|
end
|
10
32
|
|
11
33
|
def self.is_cross_platform?(platform)
|
12
|
-
|
13
|
-
end
|
14
|
-
|
15
|
-
def self.clean(name)
|
16
|
-
name.sub(/_cross_platform/,'') unless name.nil?
|
34
|
+
MARQ::Name.is_cross_platform? platform
|
17
35
|
end
|
18
36
|
|
19
37
|
def self.path(platform)
|
20
|
-
platform = clean(platform)
|
38
|
+
platform = MARQ::Name.clean(platform)
|
21
39
|
if is_GEO? platform
|
22
40
|
GEO.platform_path(platform)
|
23
41
|
else
|
@@ -25,6 +43,10 @@ module MARQ
|
|
25
43
|
end
|
26
44
|
end
|
27
45
|
|
46
|
+
def self.exists?(platform)
|
47
|
+
path(platform) != nil
|
48
|
+
end
|
49
|
+
|
28
50
|
def self.has_cross_platform?(platform)
|
29
51
|
File.exists? File.join(path(platform), 'cross_platform')
|
30
52
|
end
|
@@ -38,17 +60,17 @@ module MARQ
|
|
38
60
|
end
|
39
61
|
|
40
62
|
def self.codes(platform)
|
41
|
-
platform = clean(platform)
|
63
|
+
platform = MARQ::Name.clean(platform)
|
42
64
|
Open.read(File.join(path(platform), 'codes')).scan(/[^\s]+/)
|
43
65
|
end
|
44
66
|
|
45
67
|
def self.cross_platform(platform)
|
46
|
-
platform = clean(platform)
|
68
|
+
platform = MARQ::Name.clean(platform)
|
47
69
|
Open.read(File.join(path(platform), 'cross_platform')).scan(/[^\s]+/)
|
48
70
|
end
|
49
71
|
|
50
72
|
def self.organism(platform)
|
51
|
-
platform = clean(platform)
|
73
|
+
platform = MARQ::Name.clean(platform)
|
52
74
|
if is_GEO? platform
|
53
75
|
GEO.platform_organism platform
|
54
76
|
else
|
@@ -57,7 +79,7 @@ module MARQ
|
|
57
79
|
end
|
58
80
|
|
59
81
|
def self.process(platform)
|
60
|
-
platform = clean(platform)
|
82
|
+
platform = MARQ::Name.clean(platform)
|
61
83
|
if is_GEO? platform
|
62
84
|
GEO.process_platform(platform)
|
63
85
|
else
|
@@ -79,10 +101,6 @@ module MARQ
|
|
79
101
|
! dataset.match(/^(?:GDS|GSE)/).nil?
|
80
102
|
end
|
81
103
|
|
82
|
-
def self.clean(name)
|
83
|
-
name.sub(/_cross_platform/,'') if name
|
84
|
-
end
|
85
|
-
|
86
104
|
def self.path(platform)
|
87
105
|
if is_GEO? platform
|
88
106
|
GEO.dataset_path(platform)
|
@@ -92,21 +110,34 @@ module MARQ
|
|
92
110
|
end
|
93
111
|
|
94
112
|
def self.exists?(dataset)
|
95
|
-
|
113
|
+
path = path(dataset)
|
114
|
+
if path.nil?
|
115
|
+
return false
|
116
|
+
else
|
117
|
+
return File.exists?(path + '.orders')
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
121
|
+
def self.broken?(dataset)
|
122
|
+
path = path(dataset)
|
123
|
+
|
124
|
+
return false if path.nil?
|
125
|
+
|
126
|
+
if File.exists?(path + '.skip')
|
127
|
+
return true
|
128
|
+
else
|
129
|
+
return false
|
130
|
+
end
|
96
131
|
end
|
97
132
|
|
98
133
|
def self.is_cross_platform?(dataset)
|
99
|
-
|
134
|
+
MARQ::Name.is_cross_platform? dataset
|
100
135
|
end
|
101
136
|
|
102
137
|
def self.has_cross_platform?(dataset)
|
103
138
|
File.exists?(path(dataset) + '_cross_platform.orders')
|
104
139
|
end
|
105
140
|
|
106
|
-
def self.exists?(dataset)
|
107
|
-
path(dataset) != nil
|
108
|
-
end
|
109
|
-
|
110
141
|
def self.info(name)
|
111
142
|
begin
|
112
143
|
title, description = Open.read(path(name) + '.description').split(/\n--\n/).values_at(0,1)
|
@@ -159,10 +190,12 @@ module MARQ
|
|
159
190
|
end
|
160
191
|
|
161
192
|
def self.read_values_t(dataset, file)
|
193
|
+
experiments = experiments(dataset).reject{|experiment| MARQ::Name.is_ratio? experiment }
|
194
|
+
|
195
|
+
return {} if experiments.empty?
|
196
|
+
|
162
197
|
result = {}
|
163
198
|
|
164
|
-
experiments = experiments(dataset).select{|experiment| experiment !~ /\[ratio\]$/ }
|
165
|
-
return {} if experiments.empty?
|
166
199
|
experiments.each{|experiment| result[experiment] = [] }
|
167
200
|
|
168
201
|
read_file(dataset, file).split(/\n/).each do |line|
|
@@ -175,7 +208,7 @@ module MARQ
|
|
175
208
|
|
176
209
|
|
177
210
|
def self.experiments(dataset)
|
178
|
-
read_file(dataset, 'experiments').split(/\n/)
|
211
|
+
read_file(dataset, 'experiments').split(/\n/).collect{|exp| exp.strip }
|
179
212
|
end
|
180
213
|
|
181
214
|
def self.codes(dataset)
|
@@ -198,29 +231,40 @@ module MARQ
|
|
198
231
|
read_values_t(dataset, 't')
|
199
232
|
end
|
200
233
|
|
234
|
+
def self.codes_for(dataset, type, experiment)
|
235
|
+
codes = codes(dataset)
|
236
|
+
values = send(type, dataset)[experiment]
|
237
|
+
Hash[*codes.zip(values).reject{|p| p.last.nil? }.flatten]
|
238
|
+
end
|
239
|
+
|
201
240
|
end
|
202
241
|
|
203
242
|
module RankQuery
|
204
243
|
def self.complete_positions(positions, matched, genes)
|
244
|
+
matched = matched.collect{|gene| gene.strip.downcase}
|
245
|
+
genes = genes.collect{|gene| gene.strip.downcase}
|
246
|
+
|
205
247
|
pos = Hash[*matched.zip(positions).flatten]
|
248
|
+
|
206
249
|
complete = genes.collect{|gene|
|
207
|
-
gene = gene.downcase.strip
|
208
250
|
if matched.include? gene
|
209
251
|
pos[gene] || "MISSING"
|
210
252
|
else
|
211
253
|
"NOT IN PLATFORM"
|
212
254
|
end
|
213
255
|
}
|
256
|
+
complete
|
214
257
|
end
|
215
258
|
|
216
259
|
|
217
260
|
def self.position_scores(up, down, positions_up, positions_down, platform_entries, matched_up, matched_down, missing_up, missing_down)
|
218
261
|
scores = []
|
262
|
+
|
219
263
|
positions_up.keys.each do |experiment|
|
220
264
|
score = Score.score_up_down(positions_up[experiment], positions_down[experiment], platform_entries, missing_up, missing_down)
|
221
265
|
score[:total_entries] = platform_entries
|
222
|
-
score[:positions_up] = complete_positions(positions_up[experiment], matched_up, up) if up.any?
|
223
|
-
score[:positions_down] = complete_positions(positions_down[experiment], matched_down, down) if down.any?
|
266
|
+
score[:positions_up] = complete_positions(positions_up[experiment] || [], matched_up, up) if up.any?
|
267
|
+
score[:positions_down] = complete_positions(positions_down[experiment] || [], matched_down, down) if down.any?
|
224
268
|
scores << score
|
225
269
|
end
|
226
270
|
|
@@ -248,12 +292,27 @@ module MARQ
|
|
248
292
|
positions_up, matched_up, platform_entries = MADB.platform_positions(platform, up)
|
249
293
|
missing_up = up.length - matched_up.length
|
250
294
|
|
295
|
+
|
251
296
|
positions_down, matched_down = MADB.platform_positions(platform, down)
|
252
297
|
missing_down = down.length - matched_down.length
|
253
298
|
|
254
299
|
position_scores(up, down, positions_up, positions_down, platform_entries, matched_up, matched_down, missing_up, missing_down)
|
255
300
|
end
|
256
301
|
|
302
|
+
def self.organism_scores(organism, up, down)
|
303
|
+
platforms = MARQ::Platform.organism_platforms(organism).
|
304
|
+
select {|p| MARQ::Platform.has_cross_platform? p }.
|
305
|
+
collect {|p| MARQ::Name.cross_platform p }
|
306
|
+
|
307
|
+
total_scores = {}
|
308
|
+
platforms.each do |platform|
|
309
|
+
scores = platform_scores(platform, up, down)
|
310
|
+
total_scores.merge!(scores)
|
311
|
+
end
|
312
|
+
|
313
|
+
total_scores
|
314
|
+
end
|
315
|
+
|
257
316
|
end
|
258
317
|
end
|
259
318
|
|
data/lib/MARQ/rankproduct.rb
CHANGED
@@ -17,7 +17,7 @@ module RankProduct
|
|
17
17
|
orders = MARQ::Dataset.orders(dataset)[experiment.strip]
|
18
18
|
|
19
19
|
if invert
|
20
|
-
num_genes =
|
20
|
+
num_genes = codes.length + 1
|
21
21
|
orders.collect! {|pos| pos.nil? ? nil : num_genes - pos }
|
22
22
|
end
|
23
23
|
|
@@ -31,7 +31,7 @@ module RankProduct
|
|
31
31
|
log_sizes = signature_sizes.collect{|size| Math::log(size)}
|
32
32
|
gene_ranks.each{|gene, positions|
|
33
33
|
scores[gene] = positions.zip(log_sizes).
|
34
|
-
collect{|p| Math::log(p[0]) - p[1]}.
|
34
|
+
collect{|p| Math::log(p[0]) - p[1]}. # Take log of the position and subtract the log size (normalize)
|
35
35
|
inject(0){|acc, v| acc += v }
|
36
36
|
}
|
37
37
|
scores
|
@@ -70,6 +70,7 @@ module RankProduct
|
|
70
70
|
:cross_platform => false,
|
71
71
|
}.merge(options).values_at(:invert, :from_FC, :cross_platform)
|
72
72
|
|
73
|
+
# Gather gene ranks from signatures
|
73
74
|
ranks = {}
|
74
75
|
signatures.each{|signature|
|
75
76
|
dataset, experiment = signature.match(/^([^\:]*): (.*)/).values_at(1,2)
|
@@ -77,6 +78,7 @@ module RankProduct
|
|
77
78
|
ranks[signature] = self.ranks(dataset, experiment, from_FC, invert.include?(signature))
|
78
79
|
}
|
79
80
|
|
81
|
+
# Invert the hash, from signature keys to gene keys
|
80
82
|
gene_ranks = {}
|
81
83
|
sizes = []
|
82
84
|
ranks.each{|signature, orders|
|
@@ -88,32 +90,36 @@ module RankProduct
|
|
88
90
|
}
|
89
91
|
}
|
90
92
|
|
93
|
+
# Remove incomplete genes
|
91
94
|
gene_ranks.delete_if{|code, positions| positions.length != signatures.uniq.length}
|
92
95
|
|
96
|
+
# Compute scores
|
93
97
|
scores = score(gene_ranks, sizes)
|
94
|
-
num_permutations = 50000
|
95
98
|
|
99
|
+
# Compute permutations
|
100
|
+
num_permutations = 50000
|
96
101
|
permutation_scores = permutations(sizes.length, num_permutations)
|
97
|
-
|
98
102
|
permutation_scores = permutation_scores.sort
|
99
103
|
|
100
104
|
|
105
|
+
# Compute p-values from permutations
|
101
106
|
results = {}
|
102
|
-
scores.each{|gene, score|
|
107
|
+
scores.each {|gene, score|
|
103
108
|
pos = permutation_scores.count_smaller(score)
|
104
109
|
results[gene] = [score, pos.to_f / num_permutations]
|
105
110
|
}
|
106
111
|
|
107
|
-
|
112
|
+
# Complete the information with pfp
|
108
113
|
num_genes = results.length
|
109
|
-
results.sort{|a,b|
|
114
|
+
results.sort {|a,b|
|
110
115
|
a[1][0] <=> b[1][0]
|
111
116
|
}.each_with_index{|p,i|
|
112
|
-
gene = p[0]
|
113
117
|
info = p[1]
|
114
118
|
pvalue = info[1]
|
119
|
+
|
115
120
|
pfp = pvalue * num_genes / (i + 1)
|
116
121
|
info << pfp
|
122
|
+
|
117
123
|
}
|
118
124
|
|
119
125
|
results
|