rbbt-marq 2.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/R/CustomDS.R +7 -24
- data/R/GEO.R +1 -21
- data/R/MA.R +253 -223
- data/bin/marq_config +14 -10
- data/install_scripts/CustomDS/Rakefile +1 -1
- data/install_scripts/GEO/Rakefile +2 -1
- data/install_scripts/GEO/series/GSE1814.yaml +44 -0
- data/install_scripts/GEO/series/GSE21.yaml +44 -0
- data/install_scripts/GEO/series/GSE27.yaml +22 -0
- data/install_scripts/GEO/series/GSE5470.yaml +19 -0
- data/install_scripts/rake_includes.rb +22 -5
- data/lib/MARQ/CustomDS.rb +28 -32
- data/lib/MARQ/GEO.rb +77 -91
- data/lib/MARQ/ID.rb +1 -2
- data/lib/MARQ/MADB.rb +31 -25
- data/lib/MARQ/annotations.rb +3 -3
- data/lib/MARQ/main.rb +85 -26
- data/lib/MARQ/rankproduct.rb +14 -8
- metadata +6 -2
data/lib/MARQ/ID.rb
CHANGED
@@ -36,12 +36,11 @@ module ID
|
|
36
36
|
|
37
37
|
other = codes[field]
|
38
38
|
next if other.nil? || other == ""
|
39
|
-
|
40
39
|
|
41
40
|
#codes.collect{|c| c.split("|")}.flatten.compact.select{|c| c != ""}.uniq.each{|code|
|
42
41
|
other.split("|").each{|code|
|
43
42
|
begin
|
44
|
-
DBcache.fast_add(tablename, code.downcase, [native])
|
43
|
+
DBcache.fast_add(tablename, code.strip.downcase, [native])
|
45
44
|
rescue
|
46
45
|
puts $!.message
|
47
46
|
end
|
data/lib/MARQ/MADB.rb
CHANGED
@@ -8,50 +8,57 @@ module MADB
|
|
8
8
|
# {{{ Saving Positions
|
9
9
|
|
10
10
|
# Save the actual data, cross_platform or not
|
11
|
-
def self.save_dataset_instance(dataset
|
12
|
-
dataset += '_cross_platform' if cross_platform
|
13
|
-
prefix = MARQ::Dataset.path(dataset)
|
11
|
+
def self.save_dataset_instance(dataset)
|
14
12
|
|
15
|
-
#
|
16
|
-
codes
|
17
|
-
experiments
|
18
|
-
orders
|
13
|
+
# Get info
|
14
|
+
codes = MARQ::Dataset.codes(dataset);
|
15
|
+
experiments = MARQ::Dataset.experiments(dataset);
|
16
|
+
orders = MARQ::Dataset.orders(dataset).values_at(*experiments).transpose;
|
19
17
|
|
20
18
|
# Save codes and experiments
|
21
19
|
DBcache.save(dataset + '_codes', codes)
|
22
20
|
DBcache.save(dataset + '_experiments', experiments)
|
23
21
|
|
24
|
-
#
|
22
|
+
# Asign orders to codes
|
25
23
|
data = {}
|
26
24
|
codes.each_with_index{|code,i|
|
27
|
-
data[code
|
25
|
+
data[code] = orders[i]
|
28
26
|
}
|
27
|
+
|
28
|
+
# Save orders
|
29
29
|
case
|
30
30
|
when codes.length < 65535
|
31
31
|
type = "SMALLINT UNSIGNED"
|
32
32
|
when codes.length < 16777215
|
33
|
-
type = "
|
33
|
+
type = "MEDIUMINT UNSIGNED"
|
34
34
|
else
|
35
35
|
type = "INT UNSIGNED"
|
36
36
|
end
|
37
|
-
|
38
37
|
DBcache.save(dataset, data, [type] * orders.first.length)
|
38
|
+
|
39
39
|
end
|
40
40
|
|
41
41
|
# Save dataset, all instances, cross_platform if available.
|
42
42
|
def self.save_dataset(dataset)
|
43
|
-
save_dataset_instance(dataset
|
44
|
-
save_dataset_instance(dataset
|
43
|
+
save_dataset_instance(dataset)
|
44
|
+
save_dataset_instance(MARQ::Name.cross_platform(dataset)) if MARQ::Dataset.has_cross_platform?(dataset)
|
45
45
|
nil
|
46
46
|
end
|
47
|
+
|
48
|
+
def self.save_platform_instance(platform)
|
49
|
+
DBcache.save(platform + '_codes',
|
50
|
+
MARQ::Platform.is_cross_platform?(platform) ?
|
51
|
+
MARQ::Platform.cross_platform(platform) :
|
52
|
+
MARQ::Platform.codes(platform))
|
53
|
+
end
|
47
54
|
|
48
55
|
def self.save_platform(platform)
|
49
56
|
datasets = MARQ::Platform.datasets(platform).sort
|
50
57
|
return if datasets.empty?
|
51
58
|
|
52
|
-
|
53
|
-
|
54
|
-
|
59
|
+
save_platform_instance(platform)
|
60
|
+
save_platform_instance(MARQ::Name.cross_platform(platform)) if MARQ::Platform.has_cross_platform?(platform)
|
61
|
+
|
55
62
|
datasets.sort.each do |dataset|
|
56
63
|
save_dataset(dataset)
|
57
64
|
end
|
@@ -60,10 +67,9 @@ module MADB
|
|
60
67
|
# {{{ Loading Positions
|
61
68
|
|
62
69
|
def self.platform_entries(platform)
|
63
|
-
DBcache.num_rows(platform)
|
70
|
+
DBcache.num_rows(platform + '_codes')
|
64
71
|
end
|
65
72
|
|
66
|
-
|
67
73
|
def self.load_positions(dataset, genes, platform_entries)
|
68
74
|
gene_positions = DBcache.load(dataset, genes)
|
69
75
|
data = {}
|
@@ -73,21 +79,21 @@ module MADB
|
|
73
79
|
experiments = DBcache.load(dataset + '_experiments').sort{|a,b|
|
74
80
|
a[0].to_i <=> b[0].to_i
|
75
81
|
}.collect{|p|
|
76
|
-
MARQ::
|
82
|
+
MARQ::Name.clean(dataset) + ": " + p[1].first
|
77
83
|
}
|
78
84
|
|
79
85
|
# Get scale factors (to account for genes missing in the dataset)
|
80
86
|
scale = (0..experiments.length - 1).collect{|i|
|
81
87
|
rows = DBcache.num_rows(dataset, "C#{i}");
|
82
88
|
if rows > 0
|
83
|
-
platform_entries / rows
|
89
|
+
platform_entries.to_f / rows
|
84
90
|
else
|
85
91
|
nil
|
86
92
|
end
|
87
93
|
}
|
88
94
|
|
89
95
|
# Get experiment positions and scale them
|
90
|
-
experiment_x_gene = gene_positions.
|
96
|
+
experiment_x_gene = gene_positions.values_at(*matched).transpose
|
91
97
|
experiments.each_with_index{|experiment, i|
|
92
98
|
next if scale[i].nil? || experiment_x_gene[i].nil?
|
93
99
|
values = experiment_x_gene[i].collect{|v| v.nil? ? nil : (v.to_f * scale[i]).to_i}
|
@@ -100,8 +106,8 @@ module MADB
|
|
100
106
|
def self.dataset_positions(dataset, genes)
|
101
107
|
return [{},[],0] if genes.empty?
|
102
108
|
|
103
|
-
genes = genes.collect{|gene| gene.downcase.strip}
|
104
|
-
platform_entries = platform_entries(dataset
|
109
|
+
genes = genes.collect{|gene| gene.to_s.downcase.strip}
|
110
|
+
platform_entries = platform_entries(dataset)
|
105
111
|
|
106
112
|
load_positions(dataset, genes, platform_entries)
|
107
113
|
end
|
@@ -111,7 +117,7 @@ module MADB
|
|
111
117
|
return [{},[],0] if genes.empty?
|
112
118
|
|
113
119
|
genes = genes.collect {|gene| gene.downcase.strip }
|
114
|
-
platform_entries = platform_entries(platform)
|
120
|
+
platform_entries = platform_entries(platform)
|
115
121
|
|
116
122
|
cross_platform = MARQ::Platform.is_cross_platform? platform
|
117
123
|
datasets = MARQ::Platform.datasets(platform).sort
|
@@ -120,7 +126,7 @@ module MADB
|
|
120
126
|
total_matched = []
|
121
127
|
|
122
128
|
datasets.each do |dataset|
|
123
|
-
dataset
|
129
|
+
dataset = MARQ::Name.cross_platform dataset if cross_platform
|
124
130
|
data, matched = load_positions(dataset, genes, platform_entries)
|
125
131
|
total_data = total_data.merge(data)
|
126
132
|
total_matched += matched
|
data/lib/MARQ/annotations.rb
CHANGED
@@ -299,7 +299,7 @@ double hypergeometric(double total, double support, double list, double found)
|
|
299
299
|
def self.dataset_annotations(dataset, type, experiment)
|
300
300
|
annotation_dir = File.join(MARQ.datadir, (MARQ::Dataset.is_GEO?(dataset) ? 'GEO' : 'CustomDS'), 'annotations')
|
301
301
|
|
302
|
-
term_file = File.join(annotation_dir, type, MARQ::
|
302
|
+
term_file = File.join(annotation_dir, type, MARQ::Name.clean(dataset))
|
303
303
|
|
304
304
|
if File.exist? term_file
|
305
305
|
@@terms_cache[term_file] ||= YAML::load(File.open(term_file))
|
@@ -331,9 +331,9 @@ double hypergeometric(double total, double support, double list, double found)
|
|
331
331
|
when side.nil?
|
332
332
|
experiment_type = type
|
333
333
|
when side == :direct && info[:score] >= 0 || side == :inverse && info[:score] < 0
|
334
|
-
experiment_type
|
334
|
+
experiment_type = type + '_up'
|
335
335
|
else
|
336
|
-
experiment_type
|
336
|
+
experiment_type = type + '_down'
|
337
337
|
end
|
338
338
|
|
339
339
|
annot[experiment] = dataset_annotations(dataset, experiment_type, name)
|
data/lib/MARQ/main.rb
CHANGED
@@ -3,21 +3,39 @@ require 'MARQ/MADB'
|
|
3
3
|
require 'MARQ/score'
|
4
4
|
|
5
5
|
module MARQ
|
6
|
+
module Name
|
7
|
+
def self.clean(name)
|
8
|
+
name.sub(/_cross_platform/,'') unless name.nil?
|
9
|
+
end
|
10
|
+
|
11
|
+
def self.cross_platform(name)
|
12
|
+
if name =~ /_cross_platform/
|
13
|
+
name
|
14
|
+
else
|
15
|
+
name + "_cross_platform"
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def self.is_cross_platform?(name)
|
20
|
+
! name.match(/_cross_platform$/).nil?
|
21
|
+
end
|
22
|
+
|
23
|
+
def self.is_ratio?(name)
|
24
|
+
! name.match(/\[ratio\]$/).nil?
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
6
28
|
module Platform
|
7
29
|
def self.is_GEO?(platform)
|
8
30
|
! platform.match(/^GPL/).nil?
|
9
31
|
end
|
10
32
|
|
11
33
|
def self.is_cross_platform?(platform)
|
12
|
-
|
13
|
-
end
|
14
|
-
|
15
|
-
def self.clean(name)
|
16
|
-
name.sub(/_cross_platform/,'') unless name.nil?
|
34
|
+
MARQ::Name.is_cross_platform? platform
|
17
35
|
end
|
18
36
|
|
19
37
|
def self.path(platform)
|
20
|
-
platform = clean(platform)
|
38
|
+
platform = MARQ::Name.clean(platform)
|
21
39
|
if is_GEO? platform
|
22
40
|
GEO.platform_path(platform)
|
23
41
|
else
|
@@ -25,6 +43,10 @@ module MARQ
|
|
25
43
|
end
|
26
44
|
end
|
27
45
|
|
46
|
+
def self.exists?(platform)
|
47
|
+
path(platform) != nil
|
48
|
+
end
|
49
|
+
|
28
50
|
def self.has_cross_platform?(platform)
|
29
51
|
File.exists? File.join(path(platform), 'cross_platform')
|
30
52
|
end
|
@@ -38,17 +60,17 @@ module MARQ
|
|
38
60
|
end
|
39
61
|
|
40
62
|
def self.codes(platform)
|
41
|
-
platform = clean(platform)
|
63
|
+
platform = MARQ::Name.clean(platform)
|
42
64
|
Open.read(File.join(path(platform), 'codes')).scan(/[^\s]+/)
|
43
65
|
end
|
44
66
|
|
45
67
|
def self.cross_platform(platform)
|
46
|
-
platform = clean(platform)
|
68
|
+
platform = MARQ::Name.clean(platform)
|
47
69
|
Open.read(File.join(path(platform), 'cross_platform')).scan(/[^\s]+/)
|
48
70
|
end
|
49
71
|
|
50
72
|
def self.organism(platform)
|
51
|
-
platform = clean(platform)
|
73
|
+
platform = MARQ::Name.clean(platform)
|
52
74
|
if is_GEO? platform
|
53
75
|
GEO.platform_organism platform
|
54
76
|
else
|
@@ -57,7 +79,7 @@ module MARQ
|
|
57
79
|
end
|
58
80
|
|
59
81
|
def self.process(platform)
|
60
|
-
platform = clean(platform)
|
82
|
+
platform = MARQ::Name.clean(platform)
|
61
83
|
if is_GEO? platform
|
62
84
|
GEO.process_platform(platform)
|
63
85
|
else
|
@@ -79,10 +101,6 @@ module MARQ
|
|
79
101
|
! dataset.match(/^(?:GDS|GSE)/).nil?
|
80
102
|
end
|
81
103
|
|
82
|
-
def self.clean(name)
|
83
|
-
name.sub(/_cross_platform/,'') if name
|
84
|
-
end
|
85
|
-
|
86
104
|
def self.path(platform)
|
87
105
|
if is_GEO? platform
|
88
106
|
GEO.dataset_path(platform)
|
@@ -92,21 +110,34 @@ module MARQ
|
|
92
110
|
end
|
93
111
|
|
94
112
|
def self.exists?(dataset)
|
95
|
-
|
113
|
+
path = path(dataset)
|
114
|
+
if path.nil?
|
115
|
+
return false
|
116
|
+
else
|
117
|
+
return File.exists?(path + '.orders')
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
121
|
+
def self.broken?(dataset)
|
122
|
+
path = path(dataset)
|
123
|
+
|
124
|
+
return false if path.nil?
|
125
|
+
|
126
|
+
if File.exists?(path + '.skip')
|
127
|
+
return true
|
128
|
+
else
|
129
|
+
return false
|
130
|
+
end
|
96
131
|
end
|
97
132
|
|
98
133
|
def self.is_cross_platform?(dataset)
|
99
|
-
|
134
|
+
MARQ::Name.is_cross_platform? dataset
|
100
135
|
end
|
101
136
|
|
102
137
|
def self.has_cross_platform?(dataset)
|
103
138
|
File.exists?(path(dataset) + '_cross_platform.orders')
|
104
139
|
end
|
105
140
|
|
106
|
-
def self.exists?(dataset)
|
107
|
-
path(dataset) != nil
|
108
|
-
end
|
109
|
-
|
110
141
|
def self.info(name)
|
111
142
|
begin
|
112
143
|
title, description = Open.read(path(name) + '.description').split(/\n--\n/).values_at(0,1)
|
@@ -159,10 +190,12 @@ module MARQ
|
|
159
190
|
end
|
160
191
|
|
161
192
|
def self.read_values_t(dataset, file)
|
193
|
+
experiments = experiments(dataset).reject{|experiment| MARQ::Name.is_ratio? experiment }
|
194
|
+
|
195
|
+
return {} if experiments.empty?
|
196
|
+
|
162
197
|
result = {}
|
163
198
|
|
164
|
-
experiments = experiments(dataset).select{|experiment| experiment !~ /\[ratio\]$/ }
|
165
|
-
return {} if experiments.empty?
|
166
199
|
experiments.each{|experiment| result[experiment] = [] }
|
167
200
|
|
168
201
|
read_file(dataset, file).split(/\n/).each do |line|
|
@@ -175,7 +208,7 @@ module MARQ
|
|
175
208
|
|
176
209
|
|
177
210
|
def self.experiments(dataset)
|
178
|
-
read_file(dataset, 'experiments').split(/\n/)
|
211
|
+
read_file(dataset, 'experiments').split(/\n/).collect{|exp| exp.strip }
|
179
212
|
end
|
180
213
|
|
181
214
|
def self.codes(dataset)
|
@@ -198,29 +231,40 @@ module MARQ
|
|
198
231
|
read_values_t(dataset, 't')
|
199
232
|
end
|
200
233
|
|
234
|
+
def self.codes_for(dataset, type, experiment)
|
235
|
+
codes = codes(dataset)
|
236
|
+
values = send(type, dataset)[experiment]
|
237
|
+
Hash[*codes.zip(values).reject{|p| p.last.nil? }.flatten]
|
238
|
+
end
|
239
|
+
|
201
240
|
end
|
202
241
|
|
203
242
|
module RankQuery
|
204
243
|
def self.complete_positions(positions, matched, genes)
|
244
|
+
matched = matched.collect{|gene| gene.strip.downcase}
|
245
|
+
genes = genes.collect{|gene| gene.strip.downcase}
|
246
|
+
|
205
247
|
pos = Hash[*matched.zip(positions).flatten]
|
248
|
+
|
206
249
|
complete = genes.collect{|gene|
|
207
|
-
gene = gene.downcase.strip
|
208
250
|
if matched.include? gene
|
209
251
|
pos[gene] || "MISSING"
|
210
252
|
else
|
211
253
|
"NOT IN PLATFORM"
|
212
254
|
end
|
213
255
|
}
|
256
|
+
complete
|
214
257
|
end
|
215
258
|
|
216
259
|
|
217
260
|
def self.position_scores(up, down, positions_up, positions_down, platform_entries, matched_up, matched_down, missing_up, missing_down)
|
218
261
|
scores = []
|
262
|
+
|
219
263
|
positions_up.keys.each do |experiment|
|
220
264
|
score = Score.score_up_down(positions_up[experiment], positions_down[experiment], platform_entries, missing_up, missing_down)
|
221
265
|
score[:total_entries] = platform_entries
|
222
|
-
score[:positions_up] = complete_positions(positions_up[experiment], matched_up, up) if up.any?
|
223
|
-
score[:positions_down] = complete_positions(positions_down[experiment], matched_down, down) if down.any?
|
266
|
+
score[:positions_up] = complete_positions(positions_up[experiment] || [], matched_up, up) if up.any?
|
267
|
+
score[:positions_down] = complete_positions(positions_down[experiment] || [], matched_down, down) if down.any?
|
224
268
|
scores << score
|
225
269
|
end
|
226
270
|
|
@@ -248,12 +292,27 @@ module MARQ
|
|
248
292
|
positions_up, matched_up, platform_entries = MADB.platform_positions(platform, up)
|
249
293
|
missing_up = up.length - matched_up.length
|
250
294
|
|
295
|
+
|
251
296
|
positions_down, matched_down = MADB.platform_positions(platform, down)
|
252
297
|
missing_down = down.length - matched_down.length
|
253
298
|
|
254
299
|
position_scores(up, down, positions_up, positions_down, platform_entries, matched_up, matched_down, missing_up, missing_down)
|
255
300
|
end
|
256
301
|
|
302
|
+
def self.organism_scores(organism, up, down)
|
303
|
+
platforms = MARQ::Platform.organism_platforms(organism).
|
304
|
+
select {|p| MARQ::Platform.has_cross_platform? p }.
|
305
|
+
collect {|p| MARQ::Name.cross_platform p }
|
306
|
+
|
307
|
+
total_scores = {}
|
308
|
+
platforms.each do |platform|
|
309
|
+
scores = platform_scores(platform, up, down)
|
310
|
+
total_scores.merge!(scores)
|
311
|
+
end
|
312
|
+
|
313
|
+
total_scores
|
314
|
+
end
|
315
|
+
|
257
316
|
end
|
258
317
|
end
|
259
318
|
|
data/lib/MARQ/rankproduct.rb
CHANGED
@@ -17,7 +17,7 @@ module RankProduct
|
|
17
17
|
orders = MARQ::Dataset.orders(dataset)[experiment.strip]
|
18
18
|
|
19
19
|
if invert
|
20
|
-
num_genes =
|
20
|
+
num_genes = codes.length + 1
|
21
21
|
orders.collect! {|pos| pos.nil? ? nil : num_genes - pos }
|
22
22
|
end
|
23
23
|
|
@@ -31,7 +31,7 @@ module RankProduct
|
|
31
31
|
log_sizes = signature_sizes.collect{|size| Math::log(size)}
|
32
32
|
gene_ranks.each{|gene, positions|
|
33
33
|
scores[gene] = positions.zip(log_sizes).
|
34
|
-
collect{|p| Math::log(p[0]) - p[1]}.
|
34
|
+
collect{|p| Math::log(p[0]) - p[1]}. # Take log and substract from size (normalize)
|
35
35
|
inject(0){|acc, v| acc += v }
|
36
36
|
}
|
37
37
|
scores
|
@@ -70,6 +70,7 @@ module RankProduct
|
|
70
70
|
:cross_platform => false,
|
71
71
|
}.merge(options).values_at(:invert, :from_FC, :cross_platform)
|
72
72
|
|
73
|
+
# Gather gene ranks from signatures
|
73
74
|
ranks = {}
|
74
75
|
signatures.each{|signature|
|
75
76
|
dataset, experiment = signature.match(/^([^\:]*): (.*)/).values_at(1,2)
|
@@ -77,6 +78,7 @@ module RankProduct
|
|
77
78
|
ranks[signature] = self.ranks(dataset, experiment, from_FC, invert.include?(signature))
|
78
79
|
}
|
79
80
|
|
81
|
+
# Invert the hash, from signature keys to gene keys
|
80
82
|
gene_ranks = {}
|
81
83
|
sizes = []
|
82
84
|
ranks.each{|signature, orders|
|
@@ -88,32 +90,36 @@ module RankProduct
|
|
88
90
|
}
|
89
91
|
}
|
90
92
|
|
93
|
+
# Remove incomplete genes
|
91
94
|
gene_ranks.delete_if{|code, positions| positions.length != signatures.uniq.length}
|
92
95
|
|
96
|
+
# Compute scores
|
93
97
|
scores = score(gene_ranks, sizes)
|
94
|
-
num_permutations = 50000
|
95
98
|
|
99
|
+
# Compute permutations
|
100
|
+
num_permutations = 50000
|
96
101
|
permutation_scores = permutations(sizes.length, num_permutations)
|
97
|
-
|
98
102
|
permutation_scores = permutation_scores.sort
|
99
103
|
|
100
104
|
|
105
|
+
# Compute p-values from permutations
|
101
106
|
results = {}
|
102
|
-
scores.each{|gene, score|
|
107
|
+
scores.each {|gene, score|
|
103
108
|
pos = permutation_scores.count_smaller(score)
|
104
109
|
results[gene] = [score, pos.to_f / num_permutations]
|
105
110
|
}
|
106
111
|
|
107
|
-
|
112
|
+
# Complete the information with pfp
|
108
113
|
num_genes = results.length
|
109
|
-
results.sort{|a,b|
|
114
|
+
results.sort {|a,b|
|
110
115
|
a[1][0] <=> b[1][0]
|
111
116
|
}.each_with_index{|p,i|
|
112
|
-
gene = p[0]
|
113
117
|
info = p[1]
|
114
118
|
pvalue = info[1]
|
119
|
+
|
115
120
|
pfp = pvalue * num_genes / (i + 1)
|
116
121
|
info << pfp
|
122
|
+
|
117
123
|
}
|
118
124
|
|
119
125
|
results
|