rbbt-marq 2.0.0 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -36,12 +36,11 @@ module ID
36
36
 
37
37
  other = codes[field]
38
38
  next if other.nil? || other == ""
39
-
40
39
 
41
40
  #codes.collect{|c| c.split("|")}.flatten.compact.select{|c| c != ""}.uniq.each{|code|
42
41
  other.split("|").each{|code|
43
42
  begin
44
- DBcache.fast_add(tablename, code.downcase, [native])
43
+ DBcache.fast_add(tablename, code.strip.downcase, [native])
45
44
  rescue
46
45
  puts $!.message
47
46
  end
@@ -8,50 +8,57 @@ module MADB
8
8
  # {{{ Saving Positions
9
9
 
10
10
  # Save the actual data, cross_platform or not
11
- def self.save_dataset_instance(dataset, cross_platform)
12
- dataset += '_cross_platform' if cross_platform
13
- prefix = MARQ::Dataset.path(dataset)
11
+ def self.save_dataset_instance(dataset)
14
12
 
15
- # Save codes
16
- codes = File.open(prefix + '.codes').collect{|l| l.chomp.downcase}
17
- experiments = File.open(prefix + '.experiments').collect{|l| l.chomp}
18
- orders = File.open(prefix + '.orders').collect{|l| values = l.chomp.split(/\t/).collect{|v| v == "NA" ? nil : v.to_i };}
13
+ # Get info
14
+ codes = MARQ::Dataset.codes(dataset);
15
+ experiments = MARQ::Dataset.experiments(dataset);
16
+ orders = MARQ::Dataset.orders(dataset).values_at(*experiments).transpose;
19
17
 
20
18
  # Save codes and experiments
21
19
  DBcache.save(dataset + '_codes', codes)
22
20
  DBcache.save(dataset + '_experiments', experiments)
23
21
 
24
- # Save orders
22
+ # Assign orders to codes
25
23
  data = {}
26
24
  codes.each_with_index{|code,i|
27
- data[code.to_sym] = orders[i]
25
+ data[code] = orders[i]
28
26
  }
27
+
28
+ # Save orders
29
29
  case
30
30
  when codes.length < 65535
31
31
  type = "SMALLINT UNSIGNED"
32
32
  when codes.length < 16777215
33
- type = "MEDIUMIN UNSIGNED"
33
+ type = "MEDIUMINT UNSIGNED"
34
34
  else
35
35
  type = "INT UNSIGNED"
36
36
  end
37
-
38
37
  DBcache.save(dataset, data, [type] * orders.first.length)
38
+
39
39
  end
40
40
 
41
41
  # Save dataset, all instances, cross_platform if available.
42
42
  def self.save_dataset(dataset)
43
- save_dataset_instance(dataset, false)
44
- save_dataset_instance(dataset, true) if MARQ::Dataset.has_cross_platform?(dataset)
43
+ save_dataset_instance(dataset)
44
+ save_dataset_instance(MARQ::Name.cross_platform(dataset)) if MARQ::Dataset.has_cross_platform?(dataset)
45
45
  nil
46
46
  end
47
+
48
+ def self.save_platform_instance(platform)
49
+ DBcache.save(platform + '_codes',
50
+ MARQ::Platform.is_cross_platform?(platform) ?
51
+ MARQ::Platform.cross_platform(platform) :
52
+ MARQ::Platform.codes(platform))
53
+ end
47
54
 
48
55
  def self.save_platform(platform)
49
56
  datasets = MARQ::Platform.datasets(platform).sort
50
57
  return if datasets.empty?
51
58
 
52
- DBcache.save(platform + '_codes', MARQ::Platform.codes(platform))
53
- DBcache.save(platform + '_codes', MARQ::Platform.cross_platform(platform)) if MARQ::Platform.has_cross_platform? platform
54
-
59
+ save_platform_instance(platform)
60
+ save_platform_instance(MARQ::Name.cross_platform(platform)) if MARQ::Platform.has_cross_platform?(platform)
61
+
55
62
  datasets.sort.each do |dataset|
56
63
  save_dataset(dataset)
57
64
  end
@@ -60,10 +67,9 @@ module MADB
60
67
  # {{{ Loading Positions
61
68
 
62
69
  def self.platform_entries(platform)
63
- DBcache.num_rows(platform).to_i
70
+ DBcache.num_rows(platform + '_codes')
64
71
  end
65
72
 
66
-
67
73
  def self.load_positions(dataset, genes, platform_entries)
68
74
  gene_positions = DBcache.load(dataset, genes)
69
75
  data = {}
@@ -73,21 +79,21 @@ module MADB
73
79
  experiments = DBcache.load(dataset + '_experiments').sort{|a,b|
74
80
  a[0].to_i <=> b[0].to_i
75
81
  }.collect{|p|
76
- MARQ::Dataset.clean(dataset) + ": " + p[1].first
82
+ MARQ::Name.clean(dataset) + ": " + p[1].first
77
83
  }
78
84
 
79
85
  # Get scale factors (to account for genes missing in the dataset)
80
86
  scale = (0..experiments.length - 1).collect{|i|
81
87
  rows = DBcache.num_rows(dataset, "C#{i}");
82
88
  if rows > 0
83
- platform_entries / rows
89
+ platform_entries.to_f / rows
84
90
  else
85
91
  nil
86
92
  end
87
93
  }
88
94
 
89
95
  # Get experiment positions and scale them
90
- experiment_x_gene = gene_positions.values.transpose
96
+ experiment_x_gene = gene_positions.values_at(*matched).transpose
91
97
  experiments.each_with_index{|experiment, i|
92
98
  next if scale[i].nil? || experiment_x_gene[i].nil?
93
99
  values = experiment_x_gene[i].collect{|v| v.nil? ? nil : (v.to_f * scale[i]).to_i}
@@ -100,8 +106,8 @@ module MADB
100
106
  def self.dataset_positions(dataset, genes)
101
107
  return [{},[],0] if genes.empty?
102
108
 
103
- genes = genes.collect{|gene| gene.downcase.strip}
104
- platform_entries = platform_entries(dataset + '_codes').to_f
109
+ genes = genes.collect{|gene| gene.to_s.downcase.strip}
110
+ platform_entries = platform_entries(dataset)
105
111
 
106
112
  load_positions(dataset, genes, platform_entries)
107
113
  end
@@ -111,7 +117,7 @@ module MADB
111
117
  return [{},[],0] if genes.empty?
112
118
 
113
119
  genes = genes.collect {|gene| gene.downcase.strip }
114
- platform_entries = platform_entries(platform).to_f
120
+ platform_entries = platform_entries(platform)
115
121
 
116
122
  cross_platform = MARQ::Platform.is_cross_platform? platform
117
123
  datasets = MARQ::Platform.datasets(platform).sort
@@ -120,7 +126,7 @@ module MADB
120
126
  total_matched = []
121
127
 
122
128
  datasets.each do |dataset|
123
- dataset << '_cross_platform' if cross_platform
129
+ dataset = MARQ::Name.cross_platform dataset if cross_platform
124
130
  data, matched = load_positions(dataset, genes, platform_entries)
125
131
  total_data = total_data.merge(data)
126
132
  total_matched += matched
@@ -299,7 +299,7 @@ double hypergeometric(double total, double support, double list, double found)
299
299
  def self.dataset_annotations(dataset, type, experiment)
300
300
  annotation_dir = File.join(MARQ.datadir, (MARQ::Dataset.is_GEO?(dataset) ? 'GEO' : 'CustomDS'), 'annotations')
301
301
 
302
- term_file = File.join(annotation_dir, type, MARQ::Dataset.clean(dataset))
302
+ term_file = File.join(annotation_dir, type, MARQ::Name.clean(dataset))
303
303
 
304
304
  if File.exist? term_file
305
305
  @@terms_cache[term_file] ||= YAML::load(File.open(term_file))
@@ -331,9 +331,9 @@ double hypergeometric(double total, double support, double list, double found)
331
331
  when side.nil?
332
332
  experiment_type = type
333
333
  when side == :direct && info[:score] >= 0 || side == :inverse && info[:score] < 0
334
- experiment_type += '_up'
334
+ experiment_type = type + '_up'
335
335
  else
336
- experiment_type += '_down'
336
+ experiment_type = type + '_down'
337
337
  end
338
338
 
339
339
  annot[experiment] = dataset_annotations(dataset, experiment_type, name)
@@ -3,21 +3,39 @@ require 'MARQ/MADB'
3
3
  require 'MARQ/score'
4
4
 
5
5
  module MARQ
6
+ module Name
7
+ def self.clean(name)
8
+ name.sub(/_cross_platform/,'') unless name.nil?
9
+ end
10
+
11
+ def self.cross_platform(name)
12
+ if name =~ /_cross_platform/
13
+ name
14
+ else
15
+ name + "_cross_platform"
16
+ end
17
+ end
18
+
19
+ def self.is_cross_platform?(name)
20
+ ! name.match(/_cross_platform$/).nil?
21
+ end
22
+
23
+ def self.is_ratio?(name)
24
+ ! name.match(/\[ratio\]$/).nil?
25
+ end
26
+ end
27
+
6
28
  module Platform
7
29
  def self.is_GEO?(platform)
8
30
  ! platform.match(/^GPL/).nil?
9
31
  end
10
32
 
11
33
  def self.is_cross_platform?(platform)
12
- ! platform.match(/_cross_platform$/).nil?
13
- end
14
-
15
- def self.clean(name)
16
- name.sub(/_cross_platform/,'') unless name.nil?
34
+ MARQ::Name.is_cross_platform? platform
17
35
  end
18
36
 
19
37
  def self.path(platform)
20
- platform = clean(platform)
38
+ platform = MARQ::Name.clean(platform)
21
39
  if is_GEO? platform
22
40
  GEO.platform_path(platform)
23
41
  else
@@ -25,6 +43,10 @@ module MARQ
25
43
  end
26
44
  end
27
45
 
46
+ def self.exists?(platform)
47
+ path(platform) != nil
48
+ end
49
+
28
50
  def self.has_cross_platform?(platform)
29
51
  File.exists? File.join(path(platform), 'cross_platform')
30
52
  end
@@ -38,17 +60,17 @@ module MARQ
38
60
  end
39
61
 
40
62
  def self.codes(platform)
41
- platform = clean(platform)
63
+ platform = MARQ::Name.clean(platform)
42
64
  Open.read(File.join(path(platform), 'codes')).scan(/[^\s]+/)
43
65
  end
44
66
 
45
67
  def self.cross_platform(platform)
46
- platform = clean(platform)
68
+ platform = MARQ::Name.clean(platform)
47
69
  Open.read(File.join(path(platform), 'cross_platform')).scan(/[^\s]+/)
48
70
  end
49
71
 
50
72
  def self.organism(platform)
51
- platform = clean(platform)
73
+ platform = MARQ::Name.clean(platform)
52
74
  if is_GEO? platform
53
75
  GEO.platform_organism platform
54
76
  else
@@ -57,7 +79,7 @@ module MARQ
57
79
  end
58
80
 
59
81
  def self.process(platform)
60
- platform = clean(platform)
82
+ platform = MARQ::Name.clean(platform)
61
83
  if is_GEO? platform
62
84
  GEO.process_platform(platform)
63
85
  else
@@ -79,10 +101,6 @@ module MARQ
79
101
  ! dataset.match(/^(?:GDS|GSE)/).nil?
80
102
  end
81
103
 
82
- def self.clean(name)
83
- name.sub(/_cross_platform/,'') if name
84
- end
85
-
86
104
  def self.path(platform)
87
105
  if is_GEO? platform
88
106
  GEO.dataset_path(platform)
@@ -92,21 +110,34 @@ module MARQ
92
110
  end
93
111
 
94
112
  def self.exists?(dataset)
95
- ! path(dataset).nil?
113
+ path = path(dataset)
114
+ if path.nil?
115
+ return false
116
+ else
117
+ return File.exists?(path + '.orders')
118
+ end
119
+ end
120
+
121
+ def self.broken?(dataset)
122
+ path = path(dataset)
123
+
124
+ return false if path.nil?
125
+
126
+ if File.exists?(path + '.skip')
127
+ return true
128
+ else
129
+ return false
130
+ end
96
131
  end
97
132
 
98
133
  def self.is_cross_platform?(dataset)
99
- ! dataset.match(/_cross_platform$/).nil?
134
+ MARQ::Name.is_cross_platform? dataset
100
135
  end
101
136
 
102
137
  def self.has_cross_platform?(dataset)
103
138
  File.exists?(path(dataset) + '_cross_platform.orders')
104
139
  end
105
140
 
106
- def self.exists?(dataset)
107
- path(dataset) != nil
108
- end
109
-
110
141
  def self.info(name)
111
142
  begin
112
143
  title, description = Open.read(path(name) + '.description').split(/\n--\n/).values_at(0,1)
@@ -159,10 +190,12 @@ module MARQ
159
190
  end
160
191
 
161
192
  def self.read_values_t(dataset, file)
193
+ experiments = experiments(dataset).reject{|experiment| MARQ::Name.is_ratio? experiment }
194
+
195
+ return {} if experiments.empty?
196
+
162
197
  result = {}
163
198
 
164
- experiments = experiments(dataset).select{|experiment| experiment !~ /\[ratio\]$/ }
165
- return {} if experiments.empty?
166
199
  experiments.each{|experiment| result[experiment] = [] }
167
200
 
168
201
  read_file(dataset, file).split(/\n/).each do |line|
@@ -175,7 +208,7 @@ module MARQ
175
208
 
176
209
 
177
210
  def self.experiments(dataset)
178
- read_file(dataset, 'experiments').split(/\n/)
211
+ read_file(dataset, 'experiments').split(/\n/).collect{|exp| exp.strip }
179
212
  end
180
213
 
181
214
  def self.codes(dataset)
@@ -198,29 +231,40 @@ module MARQ
198
231
  read_values_t(dataset, 't')
199
232
  end
200
233
 
234
+ def self.codes_for(dataset, type, experiment)
235
+ codes = codes(dataset)
236
+ values = send(type, dataset)[experiment]
237
+ Hash[*codes.zip(values).reject{|p| p.last.nil? }.flatten]
238
+ end
239
+
201
240
  end
202
241
 
203
242
  module RankQuery
204
243
  def self.complete_positions(positions, matched, genes)
244
+ matched = matched.collect{|gene| gene.strip.downcase}
245
+ genes = genes.collect{|gene| gene.strip.downcase}
246
+
205
247
  pos = Hash[*matched.zip(positions).flatten]
248
+
206
249
  complete = genes.collect{|gene|
207
- gene = gene.downcase.strip
208
250
  if matched.include? gene
209
251
  pos[gene] || "MISSING"
210
252
  else
211
253
  "NOT IN PLATFORM"
212
254
  end
213
255
  }
256
+ complete
214
257
  end
215
258
 
216
259
 
217
260
  def self.position_scores(up, down, positions_up, positions_down, platform_entries, matched_up, matched_down, missing_up, missing_down)
218
261
  scores = []
262
+
219
263
  positions_up.keys.each do |experiment|
220
264
  score = Score.score_up_down(positions_up[experiment], positions_down[experiment], platform_entries, missing_up, missing_down)
221
265
  score[:total_entries] = platform_entries
222
- score[:positions_up] = complete_positions(positions_up[experiment], matched_up, up) if up.any?
223
- score[:positions_down] = complete_positions(positions_down[experiment], matched_down, down) if down.any?
266
+ score[:positions_up] = complete_positions(positions_up[experiment] || [], matched_up, up) if up.any?
267
+ score[:positions_down] = complete_positions(positions_down[experiment] || [], matched_down, down) if down.any?
224
268
  scores << score
225
269
  end
226
270
 
@@ -248,12 +292,27 @@ module MARQ
248
292
  positions_up, matched_up, platform_entries = MADB.platform_positions(platform, up)
249
293
  missing_up = up.length - matched_up.length
250
294
 
295
+
251
296
  positions_down, matched_down = MADB.platform_positions(platform, down)
252
297
  missing_down = down.length - matched_down.length
253
298
 
254
299
  position_scores(up, down, positions_up, positions_down, platform_entries, matched_up, matched_down, missing_up, missing_down)
255
300
  end
256
301
 
302
+ def self.organism_scores(organism, up, down)
303
+ platforms = MARQ::Platform.organism_platforms(organism).
304
+ select {|p| MARQ::Platform.has_cross_platform? p }.
305
+ collect {|p| MARQ::Name.cross_platform p }
306
+
307
+ total_scores = {}
308
+ platforms.each do |platform|
309
+ scores = platform_scores(platform, up, down)
310
+ total_scores.merge!(scores)
311
+ end
312
+
313
+ total_scores
314
+ end
315
+
257
316
  end
258
317
  end
259
318
 
@@ -17,7 +17,7 @@ module RankProduct
17
17
  orders = MARQ::Dataset.orders(dataset)[experiment.strip]
18
18
 
19
19
  if invert
20
- num_genes = orders.length
20
+ num_genes = codes.length + 1
21
21
  orders.collect! {|pos| pos.nil? ? nil : num_genes - pos }
22
22
  end
23
23
 
@@ -31,7 +31,7 @@ module RankProduct
31
31
  log_sizes = signature_sizes.collect{|size| Math::log(size)}
32
32
  gene_ranks.each{|gene, positions|
33
33
  scores[gene] = positions.zip(log_sizes).
34
- collect{|p| Math::log(p[0]) - p[1]}.
34
+ collect{|p| Math::log(p[0]) - p[1]}. # Take the log of each position and subtract the log of the signature size (normalize)
35
35
  inject(0){|acc, v| acc += v }
36
36
  }
37
37
  scores
@@ -70,6 +70,7 @@ module RankProduct
70
70
  :cross_platform => false,
71
71
  }.merge(options).values_at(:invert, :from_FC, :cross_platform)
72
72
 
73
+ # Gather gene ranks from signatures
73
74
  ranks = {}
74
75
  signatures.each{|signature|
75
76
  dataset, experiment = signature.match(/^([^\:]*): (.*)/).values_at(1,2)
@@ -77,6 +78,7 @@ module RankProduct
77
78
  ranks[signature] = self.ranks(dataset, experiment, from_FC, invert.include?(signature))
78
79
  }
79
80
 
81
+ # Invert the hash, from signature keys to gene keys
80
82
  gene_ranks = {}
81
83
  sizes = []
82
84
  ranks.each{|signature, orders|
@@ -88,32 +90,36 @@ module RankProduct
88
90
  }
89
91
  }
90
92
 
93
+ # Remove incomplete genes
91
94
  gene_ranks.delete_if{|code, positions| positions.length != signatures.uniq.length}
92
95
 
96
+ # Compute scores
93
97
  scores = score(gene_ranks, sizes)
94
- num_permutations = 50000
95
98
 
99
+ # Compute permutations
100
+ num_permutations = 50000
96
101
  permutation_scores = permutations(sizes.length, num_permutations)
97
-
98
102
  permutation_scores = permutation_scores.sort
99
103
 
100
104
 
105
+ # Compute p-values from permutations
101
106
  results = {}
102
- scores.each{|gene, score|
107
+ scores.each {|gene, score|
103
108
  pos = permutation_scores.count_smaller(score)
104
109
  results[gene] = [score, pos.to_f / num_permutations]
105
110
  }
106
111
 
107
-
112
+ # Complete the information with pfp
108
113
  num_genes = results.length
109
- results.sort{|a,b|
114
+ results.sort {|a,b|
110
115
  a[1][0] <=> b[1][0]
111
116
  }.each_with_index{|p,i|
112
- gene = p[0]
113
117
  info = p[1]
114
118
  pvalue = info[1]
119
+
115
120
  pfp = pvalue * num_genes / (i + 1)
116
121
  info << pfp
122
+
117
123
  }
118
124
 
119
125
  results