rbbt-marq 2.0.0 → 2.1.0

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
@@ -36,12 +36,11 @@ module ID
36
36
 
37
37
  other = codes[field]
38
38
  next if other.nil? || other == ""
39
-
40
39
 
41
40
  #codes.collect{|c| c.split("|")}.flatten.compact.select{|c| c != ""}.uniq.each{|code|
42
41
  other.split("|").each{|code|
43
42
  begin
44
- DBcache.fast_add(tablename, code.downcase, [native])
43
+ DBcache.fast_add(tablename, code.strip.downcase, [native])
45
44
  rescue
46
45
  puts $!.message
47
46
  end
@@ -8,50 +8,57 @@ module MADB
8
8
  # {{{ Saving Positions
9
9
 
10
10
  # Save the actual data, cross_platform or not
11
- def self.save_dataset_instance(dataset, cross_platform)
12
- dataset += '_cross_platform' if cross_platform
13
- prefix = MARQ::Dataset.path(dataset)
11
+ def self.save_dataset_instance(dataset)
14
12
 
15
- # Save codes
16
- codes = File.open(prefix + '.codes').collect{|l| l.chomp.downcase}
17
- experiments = File.open(prefix + '.experiments').collect{|l| l.chomp}
18
- orders = File.open(prefix + '.orders').collect{|l| values = l.chomp.split(/\t/).collect{|v| v == "NA" ? nil : v.to_i };}
13
+ # Get info
14
+ codes = MARQ::Dataset.codes(dataset);
15
+ experiments = MARQ::Dataset.experiments(dataset);
16
+ orders = MARQ::Dataset.orders(dataset).values_at(*experiments).transpose;
19
17
 
20
18
  # Save codes and experiments
21
19
  DBcache.save(dataset + '_codes', codes)
22
20
  DBcache.save(dataset + '_experiments', experiments)
23
21
 
24
- # Save orders
22
+ # Assign orders to codes
25
23
  data = {}
26
24
  codes.each_with_index{|code,i|
27
- data[code.to_sym] = orders[i]
25
+ data[code] = orders[i]
28
26
  }
27
+
28
+ # Save orders
29
29
  case
30
30
  when codes.length < 65535
31
31
  type = "SMALLINT UNSIGNED"
32
32
  when codes.length < 16777215
33
- type = "MEDIUMIN UNSIGNED"
33
+ type = "MEDIUMINT UNSIGNED"
34
34
  else
35
35
  type = "INT UNSIGNED"
36
36
  end
37
-
38
37
  DBcache.save(dataset, data, [type] * orders.first.length)
38
+
39
39
  end
40
40
 
41
41
  # Save dataset, all instances, cross_platform if available.
42
42
  def self.save_dataset(dataset)
43
- save_dataset_instance(dataset, false)
44
- save_dataset_instance(dataset, true) if MARQ::Dataset.has_cross_platform?(dataset)
43
+ save_dataset_instance(dataset)
44
+ save_dataset_instance(MARQ::Name.cross_platform(dataset)) if MARQ::Dataset.has_cross_platform?(dataset)
45
45
  nil
46
46
  end
47
+
48
+ def self.save_platform_instance(platform)
49
+ DBcache.save(platform + '_codes',
50
+ MARQ::Platform.is_cross_platform?(platform) ?
51
+ MARQ::Platform.cross_platform(platform) :
52
+ MARQ::Platform.codes(platform))
53
+ end
47
54
 
48
55
  def self.save_platform(platform)
49
56
  datasets = MARQ::Platform.datasets(platform).sort
50
57
  return if datasets.empty?
51
58
 
52
- DBcache.save(platform + '_codes', MARQ::Platform.codes(platform))
53
- DBcache.save(platform + '_codes', MARQ::Platform.cross_platform(platform)) if MARQ::Platform.has_cross_platform? platform
54
-
59
+ save_platform_instance(platform)
60
+ save_platform_instance(MARQ::Name.cross_platform(platform)) if MARQ::Platform.has_cross_platform?(platform)
61
+
55
62
  datasets.sort.each do |dataset|
56
63
  save_dataset(dataset)
57
64
  end
@@ -60,10 +67,9 @@ module MADB
60
67
  # {{{ Loading Positions
61
68
 
62
69
  def self.platform_entries(platform)
63
- DBcache.num_rows(platform).to_i
70
+ DBcache.num_rows(platform + '_codes')
64
71
  end
65
72
 
66
-
67
73
  def self.load_positions(dataset, genes, platform_entries)
68
74
  gene_positions = DBcache.load(dataset, genes)
69
75
  data = {}
@@ -73,21 +79,21 @@ module MADB
73
79
  experiments = DBcache.load(dataset + '_experiments').sort{|a,b|
74
80
  a[0].to_i <=> b[0].to_i
75
81
  }.collect{|p|
76
- MARQ::Dataset.clean(dataset) + ": " + p[1].first
82
+ MARQ::Name.clean(dataset) + ": " + p[1].first
77
83
  }
78
84
 
79
85
  # Get scale factors (to account for genes missing in the dataset)
80
86
  scale = (0..experiments.length - 1).collect{|i|
81
87
  rows = DBcache.num_rows(dataset, "C#{i}");
82
88
  if rows > 0
83
- platform_entries / rows
89
+ platform_entries.to_f / rows
84
90
  else
85
91
  nil
86
92
  end
87
93
  }
88
94
 
89
95
  # Get experiment positions and scale them
90
- experiment_x_gene = gene_positions.values.transpose
96
+ experiment_x_gene = gene_positions.values_at(*matched).transpose
91
97
  experiments.each_with_index{|experiment, i|
92
98
  next if scale[i].nil? || experiment_x_gene[i].nil?
93
99
  values = experiment_x_gene[i].collect{|v| v.nil? ? nil : (v.to_f * scale[i]).to_i}
@@ -100,8 +106,8 @@ module MADB
100
106
  def self.dataset_positions(dataset, genes)
101
107
  return [{},[],0] if genes.empty?
102
108
 
103
- genes = genes.collect{|gene| gene.downcase.strip}
104
- platform_entries = platform_entries(dataset + '_codes').to_f
109
+ genes = genes.collect{|gene| gene.to_s.downcase.strip}
110
+ platform_entries = platform_entries(dataset)
105
111
 
106
112
  load_positions(dataset, genes, platform_entries)
107
113
  end
@@ -111,7 +117,7 @@ module MADB
111
117
  return [{},[],0] if genes.empty?
112
118
 
113
119
  genes = genes.collect {|gene| gene.downcase.strip }
114
- platform_entries = platform_entries(platform).to_f
120
+ platform_entries = platform_entries(platform)
115
121
 
116
122
  cross_platform = MARQ::Platform.is_cross_platform? platform
117
123
  datasets = MARQ::Platform.datasets(platform).sort
@@ -120,7 +126,7 @@ module MADB
120
126
  total_matched = []
121
127
 
122
128
  datasets.each do |dataset|
123
- dataset << '_cross_platform' if cross_platform
129
+ dataset = MARQ::Name.cross_platform dataset if cross_platform
124
130
  data, matched = load_positions(dataset, genes, platform_entries)
125
131
  total_data = total_data.merge(data)
126
132
  total_matched += matched
@@ -299,7 +299,7 @@ double hypergeometric(double total, double support, double list, double found)
299
299
  def self.dataset_annotations(dataset, type, experiment)
300
300
  annotation_dir = File.join(MARQ.datadir, (MARQ::Dataset.is_GEO?(dataset) ? 'GEO' : 'CustomDS'), 'annotations')
301
301
 
302
- term_file = File.join(annotation_dir, type, MARQ::Dataset.clean(dataset))
302
+ term_file = File.join(annotation_dir, type, MARQ::Name.clean(dataset))
303
303
 
304
304
  if File.exist? term_file
305
305
  @@terms_cache[term_file] ||= YAML::load(File.open(term_file))
@@ -331,9 +331,9 @@ double hypergeometric(double total, double support, double list, double found)
331
331
  when side.nil?
332
332
  experiment_type = type
333
333
  when side == :direct && info[:score] >= 0 || side == :inverse && info[:score] < 0
334
- experiment_type += '_up'
334
+ experiment_type = type + '_up'
335
335
  else
336
- experiment_type += '_down'
336
+ experiment_type = type + '_down'
337
337
  end
338
338
 
339
339
  annot[experiment] = dataset_annotations(dataset, experiment_type, name)
@@ -3,21 +3,39 @@ require 'MARQ/MADB'
3
3
  require 'MARQ/score'
4
4
 
5
5
  module MARQ
6
+ module Name
7
+ def self.clean(name)
8
+ name.sub(/_cross_platform/,'') unless name.nil?
9
+ end
10
+
11
+ def self.cross_platform(name)
12
+ if name =~ /_cross_platform/
13
+ name
14
+ else
15
+ name + "_cross_platform"
16
+ end
17
+ end
18
+
19
+ def self.is_cross_platform?(name)
20
+ ! name.match(/_cross_platform$/).nil?
21
+ end
22
+
23
+ def self.is_ratio?(name)
24
+ ! name.match(/\[ratio\]$/).nil?
25
+ end
26
+ end
27
+
6
28
  module Platform
7
29
  def self.is_GEO?(platform)
8
30
  ! platform.match(/^GPL/).nil?
9
31
  end
10
32
 
11
33
  def self.is_cross_platform?(platform)
12
- ! platform.match(/_cross_platform$/).nil?
13
- end
14
-
15
- def self.clean(name)
16
- name.sub(/_cross_platform/,'') unless name.nil?
34
+ MARQ::Name.is_cross_platform? platform
17
35
  end
18
36
 
19
37
  def self.path(platform)
20
- platform = clean(platform)
38
+ platform = MARQ::Name.clean(platform)
21
39
  if is_GEO? platform
22
40
  GEO.platform_path(platform)
23
41
  else
@@ -25,6 +43,10 @@ module MARQ
25
43
  end
26
44
  end
27
45
 
46
+ def self.exists?(platform)
47
+ path(platform) != nil
48
+ end
49
+
28
50
  def self.has_cross_platform?(platform)
29
51
  File.exists? File.join(path(platform), 'cross_platform')
30
52
  end
@@ -38,17 +60,17 @@ module MARQ
38
60
  end
39
61
 
40
62
  def self.codes(platform)
41
- platform = clean(platform)
63
+ platform = MARQ::Name.clean(platform)
42
64
  Open.read(File.join(path(platform), 'codes')).scan(/[^\s]+/)
43
65
  end
44
66
 
45
67
  def self.cross_platform(platform)
46
- platform = clean(platform)
68
+ platform = MARQ::Name.clean(platform)
47
69
  Open.read(File.join(path(platform), 'cross_platform')).scan(/[^\s]+/)
48
70
  end
49
71
 
50
72
  def self.organism(platform)
51
- platform = clean(platform)
73
+ platform = MARQ::Name.clean(platform)
52
74
  if is_GEO? platform
53
75
  GEO.platform_organism platform
54
76
  else
@@ -57,7 +79,7 @@ module MARQ
57
79
  end
58
80
 
59
81
  def self.process(platform)
60
- platform = clean(platform)
82
+ platform = MARQ::Name.clean(platform)
61
83
  if is_GEO? platform
62
84
  GEO.process_platform(platform)
63
85
  else
@@ -79,10 +101,6 @@ module MARQ
79
101
  ! dataset.match(/^(?:GDS|GSE)/).nil?
80
102
  end
81
103
 
82
- def self.clean(name)
83
- name.sub(/_cross_platform/,'') if name
84
- end
85
-
86
104
  def self.path(platform)
87
105
  if is_GEO? platform
88
106
  GEO.dataset_path(platform)
@@ -92,21 +110,34 @@ module MARQ
92
110
  end
93
111
 
94
112
  def self.exists?(dataset)
95
- ! path(dataset).nil?
113
+ path = path(dataset)
114
+ if path.nil?
115
+ return false
116
+ else
117
+ return File.exists?(path + '.orders')
118
+ end
119
+ end
120
+
121
+ def self.broken?(dataset)
122
+ path = path(dataset)
123
+
124
+ return false if path.nil?
125
+
126
+ if File.exists?(path + '.skip')
127
+ return true
128
+ else
129
+ return false
130
+ end
96
131
  end
97
132
 
98
133
  def self.is_cross_platform?(dataset)
99
- ! dataset.match(/_cross_platform$/).nil?
134
+ MARQ::Name.is_cross_platform? dataset
100
135
  end
101
136
 
102
137
  def self.has_cross_platform?(dataset)
103
138
  File.exists?(path(dataset) + '_cross_platform.orders')
104
139
  end
105
140
 
106
- def self.exists?(dataset)
107
- path(dataset) != nil
108
- end
109
-
110
141
  def self.info(name)
111
142
  begin
112
143
  title, description = Open.read(path(name) + '.description').split(/\n--\n/).values_at(0,1)
@@ -159,10 +190,12 @@ module MARQ
159
190
  end
160
191
 
161
192
  def self.read_values_t(dataset, file)
193
+ experiments = experiments(dataset).reject{|experiment| MARQ::Name.is_ratio? experiment }
194
+
195
+ return {} if experiments.empty?
196
+
162
197
  result = {}
163
198
 
164
- experiments = experiments(dataset).select{|experiment| experiment !~ /\[ratio\]$/ }
165
- return {} if experiments.empty?
166
199
  experiments.each{|experiment| result[experiment] = [] }
167
200
 
168
201
  read_file(dataset, file).split(/\n/).each do |line|
@@ -175,7 +208,7 @@ module MARQ
175
208
 
176
209
 
177
210
  def self.experiments(dataset)
178
- read_file(dataset, 'experiments').split(/\n/)
211
+ read_file(dataset, 'experiments').split(/\n/).collect{|exp| exp.strip }
179
212
  end
180
213
 
181
214
  def self.codes(dataset)
@@ -198,29 +231,40 @@ module MARQ
198
231
  read_values_t(dataset, 't')
199
232
  end
200
233
 
234
+ def self.codes_for(dataset, type, experiment)
235
+ codes = codes(dataset)
236
+ values = send(type, dataset)[experiment]
237
+ Hash[*codes.zip(values).reject{|p| p.last.nil? }.flatten]
238
+ end
239
+
201
240
  end
202
241
 
203
242
  module RankQuery
204
243
  def self.complete_positions(positions, matched, genes)
244
+ matched = matched.collect{|gene| gene.strip.downcase}
245
+ genes = genes.collect{|gene| gene.strip.downcase}
246
+
205
247
  pos = Hash[*matched.zip(positions).flatten]
248
+
206
249
  complete = genes.collect{|gene|
207
- gene = gene.downcase.strip
208
250
  if matched.include? gene
209
251
  pos[gene] || "MISSING"
210
252
  else
211
253
  "NOT IN PLATFORM"
212
254
  end
213
255
  }
256
+ complete
214
257
  end
215
258
 
216
259
 
217
260
  def self.position_scores(up, down, positions_up, positions_down, platform_entries, matched_up, matched_down, missing_up, missing_down)
218
261
  scores = []
262
+
219
263
  positions_up.keys.each do |experiment|
220
264
  score = Score.score_up_down(positions_up[experiment], positions_down[experiment], platform_entries, missing_up, missing_down)
221
265
  score[:total_entries] = platform_entries
222
- score[:positions_up] = complete_positions(positions_up[experiment], matched_up, up) if up.any?
223
- score[:positions_down] = complete_positions(positions_down[experiment], matched_down, down) if down.any?
266
+ score[:positions_up] = complete_positions(positions_up[experiment] || [], matched_up, up) if up.any?
267
+ score[:positions_down] = complete_positions(positions_down[experiment] || [], matched_down, down) if down.any?
224
268
  scores << score
225
269
  end
226
270
 
@@ -248,12 +292,27 @@ module MARQ
248
292
  positions_up, matched_up, platform_entries = MADB.platform_positions(platform, up)
249
293
  missing_up = up.length - matched_up.length
250
294
 
295
+
251
296
  positions_down, matched_down = MADB.platform_positions(platform, down)
252
297
  missing_down = down.length - matched_down.length
253
298
 
254
299
  position_scores(up, down, positions_up, positions_down, platform_entries, matched_up, matched_down, missing_up, missing_down)
255
300
  end
256
301
 
302
+ def self.organism_scores(organism, up, down)
303
+ platforms = MARQ::Platform.organism_platforms(organism).
304
+ select {|p| MARQ::Platform.has_cross_platform? p }.
305
+ collect {|p| MARQ::Name.cross_platform p }
306
+
307
+ total_scores = {}
308
+ platforms.each do |platform|
309
+ scores = platform_scores(platform, up, down)
310
+ total_scores.merge!(scores)
311
+ end
312
+
313
+ total_scores
314
+ end
315
+
257
316
  end
258
317
  end
259
318
 
@@ -17,7 +17,7 @@ module RankProduct
17
17
  orders = MARQ::Dataset.orders(dataset)[experiment.strip]
18
18
 
19
19
  if invert
20
- num_genes = orders.length
20
+ num_genes = codes.length + 1
21
21
  orders.collect! {|pos| pos.nil? ? nil : num_genes - pos }
22
22
  end
23
23
 
@@ -31,7 +31,7 @@ module RankProduct
31
31
  log_sizes = signature_sizes.collect{|size| Math::log(size)}
32
32
  gene_ranks.each{|gene, positions|
33
33
  scores[gene] = positions.zip(log_sizes).
34
- collect{|p| Math::log(p[0]) - p[1]}.
34
+ collect{|p| Math::log(p[0]) - p[1]}. # Take log of the rank and subtract the log of the signature size (normalize)
35
35
  inject(0){|acc, v| acc += v }
36
36
  }
37
37
  scores
@@ -70,6 +70,7 @@ module RankProduct
70
70
  :cross_platform => false,
71
71
  }.merge(options).values_at(:invert, :from_FC, :cross_platform)
72
72
 
73
+ # Gather gene ranks from signatures
73
74
  ranks = {}
74
75
  signatures.each{|signature|
75
76
  dataset, experiment = signature.match(/^([^\:]*): (.*)/).values_at(1,2)
@@ -77,6 +78,7 @@ module RankProduct
77
78
  ranks[signature] = self.ranks(dataset, experiment, from_FC, invert.include?(signature))
78
79
  }
79
80
 
81
+ # Invert the hash, from signature keys to gene keys
80
82
  gene_ranks = {}
81
83
  sizes = []
82
84
  ranks.each{|signature, orders|
@@ -88,32 +90,36 @@ module RankProduct
88
90
  }
89
91
  }
90
92
 
93
+ # Remove incomplete genes
91
94
  gene_ranks.delete_if{|code, positions| positions.length != signatures.uniq.length}
92
95
 
96
+ # Compute scores
93
97
  scores = score(gene_ranks, sizes)
94
- num_permutations = 50000
95
98
 
99
+ # Compute permutations
100
+ num_permutations = 50000
96
101
  permutation_scores = permutations(sizes.length, num_permutations)
97
-
98
102
  permutation_scores = permutation_scores.sort
99
103
 
100
104
 
105
+ # Compute p-values from permutations
101
106
  results = {}
102
- scores.each{|gene, score|
107
+ scores.each {|gene, score|
103
108
  pos = permutation_scores.count_smaller(score)
104
109
  results[gene] = [score, pos.to_f / num_permutations]
105
110
  }
106
111
 
107
-
112
+ # Complete the information with pfp
108
113
  num_genes = results.length
109
- results.sort{|a,b|
114
+ results.sort {|a,b|
110
115
  a[1][0] <=> b[1][0]
111
116
  }.each_with_index{|p,i|
112
- gene = p[0]
113
117
  info = p[1]
114
118
  pvalue = info[1]
119
+
115
120
  pfp = pvalue * num_genes / (i + 1)
116
121
  info << pfp
122
+
117
123
  }
118
124
 
119
125
  results