rbbt-dm 0.0.4 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,69 +1,171 @@
1
1
  require 'png'
2
2
  require 'inline'
3
3
  require 'set'
4
+ require 'rbbt/util/misc'
4
5
 
5
6
  module RandomWalk
6
7
 
7
- class << self
8
- inline do |builder|
8
+ inline do |builder|
9
+
10
+ builder.prefix <<-EOC_CODE
11
+ #include <math.h>
12
+ #include <time.h>
13
+ //{{{ Make compatible with 1.9 and 1.8
14
+ #ifndef RUBY_19
15
+ #ifndef RFLOAT_VALUE
16
+ #define RFLOAT_VALUE(v) (RFLOAT(v)->value)
17
+ #endif
18
+ #ifndef RARRAY_PTR
19
+ #define RARRAY_PTR(v) (RARRAY(v)->ptr)
20
+ #endif
21
+ #ifndef RARRAY_LEN
22
+ #define RARRAY_LEN(v) (RARRAY(v)->len)
23
+ #endif
24
+ #endif
25
+ //}}} Make compatible with 1.9 and 1.8
26
+ EOC_CODE
27
+
28
+ builder.c_singleton <<-'EOC'
29
+ void sample_without_replacement ( int populationSize, int sampleSize, VALUE positions) {
30
+ // Use Knuth's variable names
31
+ int n = sampleSize;
32
+ int N = populationSize;
33
+
34
+ int t = 0; // total input records dealt with
35
+ int m = 0; // number of items selected so far
36
+ double u;
37
+
38
+ //srand ( (unsigned)time ( NULL ) );
39
+ while (m < n)
40
+ {
41
+ u = (double) rand() / ((double) RAND_MAX + 1.0);
42
+
43
+ if ( (N - t)*u >= n - m )
44
+ {
45
+ t++;
46
+ }
47
+ else
48
+ {
49
+ rb_ary_push(positions, rb_int_new(t));
50
+ t++; m++;
51
+ }
52
+ }
53
+ }
54
+ EOC
9
55
 
10
- builder.c_raw <<-'EOC'
11
- double weight(int position, int mean){
12
- double rel_pos = (double) abs(position - mean) / mean;
56
+ builder.c_raw_singleton <<-'EOC'
57
+ double fitted_weight(int position, int medium){
58
+ double rel_pos = (double) abs(position - medium) / medium;
13
59
  double weight = 0.3 * 0.5 * rel_pos + 0.7 * (exp(30*rel_pos)/exp(30));
14
60
  return(weight);
15
61
  }
16
- EOC
62
+ EOC
17
63
 
18
- builder.c <<-'EOC'
19
- double fast_score_scale(VALUE positions, int total, int missing){
64
+ builder.c_singleton <<-'EOC'
65
+ double score_fitted_weight(VALUE positions, int total, int missing){
20
66
  int idx;
21
-
22
- int mean = total / 2;
23
67
 
24
- VALUE rel_q = rb_ary_new();
68
+ int medium = total / 2;
69
+ int position;
70
+ double penalty;
71
+ double max_top, max_bottom;
72
+ double hit_weights = 0;
73
+
25
74
  VALUE rel_l = rb_ary_new();
26
-
75
+ VALUE rel_q = rb_ary_new();
76
+
27
77
  rb_ary_push(rel_q,rb_float_new(0));
28
78
 
29
79
  // Rescale positions and accumulate weights
30
- double total_weights = 0;
31
- for (idx = 0; idx < RARRAY(positions)->len; idx++){
32
- int position = FIX2INT(rb_ary_entry(positions, idx));
80
+
81
+ for (idx = 0; idx < RARRAY_LEN(positions); idx++){
82
+ position = FIX2INT(rb_ary_entry(positions, idx));
33
83
 
34
84
  rb_ary_push(rel_l, rb_float_new((double) position / total));
35
85
 
36
- total_weights += weight(position, mean);
37
- rb_ary_push(rel_q, rb_float_new(total_weights));
86
+ hit_weights += fitted_weight(position, medium);
87
+ rb_ary_push(rel_q, rb_float_new(hit_weights));
38
88
  }
39
89
 
40
90
  // Add penalty for missing genes
41
- double penalty = missing * weight(mean * 0.8, mean);
42
- total_weights = total_weights + penalty;
43
-
44
- // Traverse list and get extreme values
91
+ penalty = missing * fitted_weight(medium * 0.8, medium);
92
+ hit_weights = hit_weights + penalty;
93
+
94
+ // Traverse list and get extreme values of:
95
+ // Proportion of weight covered - Proportion of hits covered
96
+
97
+ max_top = max_bottom = 0;
98
+ for (idx = 0; idx < RARRAY_LEN(positions); idx++){
99
+ double top = RFLOAT_VALUE(rb_ary_entry(rel_q, idx + 1)) / hit_weights -
100
+ RFLOAT_VALUE(rb_ary_entry(rel_l, idx));
101
+ double bottom = - (penalty + RFLOAT_VALUE(rb_ary_entry(rel_q, idx))) / hit_weights +
102
+ RFLOAT_VALUE(rb_ary_entry(rel_l, idx));
103
+
104
+ if (top > max_top) max_top = top;
105
+ if (bottom > max_bottom) max_bottom = bottom;
106
+ }
107
+
108
+ if (max_top > max_bottom) return max_top;
109
+ else return -max_bottom;
110
+ }
111
+ EOC
112
+
113
+
114
+ builder.c_singleton <<-'EOC'
115
+ double score_custom_weights(VALUE positions, VALUE weights, int total_weights, int total, int missing){
116
+ int idx;
117
+
118
+ int medium = total / 2;
119
+ int position;
120
+ double penalty;
45
121
  double max_top, max_bottom;
122
+ double hit_weights = 0;
123
+
124
+ VALUE rel_l = rb_ary_new();
125
+ VALUE rel_q = rb_ary_new();
126
+
127
+ rb_ary_push(rel_q,rb_float_new(0));
128
+
129
+ // Rescale positions and accumulate weights
130
+
131
+ for (idx = 0; idx < RARRAY_LEN(positions); idx++){
132
+ position = FIX2INT(rb_ary_entry(positions, idx));
133
+
134
+ rb_ary_push(rel_l, rb_float_new((double) position / total));
135
+
136
+ hit_weights += rb_ary_entry(weights, position);
137
+ rb_ary_push(rel_q, rb_float_new(hit_weights / total_weights));
138
+ }
139
+
140
+ // Add penalty for missing genes
141
+ penalty = missing * rb_ary_entry(weights, (int) medium * 0.8);
142
+ hit_weights = hit_weights + penalty;
143
+ hit_weights = hit_weights / total_weights;
144
+
145
+ // Traverse list and get extreme values of:
146
+ // Proportion of weight covered - Proportion of hits covered
147
+
46
148
  max_top = max_bottom = 0;
47
- for (idx = 0; idx < RARRAY(positions)->len; idx++){
48
- double top = RFLOAT(rb_ary_entry(rel_q, idx + 1))->value / total_weights -
49
- RFLOAT(rb_ary_entry(rel_l, idx))->value;
50
- double bottom = - (penalty + RFLOAT(rb_ary_entry(rel_q, idx))->value) / total_weights +
51
- RFLOAT(rb_ary_entry(rel_l, idx))->value;
149
+ for (idx = 0; idx < RARRAY_LEN(positions); idx++){
150
+ double top = RFLOAT_VALUE(rb_ary_entry(rel_q, idx + 1)) / hit_weights -
151
+ RFLOAT_VALUE(rb_ary_entry(rel_l, idx));
152
+ double bottom = - (penalty + RFLOAT_VALUE(rb_ary_entry(rel_q, idx))) / hit_weights +
153
+ RFLOAT_VALUE(rb_ary_entry(rel_l, idx));
52
154
 
53
155
  if (top > max_top) max_top = top;
54
156
  if (bottom > max_bottom) max_bottom = bottom;
55
157
  }
56
-
158
+
57
159
  if (max_top > max_bottom) return max_top;
58
160
  else return -max_bottom;
59
161
  }
60
- EOC
162
+ EOC
61
163
 
62
- end
63
164
  end
64
165
 
65
166
  class << self
66
- alias score fast_score_scale
167
+ alias score score_fitted_weight
168
+ alias score_weights score_custom_weights
67
169
  end
68
170
 
69
171
  def self.combine(up, down)
@@ -90,10 +192,9 @@ module RandomWalk
90
192
  if size == 0
91
193
  [0] * times
92
194
  else
93
- a = (0..total - 1).to_a
94
195
  (1..times).collect do
95
- a.shuffle!
96
- score(a[1..size].sort, total, missing).abs
196
+ p = Misc.random_sample_in_range(total, size)
197
+ score(p.sort, total, missing).abs
97
198
  end
98
199
  end
99
200
  end
@@ -121,6 +222,7 @@ module RandomWalk
121
222
  }
122
223
 
123
224
  def self.draw_hits(hits, total, filename = nil, options = {})
225
+ update = options[:update]
124
226
 
125
227
  size = options[:size] || total
126
228
  bg_color = options[:bg_color] || :white
@@ -128,14 +230,13 @@ module RandomWalk
128
230
  sections = options[:sections] || []
129
231
 
130
232
  size = [size, total].min
233
+ canvas = PNG::Canvas.new size, width, COLORS[bg_color] || PNG::Color.from(bg_color)
131
234
 
132
235
  hits = hits.collect{|h| h - 1}
133
236
  if size < total
134
237
  hits = hits.collect{|h| (h.to_f * size / total).to_i}
135
238
  end
136
239
 
137
- canvas = PNG::Canvas.new size, width, COLORS[bg_color] || PNG::Color.from(bg_color)
138
-
139
240
  sections.each{|color, info|
140
241
  start = info[0]
141
242
  finish = info[1]
@@ -161,11 +262,24 @@ module RandomWalk
161
262
  end
162
263
 
163
264
  module OrderedList
265
+ attr_accessor :weights, :total_weights
266
+
267
# Turns +list+ into an OrderedList and attaches its weights.
#
# list          - object to extend; it is modified in place and returned.
# weights       - optional per-element weights.
# total_weights - optional precomputed sum of +weights+; when +weights+ is
#                 given without it, the sum is computed with Misc.sum.
def self.setup(list, weights = nil, total_weights = nil)
  list.extend OrderedList
  list.weights = weights

  sum_needed = weights && total_weights.nil?
  list.total_weights = sum_needed ? Misc.sum(weights) : total_weights

  list
end
277
+
164
278
  def self.hits(list, set)
165
279
  set = Set.new(set) unless Set === set
166
280
  hits = []
167
281
  list.each_with_index do |e,i|
168
- hits << i if set.include? e
282
+ hits << i + 1 if set.include? e # count from 1
169
283
  end
170
284
  hits
171
285
  end
@@ -179,15 +293,144 @@ module OrderedList
179
293
  OrderedList.hits(self, set)
180
294
  end
181
295
 
296
# Random-walk score of +set+ against this list, using the fitted weight
# curve and no missing elements.
def score(set)
  positions = hits(set).sort
  RandomWalk.score(positions, length, 0)
end
300
+
301
# Random-walk score of +set+ against this list, using the weights attached
# to the list and no missing elements.
#
# Raises RuntimeError when the list has no weights. The total weight is
# computed with Misc.sum and cached on first use.
def score_weights(set)
  raise "No weight defined" if @weights.nil?
  @total_weights = Misc.sum(@weights) unless @total_weights

  positions = hits(set).sort
  RandomWalk.score_weights(positions, @weights, @total_weights, length, 0)
end
307
+
308
+
182
309
  def draw_hits(set, filename = nil, options = {})
183
310
  OrderedList.draw_hits(self, set, filename, options)
184
311
  end
185
312
 
186
- def pvalue(set, options = {})
187
- options = Misc.add_defaults options, :permutations => 1000, :missing => 0
188
- hits = hits(set.compact)
189
- score = RandomWalk.score(hits.sort, self.length, 0)
190
- permutations = RandomWalk.permutations(set.length, self.length, options[:missing], options[:permutations])
191
- RandomWalk.pvalue(permutations, score)
313
+ #def pvalue(set, options = {})
314
+ # set = Set.new(set.compact) unless Set === set
315
+ # options = Misc.add_defaults options, :permutations => 10000, :missing => 0
316
+ # hits = hits(set)
317
+ # score = RandomWalk.score(hits.sort, self.length, 0)
318
+ # permutations = RandomWalk.permutations(set.length, self.length, options[:missing], options[:permutations])
319
+ # RandomWalk.pvalue(permutations, score)
320
+ #end
321
+
322
# Permutation p-value for the random-walk score of +set+ on this list,
# using the fitted weight curve.
#
# set     - elements to look for; nils are dropped unless it is already a Set.
# cutoff  - early-exit threshold: as soon as more than
#           (permutations * cutoff).ceil random samples beat the observed
#           score, 1.0 is returned instead of an exact estimate.
# options - :permutations (default 10000) number of random samples;
#           :missing (default 0) passed as the +missing+ argument when
#           scoring the random samples (the observed score always uses 0).
#
# Returns 1.0 when +set+ has no hit in the list. Otherwise returns the
# estimated p-value, negated when the observed score is negative so the
# sign carries the direction of the enrichment.
def pvalue(set, cutoff = 0.1, options = {})
  set = Set.new(set.compact) unless Set === set
  options = Misc.add_defaults options, :permutations => 10000, :missing => 0
  permutations, missing = Misc.process_options options, :permutations, :missing

  positions = hits(set)
  return 1.0 if positions.empty?

  total = self.length
  target_score = RandomWalk.score(positions.sort, total, 0)

  permutation_pvalue(target_score, set.length, permutations, cutoff) do |sample|
    RandomWalk.score(sample, total, missing)
  end
end

# Same as #pvalue, but scores with the weights attached to the list.
#
# Raises RuntimeError when the list has no weights. The total weight is
# computed with Misc.sum and cached on first use.
def pvalue_weights(set, cutoff = 0.1, options = {})
  raise "No weight defined" if @weights.nil?
  @total_weights ||= Misc.sum(@weights)

  set = Set.new(set.compact) unless Set === set
  options = Misc.add_defaults options, :permutations => 10000, :missing => 0
  permutations, missing = Misc.process_options options, :permutations, :missing

  positions = hits(set)
  return 1.0 if positions.empty?

  total = self.length
  target_score = RandomWalk.score_weights(positions.sort, @weights, @total_weights, total, 0)

  permutation_pvalue(target_score, set.length, permutations, cutoff) do |sample|
    RandomWalk.score_weights(sample, @weights, @total_weights, total, missing)
  end
end

# Shared permutation loop for #pvalue and #pvalue_weights.
#
# Draws +permutations+ random samples of +size+ positions from this list,
# yields each sorted sample to the block (which must return its score) and
# counts the samples whose absolute score is strictly greater than the
# absolute +target_score+. The count starts at 1, so the estimate is never 0.
def permutation_pvalue(target_score, size, permutations, cutoff)
  target_score_abs = target_score.abs
  max_better = (permutations.to_f * cutoff).ceil
  total = self.length

  better_count = 1
  (1..permutations).each do
    sample = []
    RandomWalk.sample_without_replacement(total, size, sample)

    better_count += 1 if yield(sample.sort).abs > target_score_abs

    # Already above the cutoff: the exact value is not needed.
    return 1.0 if better_count > max_better
  end

  estimate = better_count.to_f / permutations
  target_score < 0 ? -estimate : estimate
end
private :permutation_pvalue
398
+ end
399
+
400
module TSV

  # Rank-enrichment p-value of +hits+ within the ordered +list+.
  #
  # list    - ordered collection; it is extended with OrderedList in place.
  # hits    - elements whose positions in +list+ are tested.
  # options - :cutoff is taken as the early-exit cutoff of
  #           OrderedList#pvalue; everything else is forwarded to it.
  #
  # Returns whatever OrderedList#pvalue returns.
  def self.rank_enrichment_for_list(list, hits, options = {})
    # Work on a copy: the same options hash is passed in for every row by
    # TSV.rank_enrichment. Presumably Misc.process_options removes the keys
    # it extracts (TODO confirm); copying keeps the caller's hash intact
    # either way.
    options = options.dup
    cutoff = Misc.process_options options, :cutoff
    list.extend OrderedList

    # pvalue takes (set, cutoff, options) positionally, so the cutoff must
    # always be supplied; passing the options hash second would bind it to
    # +cutoff+. 0.1 mirrors the default declared by OrderedList#pvalue.
    list.pvalue(hits, cutoff || 0.1, options)
  end

  # Runs rank_enrichment_for_list for every entry of +tsv+ against +list+.
  #
  # tsv     - TSV whose values are the sets to test.
  # list    - ordered collection the sets are ranked against.
  # options - forwarded to rank_enrichment_for_list; when :fdr is set the
  #           result is passed through FDR.adjust_hash!.
  #
  # Returns a TSV keyed like +tsv+ holding, per key, the p-value and a
  # list of values.
  def self.rank_enrichment(tsv, list, options = {})
    if tsv.fields
      res = TSV.setup({}, :cast => :to_f, :type => :double, :key_field => tsv.key_field, :fields => ["p-value", tsv.fields.first])
    else
      res = TSV.setup({}, :cast => :to_f, :type => :double)
    end

    tsv.with_monitor do
      tsv.with_unnamed do
        tsv.through do |key, values|
          pvalue = rank_enrichment_for_list(list, values, options)
          # NOTE(review): the fallback `values - list` keeps the values that
          # are NOT in the list, while the name `subset` suggests the other
          # branch keeps the ones that are -- confirm the two branches agree.
          res[key] = [pvalue, (values.respond_to?(:subset) ? values.subset(list) : values - list)]
        end
      end
    end

    FDR.adjust_hash! res, 0 if options[:fdr]

    res
  end

  # Instance-level shortcut for TSV.rank_enrichment on this TSV.
  def rank_enrichment(list, options = {})
    TSV.rank_enrichment(self, list, options)
  end
end
@@ -6,7 +6,7 @@ require 'set'
6
6
 
7
7
 
8
8
  class TestNetwork < Test::Unit::TestCase
9
- def test_dijsktra
9
+ def _test_dijsktra
10
10
  string = STRING.protein_protein.tsv :persist => false, :fields => ["Interactor Ensembl Protein ID"], :type => :flat
11
11
  string.unnamed = true
12
12
 
@@ -22,11 +22,11 @@ class TestNetwork < Test::Unit::TestCase
22
22
 
23
23
  def test_weighted_dijsktra
24
24
  string = STRING.protein_protein.tsv
25
+ string.unnamed = true
25
26
 
26
27
  string.process "Score" do |scores|
27
28
  scores.collect{|score| 1000 - score.to_i}
28
29
  end
29
- string.unnamed = true
30
30
 
31
31
  start_node = "ENSP00000256078"
32
32
  end_node = "ENSP00000306245"
@@ -45,7 +45,7 @@ class TestNetwork < Test::Unit::TestCase
45
45
 
46
46
  end
47
47
 
48
- def test_random_weighted_dijsktra
48
+ def _test_random_weighted_dijsktra
49
49
  string = STRING.protein_protein.tsv
50
50
 
51
51
  string.process "Score" do |scores|
@@ -5,7 +5,7 @@ require 'test/unit'
5
5
  class TestHypergeometric < Test::Unit::TestCase
6
6
 
7
7
  def test_hypergeometric
8
- assert Hypergeometric.hypergeometric(100, 20, 15,13) < 0.05
8
+ assert Hypergeometric.hypergeometric(100, 20, 15, 13) < 0.0005
9
9
  end
10
10
 
11
11
  def test_annotation_counts
@@ -38,7 +38,29 @@ row7 A B Id3
38
38
  TmpFile.with_file(content) do |filename|
39
39
  tsv = TSV.open(filename, :sep => /\s+/)
40
40
 
41
- assert_equal %w(a), tsv.enrichment(%w(row1 row3 row4 row5), "ValueA", :fdr => false).collect{|annot,pvalue| pvalue < 0.05 ? annot : nil}.compact
41
+ assert_equal %w(a), tsv.enrichment(%w(row1 row3 row4 row5), "ValueA", :fdr => false).collect{|annot, values| pvalue = values.first.first.to_f; pvalue < 0.05 ? annot : nil}.compact
42
42
  end
43
43
  end
44
+
45
# Checks TSV#enrichment with an explicit :background: annotation "a" is
# significant against the full seven-row background and no annotation is
# significant when the background is restricted to five rows.
def test_enrichement_with_background
  content =<<-EOF
#Id ValueA ValueB OtherID
row1 a|aa|aaa b Id1|Id2
row2 A B Id3
row3 a C Id4
row4 a B Id3
row5 a B Id3
row6 A B Id3
row7 A B Id3
  EOF

  TmpFile.with_file(content) do |filename|
    tsv = TSV.open(filename, :sep => /\s+/)

    assert_equal %w(a), tsv.enrichment(%w(row1 row3 row4 row5), "ValueA", :fdr => false, :background => %w(row1 row2 row3 row4 row5 row6 row7)).collect{|annot, values| pvalue = values.first.first.to_f; pvalue < 0.05 ? annot : nil}.compact
    assert_equal %w(), tsv.enrichment(%w(row1 row3 row4 row5), "ValueA", :fdr => false, :background => %w(row1 row2 row3 row4 row5)).collect{|annot, values| pvalue = values.first.first.to_f; pvalue < 0.05 ? annot : nil}.compact
  end
end
44
66
  end
@@ -0,0 +1,39 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
2
+ require 'rbbt/statistics/random_walk'
3
+ require 'test/unit'
4
+
5
# Tests for the random-walk scores and permutation p-values. Each test
# compares a set packed at the top of a 1000-element ranking with a set
# spread over the ranking.
class TestRandomWalk < Test::Unit::TestCase
  def test_score_weight
    list = (1..1000).to_a
    list.extend OrderedList

    # Weight of each element: squared distance from the mean of the list.
    mean = Misc.mean(list)
    weights = list.collect { |value| (mean - value)**2 }
    weights_total = Misc.sum(weights)

    top_score = RandomWalk.score_custom_weights((1..100).to_a, weights, weights_total, list.length, 0)
    spread_score = RandomWalk.score_custom_weights([100, 200, 300, 400, 500], weights, weights_total, list.length, 0)

    assert top_score > spread_score
  end

  def test_pvalue
    list = (1..1000).to_a
    list.extend OrderedList

    top_pvalue = list.pvalue((1..100).to_a, 0.05)
    assert top_pvalue < 0.05

    spread_pvalue = list.pvalue([100, 200, 300, 400, 500], 0.05)
    assert spread_pvalue > 0.05
  end

  def test_pvalue_weights
    list = (1..1000).to_a

    # Weight of each element: squared distance from the mean of the list.
    mean = Misc.mean(list)
    weights = list.collect { |value| (mean - value)**2 }
    weights_total = Misc.sum(weights)

    OrderedList.setup(list, weights, weights_total)

    top_pvalue = list.pvalue_weights((1..100).to_a, 0.05)
    assert top_pvalue < 0.05

    spread_pvalue = list.pvalue_weights([100, 200, 300, 400, 500], 0.05)
    assert spread_pvalue > 0.05
  end
end
38
+
39
+
data/test/test_helper.rb CHANGED
@@ -3,7 +3,7 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
3
  $LOAD_PATH.unshift(File.dirname(__FILE__))
4
4
 
5
5
  class Test::Unit::TestCase
6
- def test_datafile(file)
6
+ def get_test_datafile(file)
7
7
  File.join(File.dirname(__FILE__), 'data', file)
8
8
  end
9
9
  end