rbbt-dm 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +20 -0
- data/lib/rbbt/statistics/fdr.rb +132 -0
- data/lib/rbbt/statistics/hypergeometric.rb +146 -0
- data/test/rbbt/statistics/test_fdr.rb +41 -0
- data/test/rbbt/statistics/test_hypergeometric.rb +45 -0
- data/test/test_helper.rb +9 -0
- metadata +87 -0
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2010-2011 Miguel Vázquez García
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
@@ -0,0 +1,132 @@
|
|
1
|
+
begin
  require 'inline'
rescue LoadError
  # RubyInline is only needed for the C-accelerated variants. When it is not
  # installed the pure-Ruby implementations are exposed under the same names.
end

# Benjamini-Hochberg false discovery rate (FDR) helpers.
#
# Every method expects p-values sorted in ascending order. The +*_native+
# methods are pure Ruby; the +*_fast+ methods are compiled with RubyInline
# when it is available and otherwise delegate to the pure-Ruby code.
# +adjust+, +adjust!+ and +step_up+ are aliases for the +*_fast+ variants.
module FDR

  # Benjamini-Hochberg step-up cutoff.
  #
  # values:: p-values, sorted ascending.
  # rate::   the FDR level to control.
  #
  # Returns the largest p-value p(i) with p(i) <= rate * i / total (i being
  # the 1-based rank), or 0.0 when no p-value satisfies it (also for an empty
  # list). Scanning from the largest rank downwards is what makes this a
  # step-up procedure: a failing p-value at a low rank does not prevent a
  # higher rank from passing.
  def self.step_up_native(values, rate)
    total = values.length

    (total - 1).downto(0) do |i|
      value = values[i]
      return value if value <= rate * (i + 1).to_f / total
    end

    0.0
  end

  # Benjamini-Hochberg adjusted p-values.
  #
  # values:: p-values, sorted ascending.
  #
  # Returns a new Array of Floats in the same order as +values+; the input is
  # not modified. Each adjusted value is p * total / rank, capped at 1.0 and
  # forced to be monotone by taking the running minimum from the largest rank
  # downwards.
  def self.adjust_native(values)
    total = values.length.to_f

    adjusted = []
    last = 1.0 # Float, so the result never mixes Integer and Float entries
    values.reverse.each_with_index do |value, i|
      # i counts from the end, so (total - i) is the 1-based rank of value
      last = [last, value * total / (total - i)].min
      adjusted << last
    end

    adjusted.reverse
  end

  if Module.method_defined?(:inline)
    class << self
      inline do |builder|
        # Same contract as step_up_native
        builder.c <<-EOC
          double step_up_fast(VALUE ps, double rate){
            long idx;
            long total = RARRAY_LEN(ps);

            for (idx = total - 1; idx >= 0; idx--){
              double p = NUM2DBL(rb_ary_entry(ps, idx));

              if (p <= rate * (double) (idx + 1) / (double) total){
                return p;
              }
            }

            return 0.0;
          }
        EOC

        # In-place version of adjust_fast: the elements of ps are replaced by
        # their adjusted values and ps itself is returned
        builder.c <<-EOC
          VALUE adjust_fast_self(VALUE ps){
            long idx;
            long total = RARRAY_LEN(ps);

            double last = 1;
            for (idx = total - 1; idx >= 0; idx--){
              double p = NUM2DBL(rb_ary_entry(ps, idx));

              p = p * (double) total / (double) (idx + 1);
              if (p > last) p = last;
              last = p;

              rb_ary_store(ps, idx, rb_float_new(p));
            }

            return ps;
          }
        EOC

        # Same contract as adjust_native
        builder.c <<-EOC
          VALUE adjust_fast(VALUE ps){
            long idx;
            long total = RARRAY_LEN(ps);

            VALUE adjusted = rb_ary_new2(total);

            double last = 1;
            for (idx = total - 1; idx >= 0; idx--){
              double p = NUM2DBL(rb_ary_entry(ps, idx));

              p = p * (double) total / (double) (idx + 1);
              if (p > last) p = last;
              last = p;

              rb_ary_store(adjusted, idx, rb_float_new(p));
            }

            return adjusted;
          }
        EOC
      end
    end
  else
    # RubyInline is not loaded: keep the public method names available by
    # delegating to the pure-Ruby implementations.
    class << self
      def step_up_fast(ps, rate)
        step_up_native(ps, rate)
      end

      def adjust_fast(ps)
        adjust_native(ps)
      end

      def adjust_fast_self(ps)
        ps.replace(adjust_native(ps))
      end
    end
  end

  class << self
    alias_method :adjust, :adjust_fast
    alias_method :adjust!, :adjust_fast_self
    alias_method :step_up, :step_up_fast
  end

  # Replaces the p-values stored in +data+ by their adjusted values.
  #
  # data::  Hash mapping keys to p-values or, when +field+ is given, to
  #         containers indexed by +field+ that hold the p-value.
  # field:: optional index of the p-value inside each entry of +data+.
  #
  # The adjusted values are written back into +data+ explicitly (data[key],
  # or data[key][field] when +field+ is given), so the result does not depend
  # on Float objects being mutable. Returns +data+.
  def self.adjust_hash!(data, field = nil)
    pairs = data.collect{|key, value| [key, field.nil? ? value : value[field]] }
    pairs = pairs.sort{|a,b| a[1] <=> b[1] }

    keys   = pairs.collect{|pair| pair[0] }
    values = pairs.collect{|pair| pair[1] }

    FDR.adjust!(values)

    keys.each_with_index do |key, i|
      if field.nil?
        data[key] = values[i]
      else
        data[key][field] = values[i]
      end
    end

    data
  end

end
|
132
|
+
|
@@ -0,0 +1,146 @@
|
|
1
|
+
require 'inline'
|
2
|
+
require 'rbbt/util/tsv'
|
3
|
+
|
4
|
+
# Upper-tail hypergeometric probabilities, implemented in C through
# RubyInline. The only function meant to be called from Ruby is
# Hypergeometric.hypergeometric(total, support, list, found).
module Hypergeometric
  class << self
    inline do |builder|
      # Helper functions used by hypergeometric below
      builder.c_raw <<-EOC
        /**
         * Compute log(k!)
         * @param k The value k.
         * @return The result.
         */
        double lFactorial(double k)
        {
          double r = 0;
          int i;
          for(i=2 ; i<=(int)k ; i++)
          {
            r = r + (double)(log((double)i));
          }
          return r;
        }

        /**
         * Compute the log(binom(n,k))
         * @param n The number of possible items.
         * @param k The number of selected items.
         * @return The result.
         */
        double lBinom(double n, double k)
        {
          long i;
          double r = 0;

          /* binom(n,k) == binom(n,n-k): use the smaller of the two so the
           * loops below run as few iterations as possible */
          if(k > n-k){
            k = n-k;
          }

          for(i = (long)n ; i> (n-k) ; i--)
          {
            r = r + log((double)i);
          }

          r = r - lFactorial(k);

          return r;
        }
      EOC

      builder.c <<-EOC
        /**
         * Compute the Hypergeometric accumulated value: the probability of
         * finding at least "found" supporting items in the selected list.
         * @param total   => total size
         * @param support => total support
         * @param list    => selected list size
         * @param found   => support found in the selected list
         * @return The result
         */
        double hypergeometric(double total, double support, double list, double found)
        {
          double other = total - support;
          double top   = list;
          double low   = found;
          double log_n_choose_k;
          double lfoo;
          double sum = 0;
          int i;

          if(support < list){
            top = support;
          }

          /* Overlaps below max(0, list - other) cannot occur; stopping there
           * avoids taking the log of a non-positive ratio, which would turn
           * the sum into NaN */
          if(low < 0){
            low = 0;
          }
          if(low < list - other){
            low = list - other;
          }

          log_n_choose_k = lBinom(total,list);

          /* log of the numerator for the largest possible overlap */
          lfoo = lBinom(support,top) + lBinom(other, list-top);

          for (i = (int)top; i >= low; i-- )
          {
            sum = sum + exp(lfoo - log_n_choose_k);
            if ( i > low)
            {
              /* move from the term for overlap i to the term for i - 1 */
              lfoo = lfoo + log(i / (support - i+1)) +  log( (other - list + i) / (list-i+1) );
            }
          }
          return sum;
        }
      EOC
    end
  end
end
|
90
|
+
|
91
|
+
# Enrichment-analysis extensions for TSV (the class itself is loaded from
# 'rbbt/util/tsv').
class TSV

  # Counts, for every distinct value found in +fields+, the number of rows in
  # which it appears (a value is counted once per row).
  #
  # fields:: a field name (String or Symbol) or a list of them; defaults to
  #          all fields of this TSV.
  #
  # The counts are kept in a persistence file derived from this TSV's
  # filename and +fields+. When that file already exists its contents are
  # returned; otherwise the counts are computed and merged into it.
  # NOTE(review): TCHash.get presumably opens (creating if needed) a
  # hash-like persistent store -- confirm against rbbt-util.
  def annotation_counts(fields = nil)
    fields ||= self.fields
    fields = [fields] if String === fields || Symbol === fields

    annotation_count_cache_file = TSV.get_persistence_file(File.basename(filename) + "_" + fields.inspect, File.expand_path(File.dirname(filename)))

    # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2
    if File.exist?(annotation_count_cache_file)
      Log.low "Loading annotation counts from #{ annotation_count_cache_file }"
      TCHash.get(annotation_count_cache_file)
    else
      Log.low "Saving annotation counts to #{ annotation_count_cache_file }"
      hash = TCHash.get(annotation_count_cache_file)

      counts = Hash.new(0)
      through :main, fields do |key, values|
        values.flatten.compact.uniq.each{|value| counts[value] += 1}
      end
      hash.merge! counts
    end
  end

  # Hypergeometric enrichment of the annotations in +fields+ among the
  # entities in +list+.
  #
  # list::    keys of the entities of interest.
  # fields::  field(s) holding the annotations.
  # options:: :min_support -- annotations seen in fewer selected rows than
  #           this are skipped (3 is passed as the default to
  #           Misc.add_defaults);
  #           :fdr -- when set, p-values are adjusted with FDR.adjust_hash!;
  #           :cutoff -- when set, p-values above it are removed.
  #
  # Returns a Hash mapping each tested annotation to its p-value.
  def enrichment(list, fields, options = {})
    options = Misc.add_defaults options, :min_support => 3
    Log.debug "Enrichment analysis of field #{fields.inspect} for #{list.length} entities"
    selected = select :main => list

    tsv_size = keys.length
    total = selected.keys.length
    Log.debug "Found #{total} of #{list.length} entities"

    # Background counts over the whole TSV
    counts = annotation_counts fields

    # Counts restricted to the selected rows
    annotations = Hash.new 0
    selected.through :main, fields do |key, values|
      values.flatten.compact.uniq.each{|value| annotations[value] += 1}
    end

    pvalues = {}
    annotations.each do |annotation, count|
      Log.debug "Hypergeometric: #{ annotation } - #{[tsv_size, counts[annotation], total, count].inspect}"
      next if count < options[:min_support]
      pvalue = Hypergeometric.hypergeometric(tsv_size, counts[annotation], total, count)
      pvalues[annotation] = pvalue
    end

    FDR.adjust_hash! pvalues if options[:fdr]
    pvalues.delete_if{|k, pvalue| pvalue > options[:cutoff] } if options[:cutoff]

    pvalues
  end
end
|
144
|
+
|
145
|
+
|
146
|
+
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
|
2
|
+
require 'rbbt/statistics/fdr'
|
3
|
+
require 'test/unit'
|
4
|
+
require 'rsruby'
|
5
|
+
|
6
|
+
# Checks the FDR implementations against each other and against the 'BH'
# adjustment computed by R (through RSRuby).
class TestFDR < Test::Unit::TestCase
  # Truncates a number, or each number of an Array, to four decimal places so
  # that results can be compared with assert_equal.
  def clean(values)
    truncate = lambda{|v| (v * 10000).to_i.to_f / 10000 }

    return values.collect{|v| truncate.call(v) } if Array === values

    truncate.call(values)
  end

  # Builds a new Array holding fresh Float objects with the same values.
  def copy(values)
    values.collect{|v| v + 0.0}
  end

  def setup
    @r         = RSRuby.instance
    @values    = [0.001, 0.002, 0.003, 0.003, 0.003, 0.004, 0.006, 0.07, 0.09]
    @threshold = 0.01
    @r_adj     = @r.p_adjust(@values,'BH')
  end

  def test_step_up
    cutoff = FDR.step_up(@values, @threshold)
    assert_equal(0.006, clean(cutoff))

    native = FDR.step_up_native(@values, @threshold)
    fast   = FDR.step_up_fast(@values,@threshold)
    assert_equal(clean(native), clean(fast))

    # The number of p-values under the cutoff must match the number of
    # adjusted p-values under the threshold
    significant_in_r = @r_adj.select{|v| v <= @threshold}
    under_cutoff     = @values.select{|v| v <= FDR.step_up(@values, @threshold)}
    assert_equal(significant_in_r.length, under_cutoff.length)
  end

  def test_adjust
    expected = clean(@r_adj)

    assert_equal(expected, clean(FDR.adjust_native(@values)))
    assert_equal(clean(FDR.adjust_fast(@values)), clean(FDR.adjust_native(@values)))

    # adjust_fast_self works in place, so it is given a copy
    assert_equal(expected, clean(FDR.adjust_fast_self(copy(@values))))
  end
end
|
40
|
+
|
41
|
+
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
|
2
|
+
require 'rbbt/statistics/hypergeometric'
|
3
|
+
require 'test/unit'
|
4
|
+
|
5
|
+
class TestHypergeometric < Test::Unit::TestCase
|
6
|
+
|
7
|
+
def test_hypergeometric
|
8
|
+
assert Hypergeometric.hypergeometric(100, 20, 15,13) < 0.05
|
9
|
+
end
|
10
|
+
|
11
|
+
def test_annotation_counts
|
12
|
+
content =<<-EOF
|
13
|
+
#Id ValueA ValueB OtherID
|
14
|
+
row1 a|aa|aaa b Id1|Id2
|
15
|
+
row2 A B Id3
|
16
|
+
row3 a C Id4
|
17
|
+
EOF
|
18
|
+
|
19
|
+
TmpFile.with_file(content) do |filename|
|
20
|
+
tsv = TSV.new(filename + '#:sep=/\s+/')
|
21
|
+
counts = tsv.annotation_counts
|
22
|
+
assert_equal 2, counts['a']
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
def test_enrichment
|
27
|
+
content =<<-EOF
|
28
|
+
#Id ValueA ValueB OtherID
|
29
|
+
row1 a|aa|aaa b Id1|Id2
|
30
|
+
row2 A B Id3
|
31
|
+
row3 a C Id4
|
32
|
+
row4 a B Id3
|
33
|
+
row5 a B Id3
|
34
|
+
row6 A B Id3
|
35
|
+
row7 A B Id3
|
36
|
+
EOF
|
37
|
+
|
38
|
+
TmpFile.with_file(content) do |filename|
|
39
|
+
tsv = TSV.new(filename + '#:sep=/\s+/')
|
40
|
+
|
41
|
+
assert_equal %w(a), tsv.enrichment(%w(row1 row3 row4 row5), "ValueA").collect{|annot,pvalue| pvalue < 0.05 ? annot : nil}.compact
|
42
|
+
assert_equal %w(aa aaa), tsv.enrichment(%w(row1 row3 row4 row5), "ValueA").collect{|annot,pvalue| pvalue > 0.05 ? annot : nil}.compact
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
data/test/test_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rbbt-dm
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 29
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 1
|
10
|
+
version: 0.0.1
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Miguel Vazquez
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2010-12-20 00:00:00 +01:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: rbbt-util
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
version: "0"
|
33
|
+
type: :runtime
|
34
|
+
version_requirements: *id001
|
35
|
+
description: Data-mining and statistics
|
36
|
+
email: miguel.vazquez@fdi.ucm.es
|
37
|
+
executables: []
|
38
|
+
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files:
|
42
|
+
- LICENSE
|
43
|
+
files:
|
44
|
+
- LICENSE
|
45
|
+
- lib/rbbt/statistics/fdr.rb
|
46
|
+
- lib/rbbt/statistics/hypergeometric.rb
|
47
|
+
- test/rbbt/statistics/test_fdr.rb
|
48
|
+
- test/rbbt/statistics/test_hypergeometric.rb
|
49
|
+
- test/test_helper.rb
|
50
|
+
has_rdoc: true
|
51
|
+
homepage: http://github.com/mikisvaz/rbbt-phgx
|
52
|
+
licenses: []
|
53
|
+
|
54
|
+
post_install_message:
|
55
|
+
rdoc_options: []
|
56
|
+
|
57
|
+
require_paths:
|
58
|
+
- lib
|
59
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
60
|
+
none: false
|
61
|
+
requirements:
|
62
|
+
- - ">="
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
hash: 3
|
65
|
+
segments:
|
66
|
+
- 0
|
67
|
+
version: "0"
|
68
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
69
|
+
none: false
|
70
|
+
requirements:
|
71
|
+
- - ">="
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
hash: 3
|
74
|
+
segments:
|
75
|
+
- 0
|
76
|
+
version: "0"
|
77
|
+
requirements: []
|
78
|
+
|
79
|
+
rubyforge_project:
|
80
|
+
rubygems_version: 1.3.7
|
81
|
+
signing_key:
|
82
|
+
specification_version: 3
|
83
|
+
summary: Data-mining and statistics
|
84
|
+
test_files:
|
85
|
+
- test/rbbt/statistics/test_fdr.rb
|
86
|
+
- test/rbbt/statistics/test_hypergeometric.rb
|
87
|
+
- test/test_helper.rb
|