semtools 0.1.2 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +4 -1
- data/bin/onto2json.rb +9 -3
- data/bin/semtools.rb +446 -0
- data/bin/strsimnet.rb +1 -2
- data/external_data/ontologies.txt +4 -0
- data/lib/semtools/math_methods.rb +137 -129
- data/lib/semtools/ontology.rb +2550 -2032
- data/lib/semtools/sim_handler.rb +1 -1
- data/lib/semtools/version.rb +1 -1
- data/lib/semtools.rb +0 -1
- data/semtools.gemspec +3 -0
- metadata +48 -3
@@ -1,140 +1,148 @@
|
|
1
|
-
# TODO: Make a pull request to https://rubygems.org/gems/ruby-statistics, with all the statistic code implemented here.
|
2
|
-
#to compute fisher exact test
|
3
|
-
#Fisher => http://www.biostathandbook.com/fishers.html
|
4
|
-
def get_fisher_exact_test(listA, listB, all_elements_count, tail ='two_sided', weigths=nil)
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
all_elements_count = weigths.values.inject(0){|sum, n| sum + n}.ceil
|
21
|
-
end
|
22
|
-
if tail == 'two_sided'
|
23
|
-
accumulated_prob = get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
|
24
|
-
elsif tail == 'less'
|
25
|
-
accumulated_prob = get_less_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
|
26
|
-
end
|
27
|
-
return accumulated_prob
|
28
|
-
end
|
1
|
+
# # TODO: Make a pull request to https://rubygems.org/gems/ruby-statistics, with all the statistic code implemented here.
|
2
|
+
# #to compute fisher exact test
|
3
|
+
# #Fisher => http://www.biostathandbook.com/fishers.html
|
4
|
+
# def get_fisher_exact_test(listA, listB, all_elements_count, tail ='two_sided', weigths=nil, partial_weigths=true)
|
5
|
+
# #puts '-', listA.inspect, listB.inspect, '-'
|
6
|
+
# listA_listB = listA & listB
|
7
|
+
# listA_nolistB = listA - listB
|
8
|
+
# nolistA_listB = listB - listA
|
9
|
+
# if weigths.nil?
|
10
|
+
# listA_listB_count = listA_listB.length
|
11
|
+
# listA_nolistB_count = listA_nolistB.length
|
12
|
+
# nolistA_listB_count = nolistA_listB.length
|
13
|
+
# nolistA_nolistB_count = all_elements_count - (listA | listB).length
|
14
|
+
# else
|
15
|
+
# # Fisher exact test weighted as proposed in "Improved scoring of functional groups from gene expression data by decorrelating GO graph structure"
|
16
|
+
# # https://academic.oup.com/bioinformatics/article/22/13/1600/193669
|
17
|
+
# listA_listB_count = listA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
|
18
|
+
# listA_nolistB_count = listA_nolistB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
|
19
|
+
# nolistA_listB_count = nolistA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
|
29
20
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
nolistA_listB_count + n,
|
47
|
-
nolistA_nolistB_count - n,
|
48
|
-
all_elements_count
|
49
|
-
)
|
50
|
-
prob <= ref_prob ? accumulated_prob += prob : break
|
51
|
-
end
|
21
|
+
# if partial_weigths
|
22
|
+
# nolistA_nolistB_count = all_elements_count - (listA | listB).length
|
23
|
+
# all_elements_count = nolistA_nolistB_count + listA_listB_count + listA_nolistB_count + nolistA_listB_count
|
24
|
+
# else
|
25
|
+
# nolistA_nolistB_count = (weigths.keys - (listA | listB)).map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
|
26
|
+
# all_elements_count = weigths.values.inject(0){|sum, n| sum + n}.ceil
|
27
|
+
# end
|
28
|
+
# end
|
29
|
+
# #puts [listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count].inspect
|
30
|
+
# if tail == 'two_sided'
|
31
|
+
# accumulated_prob = get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
|
32
|
+
# elsif tail == 'less'
|
33
|
+
# accumulated_prob = get_less_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
|
34
|
+
# end
|
35
|
+
# return accumulated_prob
|
36
|
+
# end
|
52
37
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
38
|
+
# def get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
|
39
|
+
# #https://www.sheffield.ac.uk/polopoly_fs/1.43998!/file/tutorial-9-fishers.pdf
|
40
|
+
# accumulated_prob = 0
|
41
|
+
# ref_prob = compute_hyper_prob(
|
42
|
+
# listA_listB_count,
|
43
|
+
# listA_nolistB_count,
|
44
|
+
# nolistA_listB_count,
|
45
|
+
# nolistA_nolistB_count,
|
46
|
+
# all_elements_count
|
47
|
+
# )
|
48
|
+
# accumulated_prob += ref_prob
|
49
|
+
# [listA_listB_count, nolistA_nolistB_count].min.times do |n| #less
|
50
|
+
# n += 1
|
51
|
+
# prob = compute_hyper_prob(
|
52
|
+
# listA_listB_count - n,
|
53
|
+
# listA_nolistB_count + n,
|
54
|
+
# nolistA_listB_count + n,
|
55
|
+
# nolistA_nolistB_count - n,
|
56
|
+
# all_elements_count
|
57
|
+
# )
|
58
|
+
# prob <= ref_prob ? accumulated_prob += prob : break
|
59
|
+
# end
|
64
60
|
|
65
|
-
|
66
|
-
|
61
|
+
# [listA_nolistB_count, nolistA_listB_count].min.times do |n| #greater
|
62
|
+
# n += 1
|
63
|
+
# prob = compute_hyper_prob(
|
64
|
+
# listA_listB_count + n,
|
65
|
+
# listA_nolistB_count - n,
|
66
|
+
# nolistA_listB_count - n,
|
67
|
+
# nolistA_nolistB_count + n,
|
68
|
+
# all_elements_count
|
69
|
+
# )
|
70
|
+
# accumulated_prob += prob if prob <= ref_prob
|
71
|
+
# end
|
67
72
|
|
68
|
-
|
69
|
-
|
70
|
-
[listA_listB_count, nolistA_nolistB_count].min.times do |n|
|
71
|
-
accumulated_prob += compute_hyper_prob(
|
72
|
-
listA_listB_count - n,
|
73
|
-
listA_nolistB_count + n,
|
74
|
-
nolistA_listB_count + n,
|
75
|
-
nolistA_nolistB_count - n,
|
76
|
-
all_elements_count
|
77
|
-
)
|
78
|
-
end
|
79
|
-
return accumulated_prob
|
80
|
-
end
|
73
|
+
# return accumulated_prob
|
74
|
+
# end
|
81
75
|
|
82
|
-
def
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
76
|
+
# def get_less_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
|
77
|
+
# accumulated_prob = 0
|
78
|
+
# [listA_listB_count, nolistA_nolistB_count].min.times do |n|
|
79
|
+
# accumulated_prob += compute_hyper_prob(
|
80
|
+
# listA_listB_count - n,
|
81
|
+
# listA_nolistB_count + n,
|
82
|
+
# nolistA_listB_count + n,
|
83
|
+
# nolistA_nolistB_count - n,
|
84
|
+
# all_elements_count
|
85
|
+
# )
|
86
|
+
# end
|
87
|
+
# return accumulated_prob
|
88
|
+
# end
|
89
89
|
|
90
|
-
def
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
end
|
90
|
+
# def compute_hyper_prob(a, b, c, d, n)
|
91
|
+
# # https://en.wikipedia.org/wiki/Fisher%27s_exact_test
|
92
|
+
# binomA = binom(a + b, a)
|
93
|
+
# binomC = binom(c + d, c)
|
94
|
+
# divisor = binom(n, a + c)
|
95
|
+
# return (binomA * binomC).fdiv(divisor)
|
96
|
+
# end
|
97
97
|
|
98
|
-
#
|
99
|
-
#
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
arr_cummin_input[i] = (n / (n - i).to_f) * arr_pvalues[arr_o[i]]
|
106
|
-
end
|
107
|
-
arr_ro = order(arr_o)
|
108
|
-
arr_cummin = cummin(arr_cummin_input)
|
109
|
-
arr_pmin = pmin(arr_cummin)
|
110
|
-
return arr_pmin.values_at(*arr_ro)
|
111
|
-
end
|
98
|
+
# def binom(n,k)
|
99
|
+
# if k > 0 && k < n
|
100
|
+
# res = (1+n-k..n).inject(:*)/(1..k).inject(:*)
|
101
|
+
# else
|
102
|
+
# res = 1
|
103
|
+
# end
|
104
|
+
# end
|
112
105
|
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
106
|
+
# #to compute adjusted pvalues
|
107
|
+
# #https://rosettacode.org/wiki/P-value_correction#Ruby
|
108
|
+
# def get_benjaminiHochberg_pvalues(arr_pvalues)
|
109
|
+
# n = arr_pvalues.length
|
110
|
+
# arr_o = order(arr_pvalues, true)
|
111
|
+
# arr_cummin_input = []
|
112
|
+
# (0..(n - 1)).each do |i|
|
113
|
+
# arr_cummin_input[i] = (n / (n - i).to_f) * arr_pvalues[arr_o[i]]
|
114
|
+
# end
|
115
|
+
# arr_ro = order(arr_o)
|
116
|
+
# arr_cummin = cummin(arr_cummin_input)
|
117
|
+
# arr_pmin = pmin(arr_cummin)
|
118
|
+
# return arr_pmin.values_at(*arr_ro)
|
119
|
+
# end
|
120
120
|
|
121
|
-
def
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
return arr_cummin
|
129
|
-
end
|
121
|
+
# def order(array, decreasing = false)
|
122
|
+
# if decreasing == false
|
123
|
+
# array.sort.map { |n| array.index(n) }
|
124
|
+
# else
|
125
|
+
# array.sort.map { |n| array.index(n) }.reverse
|
126
|
+
# end
|
127
|
+
# end
|
130
128
|
|
131
|
-
def
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
end
|
129
|
+
# def cummin(array)
|
130
|
+
# cumulative_min = array.first
|
131
|
+
# arr_cummin = []
|
132
|
+
# array.each do |p|
|
133
|
+
# cumulative_min = [p, cumulative_min].min
|
134
|
+
# arr_cummin << cumulative_min
|
135
|
+
# end
|
136
|
+
# return arr_cummin
|
137
|
+
# end
|
138
|
+
|
139
|
+
# def pmin(array)
|
140
|
+
# x = 1
|
141
|
+
# pmin_array = []
|
142
|
+
# array.each_index do |i|
|
143
|
+
# pmin_array[i] = [array[i], x].min
|
144
|
+
# abort if pmin_array[i] > 1
|
145
|
+
# end
|
146
|
+
# return pmin_array
|
147
|
+
# end
|
140
148
|
|