macroape 3.3.2 → 3.3.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +0 -1
- data/Rakefile.rb +65 -0
- data/TODO.txt +20 -0
- data/benchmark/similarity_benchmark.rb +56 -0
- data/lib/macroape.rb +1 -2
- data/lib/macroape/aligned_pair_intersection.rb +43 -116
- data/lib/macroape/collection.rb +4 -4
- data/lib/macroape/{threshold_by_pvalue.rb → counting.rb} +28 -18
- data/lib/macroape/exec/eval_alignment.rb +19 -22
- data/lib/macroape/exec/eval_similarity.rb +13 -13
- data/lib/macroape/exec/find_pvalue.rb +7 -7
- data/lib/macroape/exec/find_threshold.rb +8 -8
- data/lib/macroape/exec/preprocess_collection.rb +8 -8
- data/lib/macroape/exec/scan_collection.rb +16 -16
- data/lib/macroape/pwm_compare.rb +2 -3
- data/lib/macroape/pwm_compare_aligned.rb +34 -26
- data/lib/macroape/version.rb +1 -1
- data/spec/count_distribution_spec.rb +52 -0
- data/spec/spec_helper.rb +4 -0
- data/test/eval_alignment_similarity_test.rb +1 -0
- data/test/eval_similarity_test.rb +1 -0
- data/test/find_pvalue_test.rb +1 -0
- data/test/find_threshold_test.rb +1 -0
- data/test/preprocess_collection_test.rb +1 -0
- data/test/scan_collection_test.rb +1 -0
- data/test/test_helper.rb +4 -4
- metadata +10 -5
- data/Rakefile +0 -28
- data/lib/macroape/count_by_threshold.rb +0 -16
data/.gitignore
CHANGED
data/Rakefile.rb
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
#!/usr/bin/env rake
|
2
|
+
require "bundler/gem_tasks"
|
3
|
+
require 'rspec/core/rake_task'
|
4
|
+
|
5
|
+
namespace :spec do
|
6
|
+
task :find_threshold do
|
7
|
+
system("ruby -I ./test test/find_threshold_test.rb")
|
8
|
+
end
|
9
|
+
task :find_pvalue do
|
10
|
+
system("ruby -I ./test test/find_pvalue_test.rb")
|
11
|
+
end
|
12
|
+
task :eval_similarity do
|
13
|
+
system("ruby -I ./test test/eval_similarity_test.rb")
|
14
|
+
end
|
15
|
+
task :eval_alignment_similarity do
|
16
|
+
system("ruby -I ./test test/eval_alignment_similarity_test.rb")
|
17
|
+
end
|
18
|
+
task :preprocess_collection do
|
19
|
+
system("ruby -I ./test test/preprocess_collection_test.rb")
|
20
|
+
end
|
21
|
+
task :scan_collection do
|
22
|
+
system("ruby -I ./test test/scan_collection_test.rb")
|
23
|
+
end
|
24
|
+
task :tests => [:find_threshold, :find_pvalue, :eval_similarity,
|
25
|
+
:eval_alignment_similarity, :scan_collection, :preprocess_collection]
|
26
|
+
|
27
|
+
RSpec::Core::RakeTask.new
|
28
|
+
end
|
29
|
+
|
30
|
+
desc 'Test all functionality of gem executables'
|
31
|
+
task :spec => ['spec:tests', 'spec:spec']
|
32
|
+
|
33
|
+
namespace :benchmark do
|
34
|
+
task :run do
|
35
|
+
require 'open3'
|
36
|
+
time = Time.now.strftime("%d-%m-%Y, %H:%M:%S sec")
|
37
|
+
File.open('benchmark/benchmark.log','a') do |f|
|
38
|
+
f.puts "=========================================================\n#{time}\n"
|
39
|
+
Dir.glob('benchmark/*_benchmark.rb') do |benchmark_filename|
|
40
|
+
Open3.popen3("ruby -I ./benchmark #{benchmark_filename}") do |inp, out, err, wait_thr|
|
41
|
+
benchmark_name = File.basename(benchmark_filename)
|
42
|
+
out_str = out.read
|
43
|
+
err_str = err.read
|
44
|
+
|
45
|
+
benchmark_infos = "-------------------\n#{benchmark_name}:\n#{out_str}\n"
|
46
|
+
benchmark_infos_to_file = benchmark_infos
|
47
|
+
puts benchmark_infos
|
48
|
+
|
49
|
+
if err_str && !err_str.empty?
|
50
|
+
STDERR.puts(err_str)
|
51
|
+
benchmark_infos_to_file = benchmark_infos + "\n!!!\nError:\n#{err_str}\n"
|
52
|
+
end
|
53
|
+
|
54
|
+
# add info about git commit (if everything is commited, otherwise to commit one should use special option -c)
|
55
|
+
f.puts benchmark_infos_to_file
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
task :show do
|
61
|
+
puts File.read('benchmark/benchmark.log')
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
task :benchmark => 'benchmark:run'
|
data/TODO.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Absolutely necessary:
|
2
|
+
Fix matrix loading so that matrices can be obtained not only from files but also from stdin
|
3
|
+
Make it possible to load collections in preprocess_collection from a single file (and from stdin, of course)
|
4
|
+
Make it possible to load PCM files (they should first be converted to PWMs in a standardized way) -- maybe it's better to use a pipeline
|
5
|
+
|
6
|
+
Specs and tests:
|
7
|
+
create spec on use of MaxHashSize, MaxHashSizeDouble
|
8
|
+
create spec for testing case when {real_pvalue == 0, threshold == best_score + 1}
|
9
|
+
create test for getting PWMs from stdin
|
10
|
+
create test for nonuniform word-wise background([1,1,1,1]) and for different backgrounds
|
11
|
+
|
12
|
+
Ideas to increase performance:
|
13
|
+
- Add shifting of matrix elements to zero after discretization - in that case the worst suffix is zero at all positions
|
14
|
+
- (?) Rearrange the rows of the aligned pair of matrices in order of decreasing DIC before counting
|
15
|
+
- Create a Java extension for alignment_intersection methods in order to increase performance
|
16
|
+
- Possibly the algorithm shouldn't use a hash but should instead make two passes: first it determines the possible hash scores for every length of each PWM separately (if the worst suffix is always zero, it's a flat space of scores at all PWM prefix lengths). After that we can work with arrays which use such scores as indices via an additional substructure
|
17
|
+
|
18
|
+
Usability issues:
|
19
|
+
Review the Collection class. Right now it's completely useless. Maybe it should even be in another gem (with blackjack and clustering)
|
20
|
+
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'benchmark'
|
2
|
+
|
3
|
+
$:.unshift File.join(File.dirname(__FILE__),'../lib')
|
4
|
+
require 'macroape'
|
5
|
+
|
6
|
+
class TaskToBenchmark
|
7
|
+
def setup
|
8
|
+
@matrix_first = "KLF4_f2.xml
|
9
|
+
0.30861857265872605 -2.254321000121579 0.13505703522674192 0.3285194224375633
|
10
|
+
-1.227018967707036 -4.814127713368663 1.3059890687390967 -4.908681463544344
|
11
|
+
-2.443469374521196 -4.648238485031404 1.3588686548279805 -4.441801801188402
|
12
|
+
-2.7177827948276123 -3.8073538975356565 1.356272809724262 -3.504104725510225
|
13
|
+
-0.5563232977367343 0.5340697765121405 -3.61417723090579 0.5270259776377405
|
14
|
+
-1.8687622060887386 -4.381483976582316 1.337932245336098 -3.815629658877517
|
15
|
+
-2.045671123823928 -2.384975142213679 0.7198551207724355 0.5449254135616948
|
16
|
+
-1.373157530374372 -3.0063112097748217 1.285188335493552 -2.5026044231773543
|
17
|
+
-2.1030513122772208 -1.8941348100402244 1.249265758393991 -1.4284210948906104
|
18
|
+
-1.3277128628152939 0.8982415633049462 -0.8080773665408135 -0.18161647647456935
|
19
|
+
"
|
20
|
+
|
21
|
+
@matrix_second = "> SP1_f1
|
22
|
+
-0.24435707885585334 -0.6748234046937317 0.8657012535789861 -1.1060188862599292
|
23
|
+
-1.0631255752097801 -2.1119259694238686 1.0960627561110399 -0.6138563775211981
|
24
|
+
-0.387227623476054 -2.973985191321805 1.1807800242010371 -4.338927525031567
|
25
|
+
-4.563896055436894 -2.916163300253228 1.3684371349982631 -5.077972423609655
|
26
|
+
-2.2369752892820087 -3.719643631330185 1.3510439136452728 -4.8899306705082335
|
27
|
+
-0.07473964149330914 0.9449196547620103 -2.624685764808605 -0.851098348782244
|
28
|
+
-1.9643526491643326 -2.9784027708801153 1.3113096718240569 -2.3243342594990253
|
29
|
+
-4.015548413965584 -3.138426807809667 1.338748858978805 -2.0846739035376483
|
30
|
+
-0.4450938582835542 -2.2510053061629707 1.126543157436868 -1.7780413702431377
|
31
|
+
-1.1896356092245055 -1.2251832285630033 1.163676006374752 -1.6080243648157357
|
32
|
+
-0.5166047365590577 0.7641033353626651 -0.28626775700282125 -0.6825482097865606"
|
33
|
+
|
34
|
+
@pvalue = 0.0005
|
35
|
+
@discretization = 10
|
36
|
+
@first_background, @second_background = [1,1,1,1], [1,1,1,1]
|
37
|
+
|
38
|
+
@pwm_first = Bioinform::PWM.new(@matrix_first).background(@first_background).discrete(@discretization)
|
39
|
+
@pwm_second = Bioinform::PWM.new(@matrix_second).background(@second_background).discrete(@discretization)
|
40
|
+
@cmp = Macroape::PWMCompare.new(@pwm_first, @pwm_second)
|
41
|
+
self
|
42
|
+
end
|
43
|
+
|
44
|
+
def run
|
45
|
+
first_threshold = @pwm_first.threshold(@pvalue)
|
46
|
+
second_threshold = @pwm_second.threshold(@pvalue)
|
47
|
+
info = @cmp.jaccard(first_threshold, second_threshold)
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
benchmark_result = 10.times.collect do
|
52
|
+
task_to_benchmark = TaskToBenchmark.new.setup
|
53
|
+
Benchmark.measure{ task_to_benchmark.run }
|
54
|
+
end.inject(&:+)
|
55
|
+
|
56
|
+
puts benchmark_result
|
data/lib/macroape.rb
CHANGED
@@ -1,136 +1,63 @@
|
|
1
1
|
module Macroape
|
2
2
|
class PWMCompareAligned
|
3
|
-
|
3
|
+
|
4
|
+
# unoptimized version of this and related methods
|
4
5
|
def counts_for_two_matrices(threshold_first, threshold_second)
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
6
|
+
# just not to call method each time
|
7
|
+
first_background = first.background
|
8
|
+
second_background = second.background
|
9
|
+
unless first_background == second_background
|
10
|
+
first_result = get_counts(threshold_first, threshold_second) {|score,letter| first_background[letter] * score }
|
11
|
+
second_result = get_counts(threshold_first, threshold_second) {|score,letter| second_background[letter] * score }
|
12
|
+
return [first_result, second_result]
|
13
|
+
end
|
14
|
+
if first.background == [1,1,1,1]
|
15
|
+
result = get_counts(threshold_first, threshold_second) {|score,letter| score}
|
16
|
+
[result, result]
|
11
17
|
else
|
12
|
-
|
18
|
+
result = get_counts(threshold_first, threshold_second) {|score,letter| first_background[letter] * score }
|
19
|
+
[result, result]
|
13
20
|
end
|
14
21
|
end
|
15
22
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
scores.each do |score_first, second_scores|
|
30
|
-
second_scores.each do |score_second, count|
|
31
|
-
4.times do |letter|
|
32
|
-
new_score_first = score_first + first.matrix[column][letter]
|
33
|
-
if new_score_first >= already_enough_first
|
34
|
-
new_score_second = score_second + second.matrix[column][letter]
|
35
|
-
if new_score_second >= already_enough_second
|
36
|
-
result_first += count[0] * first.background[letter] * ending_weight_first
|
37
|
-
result_second += count[1] * second.background[letter] * ending_weight_second
|
38
|
-
elsif new_score_second >= least_sufficient_second
|
39
|
-
new_scores[new_score_first][new_score_second][0] += count[0] * first.background[letter]
|
40
|
-
new_scores[new_score_first][new_score_second][1] += count[1] * second.background[letter]
|
41
|
-
end
|
42
|
-
elsif new_score_first >= least_sufficient_first
|
43
|
-
new_score_second = score_second + second.matrix[column][letter]
|
44
|
-
if new_score_second >= least_sufficient_second
|
45
|
-
new_scores[new_score_first][new_score_second][0] += count[0] * first.background[letter]
|
46
|
-
new_scores[new_score_first][new_score_second][1] += count[1] * second.background[letter]
|
47
|
-
end
|
48
|
-
end
|
49
|
-
end
|
50
|
-
end
|
23
|
+
|
24
|
+
# block has form: {|score,letter| contribution to count by `letter` with `score` }
|
25
|
+
def get_counts(threshold_first, threshold_second, &count_contribution_block)
|
26
|
+
# scores_on_first_pwm, scores_on_second_pwm --> count
|
27
|
+
scores = { 0 => {0 => 1} }
|
28
|
+
length.times do |column|
|
29
|
+
new_scores = recalc_score_hash(scores,
|
30
|
+
@first.matrix[column], @second.matrix[column],
|
31
|
+
threshold_first - first.best_suffix(column + 1),
|
32
|
+
threshold_second - second.best_suffix(column + 1), &count_contribution_block)
|
33
|
+
scores.replace(new_scores)
|
34
|
+
if defined?(MaxHashSizeDouble) && scores.inject(0){|sum,hsh|sum + hsh.size} > MaxHashSizeDouble
|
35
|
+
raise 'Hash overflow in Macroape::AlignedPairIntersection#counts_for_two_matrices_with_different_probabilities'
|
51
36
|
end
|
52
|
-
raise 'Hash overflow in Macroape::AlignedPairIntersection#counts_for_two_matrices_with_different_probabilities' if new_scores.inject(0){|sum,hsh|sum+hsh.size} > MaxHashSizeDouble
|
53
|
-
scores = new_scores
|
54
37
|
end
|
55
|
-
|
38
|
+
scores.inject(0.0){|sum,(score_first, hsh)| sum + hsh.inject(0.0){|sum,(score_second, count)| sum + count }}
|
56
39
|
end
|
57
|
-
|
58
|
-
def counts_for_two_matrices_with_same_probabilities(threshold_first, threshold_second)
|
59
|
-
scores = { 0 => {0 => 1} } # scores_on_first_pwm, scores_on_second_pwm --> count_on_first_probabilities, count_on_second_probabilities
|
60
|
-
result = 0.0
|
61
|
-
background = first.background
|
62
|
-
length.times do |column|
|
63
|
-
ending_weight = first.background_sum ** (length - column - 1)
|
64
|
-
already_enough_first = threshold_first - first.worst_suffix[column + 1]
|
65
|
-
already_enough_second = threshold_second - second.worst_suffix[column + 1]
|
66
|
-
least_sufficient_first = threshold_first - first.best_suffix[column + 1]
|
67
|
-
least_sufficient_second = threshold_second - second.best_suffix[column + 1]
|
68
40
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
if new_score_first >= already_enough_first
|
75
|
-
new_score_second = score_second + second.matrix[column][letter]
|
76
|
-
if new_score_second >= already_enough_second
|
77
|
-
result += count * background[letter] * ending_weight
|
78
|
-
elsif new_score_second >= least_sufficient_second
|
79
|
-
new_scores[new_score_first][new_score_second] += count * background[letter]
|
80
|
-
end
|
81
|
-
elsif new_score_first >= least_sufficient_first
|
82
|
-
new_score_second = score_second + second.matrix[column][letter]
|
83
|
-
if new_score_second >= least_sufficient_second
|
84
|
-
new_scores[new_score_first][new_score_second] += count * background[letter]
|
85
|
-
end
|
86
|
-
end
|
87
|
-
end
|
88
|
-
end
|
89
|
-
end
|
90
|
-
raise 'Hash overflow in Macroape::AlignedPairIntersection#counts_for_two_matrices_with_same_probabilities' if new_scores.inject(0){|sum,hsh|sum+hsh.size} > MaxHashSizeDouble
|
91
|
-
scores = new_scores
|
92
|
-
end
|
93
|
-
[result, result]
|
94
|
-
end
|
95
|
-
|
96
|
-
|
97
|
-
def common_words_for_two_matrices(threshold_first, threshold_second)
|
98
|
-
scores = { 0 => {0 => 1} } # scores_on_first_pwm, scores_on_second_pwm --> count_on_first_probabilities, count_on_second_probabilities
|
99
|
-
result = 0
|
100
|
-
length.times do |column|
|
101
|
-
ending_weight = 4 ** (length - column - 1)
|
102
|
-
already_enough_first = threshold_first - first.worst_suffix[column + 1]
|
103
|
-
already_enough_second = threshold_second - second.worst_suffix[column + 1]
|
104
|
-
least_sufficient_first = threshold_first - first.best_suffix[column + 1]
|
105
|
-
least_sufficient_second = threshold_second - second.best_suffix[column + 1]
|
41
|
+
# wouldn't work without count_contribution_block
|
42
|
+
def recalc_score_hash(scores, first_column, second_column, least_sufficient_first, least_sufficient_second)
|
43
|
+
new_scores = Hash.new{|h,k| h[k] = Hash.new(0)}
|
44
|
+
scores.each do |score_first, second_scores|
|
45
|
+
second_scores.each do |score_second, count|
|
106
46
|
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
new_score_second = score_second + second.matrix[column][letter]
|
114
|
-
if new_score_second >= already_enough_second
|
115
|
-
result += count * ending_weight
|
116
|
-
elsif new_score_second >= least_sufficient_second
|
117
|
-
new_scores[new_score_first][new_score_second] += count
|
118
|
-
end
|
119
|
-
elsif new_score_first >= least_sufficient_first
|
120
|
-
new_score_second = score_second + second.matrix[column][letter]
|
121
|
-
if new_score_second >= least_sufficient_second
|
122
|
-
new_scores[new_score_first][new_score_second] += count
|
123
|
-
end
|
47
|
+
4.times do |letter|
|
48
|
+
new_score_first = score_first + first_column[letter]
|
49
|
+
if new_score_first >= least_sufficient_first
|
50
|
+
new_score_second = score_second + second_column[letter]
|
51
|
+
if new_score_second >= least_sufficient_second
|
52
|
+
new_scores[new_score_first][new_score_second] += yield(count, letter)
|
124
53
|
end
|
125
54
|
end
|
126
55
|
end
|
56
|
+
|
127
57
|
end
|
128
|
-
|
129
|
-
raise 'Hash overflow in Macroape::AlignedPairIntersection#common_words_for_two_matrices' if defined? MaxHashSizeDouble and new_scores.inject(0){|sum,hsh|sum+hsh.size} > MaxHashSizeDouble
|
130
|
-
scores = new_scores
|
131
58
|
end
|
132
|
-
|
59
|
+
new_scores
|
133
60
|
end
|
134
|
-
|
61
|
+
|
135
62
|
end
|
136
63
|
end
|
data/lib/macroape/collection.rb
CHANGED
@@ -11,10 +11,10 @@ module Macroape
|
|
11
11
|
@infos[pwm.name] = info
|
12
12
|
end
|
13
13
|
def ==(other)
|
14
|
-
@rough_discretization == other.rough_discretization &&
|
15
|
-
@precise_discretization == other.precise_discretization &&
|
16
|
-
@background == other.background &&
|
17
|
-
@pvalues == other.pvalues &&
|
14
|
+
@rough_discretization == other.rough_discretization &&
|
15
|
+
@precise_discretization == other.precise_discretization &&
|
16
|
+
@background == other.background &&
|
17
|
+
@pvalues == other.pvalues &&
|
18
18
|
@pwms == other.pwms &&
|
19
19
|
@infos == other.infos
|
20
20
|
end
|
@@ -3,7 +3,7 @@ module Bioinform
|
|
3
3
|
def threshold(pvalue)
|
4
4
|
thresholds(pvalue){|_, thresh, _| return thresh }
|
5
5
|
end
|
6
|
-
|
6
|
+
|
7
7
|
def thresholds(*pvalues)
|
8
8
|
thresholds_by_pvalues(*pvalues).each do |pvalue,(thresholds, counts)|
|
9
9
|
threshold = thresholds.begin + 0.1 * (thresholds.end - thresholds.begin)
|
@@ -11,31 +11,30 @@ module Bioinform
|
|
11
11
|
yield pvalue, threshold, real_pvalue
|
12
12
|
end
|
13
13
|
end
|
14
|
-
|
14
|
+
|
15
15
|
def count_distribution_under_pvalue(max_pvalue)
|
16
|
-
|
16
|
+
cnt_distribution = {}
|
17
17
|
look_for_count = max_pvalue * vocabulary_volume
|
18
|
-
until
|
19
|
-
|
18
|
+
until cnt_distribution.inject(0.0){|sum,(score,count)| sum + count} >= look_for_count
|
19
|
+
cnt_distribution = count_distribution_after_threshold(threshold_gauss_estimation(max_pvalue))
|
20
20
|
max_pvalue *=2 # if estimation counted too small amount of words - try to lower threshold estimation by doubling pvalue
|
21
21
|
end
|
22
|
-
|
23
|
-
|
22
|
+
|
23
|
+
cnt_distribution
|
24
24
|
end
|
25
|
-
|
26
|
-
|
25
|
+
|
26
|
+
|
27
27
|
# ret-value: hash {pvalue => [thresholds, counts]}
|
28
28
|
# thresholds = left_threshold .. right_threshold (left_threshold < right_threshold)
|
29
29
|
# counts = left_count .. right_count (left_count > right_count)
|
30
30
|
def thresholds_by_pvalues(*pvalues)
|
31
|
-
|
32
|
-
sorted_scores = count_distribution.sort.reverse
|
31
|
+
sorted_scores = count_distribution_under_pvalue(pvalues.max).sort.reverse
|
33
32
|
scores = sorted_scores.map{|score,count| score}
|
34
33
|
counts = sorted_scores.map{|score,count| count}
|
35
34
|
partial_sums = counts.partial_sums
|
36
|
-
|
35
|
+
|
37
36
|
results = {}
|
38
|
-
|
37
|
+
|
39
38
|
pvalue_counts = pvalues.sort.collect_hash{|pvalue| [pvalue, pvalue * vocabulary_volume] }
|
40
39
|
pvalue_counts.map do |pvalue,look_for_count|
|
41
40
|
ind = partial_sums.index{|sum| sum >= look_for_count}
|
@@ -46,18 +45,19 @@ module Bioinform
|
|
46
45
|
|
47
46
|
results
|
48
47
|
end
|
49
|
-
|
48
|
+
|
50
49
|
def count_distribution_after_threshold(threshold)
|
50
|
+
return @count_distribution.select{|score, count| score >= threshold} if @count_distribution
|
51
51
|
scores = { 0 => 1 }
|
52
52
|
length.times do |column|
|
53
|
-
scores.replace recalc_score_hash(scores, @matrix[column], threshold - best_suffix
|
53
|
+
scores.replace recalc_score_hash(scores, @matrix[column], threshold - best_suffix(column + 1))
|
54
54
|
raise 'Hash overflow in PWM::ThresholdByPvalue#count_distribution_after_threshold' if defined? MaxHashSizeSingle and scores.size > MaxHashSizeSingle
|
55
55
|
end
|
56
56
|
scores
|
57
57
|
end
|
58
|
-
|
58
|
+
|
59
59
|
def count_distribution
|
60
|
-
count_distribution_after_threshold(worst_score)
|
60
|
+
@count_distribution ||= count_distribution_after_threshold(worst_score)
|
61
61
|
end
|
62
62
|
|
63
63
|
def recalc_score_hash(scores, column, least_sufficient)
|
@@ -72,6 +72,16 @@ module Bioinform
|
|
72
72
|
end
|
73
73
|
new_scores
|
74
74
|
end
|
75
|
-
|
75
|
+
|
76
|
+
def counts_by_thresholds(*thresholds)
|
77
|
+
scores = count_distribution_after_threshold(thresholds.min)
|
78
|
+
thresholds.map{ |threshold|
|
79
|
+
scores.inject(0.0){|sum,(score,count)| (score >= threshold) ? sum + count : sum}
|
80
|
+
}
|
81
|
+
end
|
82
|
+
|
83
|
+
def pvalue_by_threshold(threshold)
|
84
|
+
counts_by_thresholds(threshold).first / vocabulary_volume
|
85
|
+
end
|
76
86
|
end
|
77
87
|
end
|