macroape 4.0.2 → 4.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (74) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +17 -17
  3. data/Gemfile +4 -4
  4. data/LICENSE +22 -22
  5. data/README.md +70 -70
  6. data/Rakefile.rb +49 -49
  7. data/TODO.txt +46 -46
  8. data/benchmark/benchmark_helper.rb +4 -4
  9. data/benchmark/similarity_benchmark.rb +52 -52
  10. data/bin/align_motifs +4 -4
  11. data/bin/eval_alignment +4 -4
  12. data/bin/eval_similarity +4 -4
  13. data/bin/find_pvalue +4 -4
  14. data/bin/find_threshold +4 -4
  15. data/bin/preprocess_collection +4 -4
  16. data/bin/scan_collection +4 -4
  17. data/lib/macroape.rb +14 -11
  18. data/lib/macroape/aligned_pair_intersection.rb +61 -62
  19. data/lib/macroape/cli.rb +191 -188
  20. data/lib/macroape/cli/align_motifs.rb +120 -100
  21. data/lib/macroape/cli/eval_alignment.rb +157 -156
  22. data/lib/macroape/cli/eval_similarity.rb +138 -137
  23. data/lib/macroape/cli/find_pvalue.rb +93 -87
  24. data/lib/macroape/cli/find_threshold.rb +103 -96
  25. data/lib/macroape/cli/preprocess_collection.rb +169 -161
  26. data/lib/macroape/cli/scan_collection.rb +171 -163
  27. data/lib/macroape/collection.rb +29 -0
  28. data/lib/macroape/motif_with_thresholds.rb +18 -0
  29. data/lib/macroape/pwm_compare.rb +39 -44
  30. data/lib/macroape/pwm_compare_aligned.rb +139 -130
  31. data/lib/macroape/{counting.rb → pwm_counting.rb} +175 -121
  32. data/lib/macroape/support/inverf.rb +13 -0
  33. data/lib/macroape/support/partial_sums.rb +17 -0
  34. data/lib/macroape/version.rb +4 -4
  35. data/macroape.gemspec +19 -19
  36. data/spec/count_distribution_spec.rb +112 -109
  37. data/spec/inverf_spec.rb +23 -0
  38. data/spec/partial_sums_spec.rb +28 -0
  39. data/spec/spec_helper.rb +11 -11
  40. data/test/align_motifs_test.rb +42 -43
  41. data/test/data/AHR_si.pwm +10 -10
  42. data/test/data/KLF3_f1.pcm +16 -16
  43. data/test/data/KLF3_f1.pwm +16 -16
  44. data/test/data/KLF4_f2.pcm +11 -11
  45. data/test/data/KLF4_f2.pwm +11 -11
  46. data/test/data/KLF4_f2_scan_results_all.txt +2 -2
  47. data/test/data/KLF4_f2_scan_results_default_cutoff.txt +1 -1
  48. data/test/data/KLF4_f2_scan_results_precise_mode.txt +2 -2
  49. data/test/data/SP1_f1.pcm +12 -12
  50. data/test/data/SP1_f1.pwm +12 -12
  51. data/test/data/SP1_f1_revcomp.pcm +12 -12
  52. data/test/data/SP1_f1_revcomp.pwm +12 -12
  53. data/test/data/medium_motif.pwm +8 -8
  54. data/test/data/short_motif.pwm +7 -7
  55. data/test/data/test_collection.yaml +231 -214
  56. data/test/data/test_collection/GABPA_f1.pwm +14 -14
  57. data/test/data/test_collection/KLF4_f2.pwm +10 -10
  58. data/test/data/test_collection/SP1_f1.pwm +12 -12
  59. data/test/data/test_collection_pcm/GABPA_f1.pcm +14 -14
  60. data/test/data/test_collection_pcm/KLF4_f2.pcm +11 -11
  61. data/test/data/test_collection_pcm/SP1_f1.pcm +12 -12
  62. data/test/data/test_collection_single_file.txt +38 -38
  63. data/test/data/test_collection_single_file_pcm.txt +37 -37
  64. data/test/data/test_collection_weak.yaml +231 -214
  65. data/test/eval_alignment_test.rb +90 -111
  66. data/test/eval_similarity_test.rb +105 -123
  67. data/test/find_pvalue_test.rb +34 -39
  68. data/test/find_threshold_test.rb +87 -91
  69. data/test/preprocess_collection_test.rb +56 -65
  70. data/test/scan_collection_test.rb +42 -48
  71. data/test/test_helper.rb +159 -160
  72. metadata +14 -10
  73. data/test/data/collection_pcm_without_thresholds.yaml +0 -188
  74. data/test/data/collection_without_thresholds.yaml +0 -188
@@ -1,130 +1,139 @@
1
- require 'bioinform/support/parameters'
2
- require_relative 'aligned_pair_intersection'
3
-
4
- module Macroape
5
- class PWMCompareAligned
6
- include Bioinform::Parameters
7
- # sets or gets limit of summary size of calculation hash. It's a defence against overuse CPU resources by non-appropriate data
8
- make_parameters :max_pair_hash_size
9
-
10
- attr_reader :first, :second, :length, :shift, :orientation, :first_length, :second_length, :parameters
11
-
12
- def initialize(first_unaligned, second_unaligned, shift, orientation)
13
- @parameters = OpenStruct.new
14
- @shift, @orientation = shift, orientation
15
-
16
- @first_length, @second_length = first_unaligned.length, second_unaligned.length
17
- @length = self.class.calculate_alignment_length(@first_length, @second_length, @shift)
18
-
19
- first, second = first_unaligned, second_unaligned
20
- second = second.reverse_complement if revcomp?
21
-
22
- if shift > 0
23
- second = second.left_augment(shift)
24
- else
25
- first = first.left_augment(-shift)
26
- end
27
-
28
- @first = first.right_augment(@length - first.length)
29
- @second = second.right_augment(@length - second.length)
30
- end
31
-
32
- def direct?
33
- orientation == :direct
34
- end
35
- def revcomp?
36
- orientation == :revcomp
37
- end
38
-
39
- def overlap
40
- length.times.count{|pos| first_overlaps?(pos) && second_overlaps?(pos) }
41
- end
42
-
43
- def first_pwm_alignment
44
- length.times.map do |pos|
45
- if first_overlaps?(pos)
46
- '>'
47
- else
48
- '.'
49
- end
50
- end.join
51
- end
52
-
53
- def second_pwm_alignment
54
- length.times.map do |pos|
55
- if second_overlaps?(pos)
56
- direct? ? '>' : '<'
57
- else
58
- '.'
59
- end
60
- end.join
61
- end
62
-
63
- def alignment_infos
64
- {shift: shift,
65
- orientation: orientation,
66
- text: "#{first_pwm_alignment}\n#{second_pwm_alignment}",
67
- overlap: overlap,
68
- alignment_length: length}
69
- end
70
-
71
- # whether first matrix overlap specified position of alignment
72
- def first_overlaps?(pos)
73
- return false unless pos >= 0 && pos < length
74
- if shift > 0
75
- pos < first_length
76
- else
77
- pos >= -shift && pos < -shift + first_length
78
- end
79
- end
80
-
81
- def second_overlaps?(pos)
82
- return false unless pos >= 0 && pos < length
83
- if shift > 0
84
- pos >= shift && pos < shift + second_length
85
- else
86
- pos < second_length
87
- end
88
- end
89
-
90
- def jaccard(first_threshold, second_threshold)
91
- f = first.count_by_threshold(first_threshold)
92
- s = second.count_by_threshold(second_threshold)
93
- if f == 0 || s == 0
94
- return {similarity: -1, tanimoto: -1, recognized_by_both: 0,
95
- recognized_by_first: f,
96
- recognized_by_second: s,
97
- }
98
- end
99
-
100
- intersect = counts_for_two_matrices(first_threshold, second_threshold)
101
- intersect = Math.sqrt(intersect[0] * intersect[1])
102
- union = f + s - intersect
103
- similarity = intersect.to_f / union
104
- { similarity: similarity, tanimoto: 1.0 - similarity, recognized_by_both: intersect,
105
- recognized_by_first: f, recognized_by_second: s,
106
- real_pvalue_first: f / first.vocabulary_volume, real_pvalue_second: s / second.vocabulary_volume }
107
- end
108
-
109
- def jaccard_by_pvalue(pvalue)
110
- threshold_first = first.threshold(pvalue)
111
- threshold_second = second.threshold(pvalue)
112
- jaccard(threshold_first, threshold_second)
113
- end
114
-
115
- def jaccard_by_weak_pvalue(pvalue)
116
- threshold_first = first.weak_threshold(pvalue)
117
- threshold_second = second.weak_threshold(pvalue)
118
- jaccard(threshold_first, threshold_second)
119
- end
120
-
121
- def self.calculate_alignment_length(first_len, second_len, shift)
122
- if shift > 0
123
- [first_len, second_len + shift].max
124
- else
125
- [first_len - shift, second_len].max
126
- end
127
- end
128
- end
129
-
130
- end
1
+ require_relative 'aligned_pair_intersection'
2
+
3
+ module Macroape
4
+ class PWMCounting
5
+ def left_augmented(n)
6
+ PWMCounting.new(pwm.left_augmented(n), background: background, max_hash_size: max_hash_size)
7
+ end
8
+ def right_augmented(n)
9
+ PWMCounting.new(pwm.right_augmented(n), background: background, max_hash_size: max_hash_size)
10
+ end
11
+ def reverse_complemented
12
+ PWMCounting.new(pwm.reverse_complemented, background: background, max_hash_size: max_hash_size)
13
+ end
14
+ end
15
+
16
+ class PWMCompareAligned
17
+ # sets or gets limit of summary size of calculation hash. It's a defence against overuse CPU resources by non-appropriate data
18
+ attr_accessor :max_pair_hash_size
19
+
20
+ attr_reader :first, :second, :length, :shift, :orientation, :first_length, :second_length
21
+
22
+ # first_unaligned and second_unaligned - PWMCounting objects, not PWMs
23
+ def initialize(first_unaligned, second_unaligned, shift, orientation)
24
+ @shift, @orientation = shift, orientation
25
+
26
+ @first_length, @second_length = first_unaligned.length, second_unaligned.length
27
+ @length = self.class.calculate_alignment_length(@first_length, @second_length, @shift)
28
+
29
+ first, second = first_unaligned, second_unaligned
30
+ second = second.reverse_complemented if revcomp?
31
+
32
+ if shift > 0
33
+ second = second.left_augmented(shift)
34
+ else
35
+ first = first.left_augmented(-shift)
36
+ end
37
+
38
+ @first = first.right_augmented(@length - first.length)
39
+ @second = second.right_augmented(@length - second.length)
40
+ end
41
+
42
+ def direct?
43
+ orientation == :direct
44
+ end
45
+ def revcomp?
46
+ orientation == :revcomp
47
+ end
48
+
49
+ def overlap
50
+ length.times.count{|pos| first_overlaps?(pos) && second_overlaps?(pos) }
51
+ end
52
+
53
+ def first_pwm_alignment
54
+ length.times.map do |pos|
55
+ if first_overlaps?(pos)
56
+ '>'
57
+ else
58
+ '.'
59
+ end
60
+ end.join
61
+ end
62
+
63
+ def second_pwm_alignment
64
+ length.times.map do |pos|
65
+ if second_overlaps?(pos)
66
+ direct? ? '>' : '<'
67
+ else
68
+ '.'
69
+ end
70
+ end.join
71
+ end
72
+
73
+ def alignment_infos
74
+ {shift: shift,
75
+ orientation: orientation,
76
+ text: "#{first_pwm_alignment}\n#{second_pwm_alignment}",
77
+ overlap: overlap,
78
+ alignment_length: length}
79
+ end
80
+
81
+ # whether first matrix overlap specified position of alignment
82
+ def first_overlaps?(pos)
83
+ return false unless pos >= 0 && pos < length
84
+ if shift > 0
85
+ pos < first_length
86
+ else
87
+ pos >= -shift && pos < -shift + first_length
88
+ end
89
+ end
90
+
91
+ def second_overlaps?(pos)
92
+ return false unless pos >= 0 && pos < length
93
+ if shift > 0
94
+ pos >= shift && pos < shift + second_length
95
+ else
96
+ pos < second_length
97
+ end
98
+ end
99
+
100
+ def jaccard(first_threshold, second_threshold)
101
+ f = first.count_by_threshold(first_threshold)
102
+ s = second.count_by_threshold(second_threshold)
103
+ if f == 0 || s == 0
104
+ return {similarity: -1, tanimoto: -1, recognized_by_both: 0,
105
+ recognized_by_first: f,
106
+ recognized_by_second: s,
107
+ }
108
+ end
109
+
110
+ intersect = counts_for_two_matrices(first_threshold, second_threshold)
111
+ intersect = Math.sqrt(intersect[0] * intersect[1])
112
+ union = f + s - intersect
113
+ similarity = intersect.to_f / union
114
+ { similarity: similarity, tanimoto: 1.0 - similarity, recognized_by_both: intersect,
115
+ recognized_by_first: f, recognized_by_second: s,
116
+ real_pvalue_first: f / first.vocabulary_volume, real_pvalue_second: s / second.vocabulary_volume }
117
+ end
118
+
119
+ def jaccard_by_pvalue(pvalue)
120
+ threshold_first = first.threshold(pvalue)
121
+ threshold_second = second.threshold(pvalue)
122
+ jaccard(threshold_first, threshold_second)
123
+ end
124
+
125
+ def jaccard_by_weak_pvalue(pvalue)
126
+ threshold_first = first.weak_threshold(pvalue)
127
+ threshold_second = second.weak_threshold(pvalue)
128
+ jaccard(threshold_first, threshold_second)
129
+ end
130
+
131
+ def self.calculate_alignment_length(first_len, second_len, shift)
132
+ if shift > 0
133
+ [first_len, second_len + shift].max
134
+ else
135
+ [first_len - shift, second_len].max
136
+ end
137
+ end
138
+ end
139
+ end
@@ -1,121 +1,175 @@
1
- require 'bioinform'
2
-
3
- module Bioinform
4
- class PWM
5
- # sets or gets limit size of calculation hash. It's a defence against overuse CPU resources by non-appropriate data
6
- make_parameters :max_hash_size
7
-
8
- def threshold(pvalue)
9
- thresholds(pvalue){|_, thresh, _| return thresh }
10
- end
11
- def threshold_and_real_pvalue(pvalue)
12
- thresholds(pvalue){|_, thresh, real_pv| return thresh, real_pv }
13
- end
14
- def weak_threshold(pvalue)
15
- weak_thresholds(pvalue){|_, thresh, _| return thresh }
16
- end
17
- def weak_threshold_and_real_pvalue(pvalue)
18
- weak_thresholds(pvalue){|_, thresh, real_pv| return thresh, real_pv }
19
- end
20
-
21
- def thresholds(*pvalues)
22
- thresholds_by_pvalues(*pvalues).each do |pvalue,(thresholds, counts)|
23
- threshold = thresholds.begin + 0.1 * (thresholds.end - thresholds.begin)
24
- real_pvalue = counts.end.to_f / vocabulary_volume
25
- yield pvalue, threshold, real_pvalue
26
- end
27
- end
28
-
29
- # "weak" means that threshold has real pvalue not less than given pvalue, while usual threshold not greater
30
- def weak_thresholds(*pvalues)
31
- thresholds_by_pvalues(*pvalues).each do |pvalue,(thresholds, counts)|
32
- threshold = thresholds.begin.to_f
33
- real_pvalue = counts.begin.to_f / vocabulary_volume
34
- yield pvalue, threshold, real_pvalue
35
- end
36
- end
37
-
38
-
39
- def count_distribution_under_pvalue(max_pvalue)
40
- cnt_distribution = {}
41
- look_for_count = max_pvalue * vocabulary_volume
42
- until cnt_distribution.inject(0.0){|sum,(score,count)| sum + count} >= look_for_count
43
- begin
44
- approximate_threshold = threshold_gauss_estimation(max_pvalue)
45
- rescue
46
- approximate_threshold = worst_score
47
- end
48
- cnt_distribution = count_distribution_after_threshold(approximate_threshold)
49
- max_pvalue *=2 # if estimation counted too small amount of words - try to lower threshold estimation by doubling pvalue
50
- end
51
-
52
- cnt_distribution
53
- end
54
-
55
-
56
- # ret-value: hash {pvalue => [thresholds, counts]}
57
- # thresholds = left_threshold .. right_threshold (left_threshold < right_threshold)
58
- # counts = left_count .. right_count (left_count > right_count)
59
- def thresholds_by_pvalues(*pvalues)
60
- sorted_scores = count_distribution_under_pvalue(pvalues.max).sort.reverse
61
- scores = sorted_scores.map{|score,count| score}
62
- counts = sorted_scores.map{|score,count| count}
63
- partial_sums = counts.partial_sums
64
-
65
- results = {}
66
-
67
- pvalue_counts = pvalues.sort.collect_hash{|pvalue| [pvalue, pvalue * vocabulary_volume] }
68
- pvalue_counts.map do |pvalue,look_for_count|
69
- ind = partial_sums.index{|sum| sum >= look_for_count}
70
- minscore, count_at_minscore = scores[ind], partial_sums[ind]
71
- maxscore, count_at_maxscore = ind > 0 ? [ scores[ind-1], partial_sums[ind-1] ] : [ best_score + 1.0, 0.0 ]
72
- results[pvalue] = [(minscore .. maxscore), (count_at_minscore .. count_at_maxscore)]
73
- end
74
-
75
- results
76
- end
77
-
78
- def count_distribution_after_threshold(threshold)
79
- return @count_distribution.select{|score, count| score >= threshold} if @count_distribution
80
- scores = { 0 => 1 }
81
- length.times do |column|
82
- scores.replace recalc_score_hash(scores, @matrix[column], threshold - best_suffix(column + 1))
83
- raise 'Hash overflow in PWM::ThresholdByPvalue#count_distribution_after_threshold' if max_hash_size && scores.size > max_hash_size
84
- end
85
- scores
86
- end
87
-
88
- def count_distribution
89
- @count_distribution ||= count_distribution_after_threshold(worst_score)
90
- end
91
-
92
- def recalc_score_hash(scores, column, least_sufficient)
93
- new_scores = Hash.new(0)
94
- scores.each do |score, count|
95
- 4.times do |letter|
96
- new_score = score + column[letter]
97
- if new_score >= least_sufficient
98
- new_scores[new_score] += count * background[letter]
99
- end
100
- end
101
- end
102
- new_scores
103
- end
104
-
105
- def counts_by_thresholds(*thresholds)
106
- scores = count_distribution_after_threshold(thresholds.min)
107
- thresholds.inject({}){ |hsh, threshold|
108
- hsh[threshold] = scores.inject(0.0){|sum,(score,count)| (score >= threshold) ? sum + count : sum}
109
- hsh
110
- }
111
- end
112
-
113
- def count_by_threshold(threshold)
114
- counts_by_thresholds(threshold)[threshold]
115
- end
116
-
117
- def pvalue_by_threshold(threshold)
118
- count_by_threshold(threshold) / vocabulary_volume
119
- end
120
- end
121
- end
1
+ require 'bioinform'
2
+ require_relative 'support/inverf'
3
+ require_relative 'support/partial_sums'
4
+
5
+ module Macroape
6
+ class PWMCounting
7
+ attr_accessor :pwm, :max_hash_size, :background
8
+
9
+ def initialize(pwm, background: Bioinform::Background::Wordwise, max_hash_size: nil)
10
+ @pwm = pwm
11
+ @background = background
12
+ @max_hash_size = max_hash_size
13
+ end
14
+
15
+ def matrix
16
+ pwm.matrix
17
+ end
18
+
19
+ def vocabulary_volume
20
+ background.volume ** length
21
+ end
22
+
23
+ def threshold_gauss_estimation(max_pvalue)
24
+ pwm.threshold_gauss_estimation(max_pvalue)
25
+ end
26
+
27
+ def length
28
+ pwm.length
29
+ end
30
+
31
+ def best_score
32
+ best_suffix(0)
33
+ end
34
+
35
+ def worst_score
36
+ worst_suffix(0)
37
+ end
38
+
39
+ # best score of suffix s[i..l]
40
+ def best_suffix(i)
41
+ matrix[i...length].map(&:max).inject(0.0, &:+)
42
+ end
43
+
44
+ def worst_suffix(i)
45
+ matrix[i...length].map(&:min).inject(0.0, &:+)
46
+ end
47
+
48
+ def score_mean
49
+ pwm.each_position.inject(0.0){|mean, position| mean + background.mean(position) }
50
+ end
51
+
52
+ def score_variance
53
+ pwm.each_position.inject(0.0){|variance, position| variance + background.mean_square(position) - background.mean(position) **2 }
54
+ end
55
+
56
+ def threshold_gauss_estimation(pvalue)
57
+ sigma = Math.sqrt(score_variance)
58
+ n_ = Math.inverf(1 - 2 * pvalue) * Math.sqrt(2)
59
+ score_mean + n_ * sigma
60
+ end
61
+
62
+ def threshold(pvalue)
63
+ thresholds(pvalue){|_, thresh, _| return thresh }
64
+ end
65
+ def threshold_and_real_pvalue(pvalue)
66
+ thresholds(pvalue){|_, thresh, real_pv| return thresh, real_pv }
67
+ end
68
+ def weak_threshold(pvalue)
69
+ weak_thresholds(pvalue){|_, thresh, _| return thresh }
70
+ end
71
+ def weak_threshold_and_real_pvalue(pvalue)
72
+ weak_thresholds(pvalue){|_, thresh, real_pv| return thresh, real_pv }
73
+ end
74
+
75
+ def thresholds(*pvalues)
76
+ thresholds_by_pvalues(*pvalues).each do |pvalue,(thresholds, counts)|
77
+ threshold = thresholds.begin + 0.1 * (thresholds.end - thresholds.begin)
78
+ real_pvalue = counts.end.to_f / vocabulary_volume
79
+ yield pvalue, threshold, real_pvalue
80
+ end
81
+ end
82
+
83
+ # "weak" means that threshold has real pvalue not less than given pvalue, while usual threshold not greater
84
+ def weak_thresholds(*pvalues)
85
+ thresholds_by_pvalues(*pvalues).each do |pvalue,(thresholds, counts)|
86
+ threshold = thresholds.begin.to_f
87
+ real_pvalue = counts.begin.to_f / vocabulary_volume
88
+ yield pvalue, threshold, real_pvalue
89
+ end
90
+ end
91
+
92
+
93
+ def count_distribution_under_pvalue(max_pvalue)
94
+ cnt_distribution = {}
95
+ look_for_count = max_pvalue * vocabulary_volume
96
+ until cnt_distribution.inject(0.0){|sum,(score,count)| sum + count} >= look_for_count
97
+ begin
98
+ approximate_threshold = threshold_gauss_estimation(max_pvalue)
99
+ rescue
100
+ approximate_threshold = worst_score
101
+ end
102
+ cnt_distribution = count_distribution_after_threshold(approximate_threshold)
103
+ max_pvalue *=2 # if estimation counted too small amount of words - try to lower threshold estimation by doubling pvalue
104
+ end
105
+
106
+ cnt_distribution
107
+ end
108
+
109
+
110
+ # ret-value: hash {pvalue => [thresholds, counts]}
111
+ # thresholds = left_threshold .. right_threshold (left_threshold < right_threshold)
112
+ # counts = left_count .. right_count (left_count > right_count)
113
+ def thresholds_by_pvalues(*pvalues)
114
+ sorted_scores = count_distribution_under_pvalue(pvalues.max).sort.reverse
115
+ scores = sorted_scores.map{|score,count| score}
116
+ counts = sorted_scores.map{|score,count| count}
117
+ partial_sums = counts.partial_sums
118
+
119
+ results = {}
120
+
121
+ pvalue_counts = pvalues.sort.each_with_object({}){|pvalue, hsh| hsh[pvalue] = pvalue * vocabulary_volume }
122
+ pvalue_counts.map do |pvalue,look_for_count|
123
+ ind = partial_sums.index{|sum| sum >= look_for_count}
124
+ minscore, count_at_minscore = scores[ind], partial_sums[ind]
125
+ maxscore, count_at_maxscore = ind > 0 ? [ scores[ind-1], partial_sums[ind-1] ] : [ best_score + 1.0, 0.0 ]
126
+ results[pvalue] = [(minscore .. maxscore), (count_at_minscore .. count_at_maxscore)]
127
+ end
128
+
129
+ results
130
+ end
131
+
132
+ def count_distribution_after_threshold(threshold)
133
+ return @count_distribution.select{|score, count| score >= threshold} if @count_distribution
134
+ scores = { 0 => 1 }
135
+ length.times do |column|
136
+ scores.replace recalc_score_hash(scores, matrix[column], threshold - best_suffix(column + 1))
137
+ raise 'Hash overflow in PWM::ThresholdByPvalue#count_distribution_after_threshold' if max_hash_size && scores.size > max_hash_size
138
+ end
139
+ scores
140
+ end
141
+
142
+ def count_distribution
143
+ @count_distribution ||= count_distribution_after_threshold(worst_score)
144
+ end
145
+
146
+ def recalc_score_hash(scores, column, least_sufficient)
147
+ new_scores = Hash.new(0)
148
+ scores.each do |score, count|
149
+ 4.times do |letter|
150
+ new_score = score + column[letter]
151
+ if new_score >= least_sufficient
152
+ new_scores[new_score] += count * background.counts[letter]
153
+ end
154
+ end
155
+ end
156
+ new_scores
157
+ end
158
+
159
+ def counts_by_thresholds(*thresholds)
160
+ scores = count_distribution_after_threshold(thresholds.min)
161
+ thresholds.inject({}){ |hsh, threshold|
162
+ hsh[threshold] = scores.inject(0.0){|sum,(score,count)| (score >= threshold) ? sum + count : sum}
163
+ hsh
164
+ }
165
+ end
166
+
167
+ def count_by_threshold(threshold)
168
+ counts_by_thresholds(threshold)[threshold]
169
+ end
170
+
171
+ def pvalue_by_threshold(threshold)
172
+ count_by_threshold(threshold) / vocabulary_volume
173
+ end
174
+ end
175
+ end