macroape 4.0.2 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. checksums.yaml +4 -4
  2. data/.gitignore +17 -17
  3. data/Gemfile +4 -4
  4. data/LICENSE +22 -22
  5. data/README.md +70 -70
  6. data/Rakefile.rb +49 -49
  7. data/TODO.txt +46 -46
  8. data/benchmark/benchmark_helper.rb +4 -4
  9. data/benchmark/similarity_benchmark.rb +52 -52
  10. data/bin/align_motifs +4 -4
  11. data/bin/eval_alignment +4 -4
  12. data/bin/eval_similarity +4 -4
  13. data/bin/find_pvalue +4 -4
  14. data/bin/find_threshold +4 -4
  15. data/bin/preprocess_collection +4 -4
  16. data/bin/scan_collection +4 -4
  17. data/lib/macroape.rb +14 -11
  18. data/lib/macroape/aligned_pair_intersection.rb +61 -62
  19. data/lib/macroape/cli.rb +191 -188
  20. data/lib/macroape/cli/align_motifs.rb +120 -100
  21. data/lib/macroape/cli/eval_alignment.rb +157 -156
  22. data/lib/macroape/cli/eval_similarity.rb +138 -137
  23. data/lib/macroape/cli/find_pvalue.rb +93 -87
  24. data/lib/macroape/cli/find_threshold.rb +103 -96
  25. data/lib/macroape/cli/preprocess_collection.rb +169 -161
  26. data/lib/macroape/cli/scan_collection.rb +171 -163
  27. data/lib/macroape/collection.rb +29 -0
  28. data/lib/macroape/motif_with_thresholds.rb +18 -0
  29. data/lib/macroape/pwm_compare.rb +39 -44
  30. data/lib/macroape/pwm_compare_aligned.rb +139 -130
  31. data/lib/macroape/{counting.rb → pwm_counting.rb} +175 -121
  32. data/lib/macroape/support/inverf.rb +13 -0
  33. data/lib/macroape/support/partial_sums.rb +17 -0
  34. data/lib/macroape/version.rb +4 -4
  35. data/macroape.gemspec +19 -19
  36. data/spec/count_distribution_spec.rb +112 -109
  37. data/spec/inverf_spec.rb +23 -0
  38. data/spec/partial_sums_spec.rb +28 -0
  39. data/spec/spec_helper.rb +11 -11
  40. data/test/align_motifs_test.rb +42 -43
  41. data/test/data/AHR_si.pwm +10 -10
  42. data/test/data/KLF3_f1.pcm +16 -16
  43. data/test/data/KLF3_f1.pwm +16 -16
  44. data/test/data/KLF4_f2.pcm +11 -11
  45. data/test/data/KLF4_f2.pwm +11 -11
  46. data/test/data/KLF4_f2_scan_results_all.txt +2 -2
  47. data/test/data/KLF4_f2_scan_results_default_cutoff.txt +1 -1
  48. data/test/data/KLF4_f2_scan_results_precise_mode.txt +2 -2
  49. data/test/data/SP1_f1.pcm +12 -12
  50. data/test/data/SP1_f1.pwm +12 -12
  51. data/test/data/SP1_f1_revcomp.pcm +12 -12
  52. data/test/data/SP1_f1_revcomp.pwm +12 -12
  53. data/test/data/medium_motif.pwm +8 -8
  54. data/test/data/short_motif.pwm +7 -7
  55. data/test/data/test_collection.yaml +231 -214
  56. data/test/data/test_collection/GABPA_f1.pwm +14 -14
  57. data/test/data/test_collection/KLF4_f2.pwm +10 -10
  58. data/test/data/test_collection/SP1_f1.pwm +12 -12
  59. data/test/data/test_collection_pcm/GABPA_f1.pcm +14 -14
  60. data/test/data/test_collection_pcm/KLF4_f2.pcm +11 -11
  61. data/test/data/test_collection_pcm/SP1_f1.pcm +12 -12
  62. data/test/data/test_collection_single_file.txt +38 -38
  63. data/test/data/test_collection_single_file_pcm.txt +37 -37
  64. data/test/data/test_collection_weak.yaml +231 -214
  65. data/test/eval_alignment_test.rb +90 -111
  66. data/test/eval_similarity_test.rb +105 -123
  67. data/test/find_pvalue_test.rb +34 -39
  68. data/test/find_threshold_test.rb +87 -91
  69. data/test/preprocess_collection_test.rb +56 -65
  70. data/test/scan_collection_test.rb +42 -48
  71. data/test/test_helper.rb +159 -160
  72. metadata +14 -10
  73. data/test/data/collection_pcm_without_thresholds.yaml +0 -188
  74. data/test/data/collection_without_thresholds.yaml +0 -188
data/lib/macroape/pwm_compare_aligned.rb
@@ -1,130 +1,139 @@
- require 'bioinform/support/parameters'
- require_relative 'aligned_pair_intersection'
-
- module Macroape
-   class PWMCompareAligned
-     include Bioinform::Parameters
-     # sets or gets limit of summary size of calculation hash. It's a defence against overuse CPU resources by non-appropriate data
-     make_parameters :max_pair_hash_size
-
-     attr_reader :first, :second, :length, :shift, :orientation, :first_length, :second_length, :parameters
-
-     def initialize(first_unaligned, second_unaligned, shift, orientation)
-       @parameters = OpenStruct.new
-       @shift, @orientation = shift, orientation
-
-       @first_length, @second_length = first_unaligned.length, second_unaligned.length
-       @length = self.class.calculate_alignment_length(@first_length, @second_length, @shift)
-
-       first, second = first_unaligned, second_unaligned
-       second = second.reverse_complement if revcomp?
-
-       if shift > 0
-         second = second.left_augment(shift)
-       else
-         first = first.left_augment(-shift)
-       end
-
-       @first = first.right_augment(@length - first.length)
-       @second = second.right_augment(@length - second.length)
-     end
-
-     def direct?
-       orientation == :direct
-     end
-     def revcomp?
-       orientation == :revcomp
-     end
-
-     def overlap
-       length.times.count{|pos| first_overlaps?(pos) && second_overlaps?(pos) }
-     end
-
-     def first_pwm_alignment
-       length.times.map do |pos|
-         if first_overlaps?(pos)
-           '>'
-         else
-           '.'
-         end
-       end.join
-     end
-
-     def second_pwm_alignment
-       length.times.map do |pos|
-         if second_overlaps?(pos)
-           direct? ? '>' : '<'
-         else
-           '.'
-         end
-       end.join
-     end
-
-     def alignment_infos
-       {shift: shift,
-        orientation: orientation,
-        text: "#{first_pwm_alignment}\n#{second_pwm_alignment}",
-        overlap: overlap,
-        alignment_length: length}
-     end
-
-     # whether first matrix overlap specified position of alignment
-     def first_overlaps?(pos)
-       return false unless pos >= 0 && pos < length
-       if shift > 0
-         pos < first_length
-       else
-         pos >= -shift && pos < -shift + first_length
-       end
-     end
-
-     def second_overlaps?(pos)
-       return false unless pos >= 0 && pos < length
-       if shift > 0
-         pos >= shift && pos < shift + second_length
-       else
-         pos < second_length
-       end
-     end
-
-     def jaccard(first_threshold, second_threshold)
-       f = first.count_by_threshold(first_threshold)
-       s = second.count_by_threshold(second_threshold)
-       if f == 0 || s == 0
-         return {similarity: -1, tanimoto: -1, recognized_by_both: 0,
-                 recognized_by_first: f,
-                 recognized_by_second: s,
-                }
-       end
-
-       intersect = counts_for_two_matrices(first_threshold, second_threshold)
-       intersect = Math.sqrt(intersect[0] * intersect[1])
-       union = f + s - intersect
-       similarity = intersect.to_f / union
-       { similarity: similarity, tanimoto: 1.0 - similarity, recognized_by_both: intersect,
-         recognized_by_first: f, recognized_by_second: s,
-         real_pvalue_first: f / first.vocabulary_volume, real_pvalue_second: s / second.vocabulary_volume }
-     end
-
-     def jaccard_by_pvalue(pvalue)
-       threshold_first = first.threshold(pvalue)
-       threshold_second = second.threshold(pvalue)
-       jaccard(threshold_first, threshold_second)
-     end
-
-     def jaccard_by_weak_pvalue(pvalue)
-       threshold_first = first.weak_threshold(pvalue)
-       threshold_second = second.weak_threshold(pvalue)
-       jaccard(threshold_first, threshold_second)
-     end
-
-     def self.calculate_alignment_length(first_len, second_len, shift)
-       if shift > 0
-         [first_len, second_len + shift].max
-       else
-         [first_len - shift, second_len].max
-       end
-     end
-   end
-
- end
+ require_relative 'aligned_pair_intersection'
+
+ module Macroape
+   class PWMCounting
+     def left_augmented(n)
+       PWMCounting.new(pwm.left_augmented(n), background: background, max_hash_size: max_hash_size)
+     end
+     def right_augmented(n)
+       PWMCounting.new(pwm.right_augmented(n), background: background, max_hash_size: max_hash_size)
+     end
+     def reverse_complemented
+       PWMCounting.new(pwm.reverse_complemented, background: background, max_hash_size: max_hash_size)
+     end
+   end
+
+   class PWMCompareAligned
+     # sets or gets limit of summary size of calculation hash. It's a defence against overuse CPU resources by non-appropriate data
+     attr_accessor :max_pair_hash_size
+
+     attr_reader :first, :second, :length, :shift, :orientation, :first_length, :second_length
+
+     # first_unaligned and second_unaligned - PWMCounting objects, not PWMs
+     def initialize(first_unaligned, second_unaligned, shift, orientation)
+       @shift, @orientation = shift, orientation
+
+       @first_length, @second_length = first_unaligned.length, second_unaligned.length
+       @length = self.class.calculate_alignment_length(@first_length, @second_length, @shift)
+
+       first, second = first_unaligned, second_unaligned
+       second = second.reverse_complemented if revcomp?
+
+       if shift > 0
+         second = second.left_augmented(shift)
+       else
+         first = first.left_augmented(-shift)
+       end
+
+       @first = first.right_augmented(@length - first.length)
+       @second = second.right_augmented(@length - second.length)
+     end
+
+     def direct?
+       orientation == :direct
+     end
+     def revcomp?
+       orientation == :revcomp
+     end
+
+     def overlap
+       length.times.count{|pos| first_overlaps?(pos) && second_overlaps?(pos) }
+     end
+
+     def first_pwm_alignment
+       length.times.map do |pos|
+         if first_overlaps?(pos)
+           '>'
+         else
+           '.'
+         end
+       end.join
+     end
+
+     def second_pwm_alignment
+       length.times.map do |pos|
+         if second_overlaps?(pos)
+           direct? ? '>' : '<'
+         else
+           '.'
+         end
+       end.join
+     end
+
+     def alignment_infos
+       {shift: shift,
+        orientation: orientation,
+        text: "#{first_pwm_alignment}\n#{second_pwm_alignment}",
+        overlap: overlap,
+        alignment_length: length}
+     end
+
+     # whether first matrix overlap specified position of alignment
+     def first_overlaps?(pos)
+       return false unless pos >= 0 && pos < length
+       if shift > 0
+         pos < first_length
+       else
+         pos >= -shift && pos < -shift + first_length
+       end
+     end
+
+     def second_overlaps?(pos)
+       return false unless pos >= 0 && pos < length
+       if shift > 0
+         pos >= shift && pos < shift + second_length
+       else
+         pos < second_length
+       end
+     end
+
+     def jaccard(first_threshold, second_threshold)
+       f = first.count_by_threshold(first_threshold)
+       s = second.count_by_threshold(second_threshold)
+       if f == 0 || s == 0
+         return {similarity: -1, tanimoto: -1, recognized_by_both: 0,
+                 recognized_by_first: f,
+                 recognized_by_second: s,
+                }
+       end
+
+       intersect = counts_for_two_matrices(first_threshold, second_threshold)
+       intersect = Math.sqrt(intersect[0] * intersect[1])
+       union = f + s - intersect
+       similarity = intersect.to_f / union
+       { similarity: similarity, tanimoto: 1.0 - similarity, recognized_by_both: intersect,
+         recognized_by_first: f, recognized_by_second: s,
+         real_pvalue_first: f / first.vocabulary_volume, real_pvalue_second: s / second.vocabulary_volume }
+     end
+
+     def jaccard_by_pvalue(pvalue)
+       threshold_first = first.threshold(pvalue)
+       threshold_second = second.threshold(pvalue)
+       jaccard(threshold_first, threshold_second)
+     end
+
+     def jaccard_by_weak_pvalue(pvalue)
+       threshold_first = first.weak_threshold(pvalue)
+       threshold_second = second.weak_threshold(pvalue)
+       jaccard(threshold_first, threshold_second)
+     end
+
+     def self.calculate_alignment_length(first_len, second_len, shift)
+       if shift > 0
+         [first_len, second_len + shift].max
+       else
+         [first_len - shift, second_len].max
+       end
+     end
+   end
+ end
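
Taken together, this file's changes drop the Bioinform::Parameters mixin in favour of a plain attr_accessor and make PWMCompareAligned operate on PWMCounting wrappers (which carry the background model and hash-size limit) instead of bare PWMs. A minimal usage sketch under those assumptions — first_pwm and second_pwm stand for Bioinform PWM objects parsed elsewhere, and the shift, orientation and p-value are purely illustrative, not values taken from the gem:

    require 'macroape'

    # first_pwm and second_pwm are assumed to be Bioinform PWM objects parsed elsewhere.
    first  = Macroape::PWMCounting.new(first_pwm,  max_hash_size: 10_000_000)
    second = Macroape::PWMCounting.new(second_pwm, max_hash_size: 10_000_000)

    # Compare the two motifs at one fixed alignment: second motif shifted by +1,
    # taken in direct orientation relative to the first.
    cmp  = Macroape::PWMCompareAligned.new(first, second, 1, :direct)
    info = cmp.alignment_infos           # shift, orientation, overlap, alignment text
    sim  = cmp.jaccard_by_pvalue(0.0005) # similarity, tanimoto, recognized_by_both, ...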
data/lib/macroape/{counting.rb → pwm_counting.rb}
@@ -1,121 +1,175 @@
- require 'bioinform'
-
- module Bioinform
-   class PWM
-     # sets or gets limit size of calculation hash. It's a defence against overuse CPU resources by non-appropriate data
-     make_parameters :max_hash_size
-
-     def threshold(pvalue)
-       thresholds(pvalue){|_, thresh, _| return thresh }
-     end
-     def threshold_and_real_pvalue(pvalue)
-       thresholds(pvalue){|_, thresh, real_pv| return thresh, real_pv }
-     end
-     def weak_threshold(pvalue)
-       weak_thresholds(pvalue){|_, thresh, _| return thresh }
-     end
-     def weak_threshold_and_real_pvalue(pvalue)
-       weak_thresholds(pvalue){|_, thresh, real_pv| return thresh, real_pv }
-     end
-
-     def thresholds(*pvalues)
-       thresholds_by_pvalues(*pvalues).each do |pvalue,(thresholds, counts)|
-         threshold = thresholds.begin + 0.1 * (thresholds.end - thresholds.begin)
-         real_pvalue = counts.end.to_f / vocabulary_volume
-         yield pvalue, threshold, real_pvalue
-       end
-     end
-
-     # "weak" means that threshold has real pvalue not less than given pvalue, while usual threshold not greater
-     def weak_thresholds(*pvalues)
-       thresholds_by_pvalues(*pvalues).each do |pvalue,(thresholds, counts)|
-         threshold = thresholds.begin.to_f
-         real_pvalue = counts.begin.to_f / vocabulary_volume
-         yield pvalue, threshold, real_pvalue
-       end
-     end
-
-
-     def count_distribution_under_pvalue(max_pvalue)
-       cnt_distribution = {}
-       look_for_count = max_pvalue * vocabulary_volume
-       until cnt_distribution.inject(0.0){|sum,(score,count)| sum + count} >= look_for_count
-         begin
-           approximate_threshold = threshold_gauss_estimation(max_pvalue)
-         rescue
-           approximate_threshold = worst_score
-         end
-         cnt_distribution = count_distribution_after_threshold(approximate_threshold)
-         max_pvalue *=2 # if estimation counted too small amount of words - try to lower threshold estimation by doubling pvalue
-       end
-
-       cnt_distribution
-     end
-
-
-     # ret-value: hash {pvalue => [thresholds, counts]}
-     # thresholds = left_threshold .. right_threshold (left_threshold < right_threshold)
-     # counts = left_count .. right_count (left_count > right_count)
-     def thresholds_by_pvalues(*pvalues)
-       sorted_scores = count_distribution_under_pvalue(pvalues.max).sort.reverse
-       scores = sorted_scores.map{|score,count| score}
-       counts = sorted_scores.map{|score,count| count}
-       partial_sums = counts.partial_sums
-
-       results = {}
-
-       pvalue_counts = pvalues.sort.collect_hash{|pvalue| [pvalue, pvalue * vocabulary_volume] }
-       pvalue_counts.map do |pvalue,look_for_count|
-         ind = partial_sums.index{|sum| sum >= look_for_count}
-         minscore, count_at_minscore = scores[ind], partial_sums[ind]
-         maxscore, count_at_maxscore = ind > 0 ? [ scores[ind-1], partial_sums[ind-1] ] : [ best_score + 1.0, 0.0 ]
-         results[pvalue] = [(minscore .. maxscore), (count_at_minscore .. count_at_maxscore)]
-       end
-
-       results
-     end
-
-     def count_distribution_after_threshold(threshold)
-       return @count_distribution.select{|score, count| score >= threshold} if @count_distribution
-       scores = { 0 => 1 }
-       length.times do |column|
-         scores.replace recalc_score_hash(scores, @matrix[column], threshold - best_suffix(column + 1))
-         raise 'Hash overflow in PWM::ThresholdByPvalue#count_distribution_after_threshold' if max_hash_size && scores.size > max_hash_size
-       end
-       scores
-     end
-
-     def count_distribution
-       @count_distribution ||= count_distribution_after_threshold(worst_score)
-     end
-
-     def recalc_score_hash(scores, column, least_sufficient)
-       new_scores = Hash.new(0)
-       scores.each do |score, count|
-         4.times do |letter|
-           new_score = score + column[letter]
-           if new_score >= least_sufficient
-             new_scores[new_score] += count * background[letter]
-           end
-         end
-       end
-       new_scores
-     end
-
-     def counts_by_thresholds(*thresholds)
-       scores = count_distribution_after_threshold(thresholds.min)
-       thresholds.inject({}){ |hsh, threshold|
-         hsh[threshold] = scores.inject(0.0){|sum,(score,count)| (score >= threshold) ? sum + count : sum}
-         hsh
-       }
-     end
-
-     def count_by_threshold(threshold)
-       counts_by_thresholds(threshold)[threshold]
-     end
-
-     def pvalue_by_threshold(threshold)
-       count_by_threshold(threshold) / vocabulary_volume
-     end
-   end
- end
+ require 'bioinform'
+ require_relative 'support/inverf'
+ require_relative 'support/partial_sums'
+
+ module Macroape
+   class PWMCounting
+     attr_accessor :pwm, :max_hash_size, :background
+
+     def initialize(pwm, background: Bioinform::Background::Wordwise, max_hash_size: nil)
+       @pwm = pwm
+       @background = background
+       @max_hash_size = max_hash_size
+     end
+
+     def matrix
+       pwm.matrix
+     end
+
+     def vocabulary_volume
+       background.volume ** length
+     end
+
+     def threshold_gauss_estimation(max_pvalue)
+       pwm.threshold_gauss_estimation(max_pvalue)
+     end
+
+     def length
+       pwm.length
+     end
+
+     def best_score
+       best_suffix(0)
+     end
+
+     def worst_score
+       worst_suffix(0)
+     end
+
+     # best score of suffix s[i..l]
+     def best_suffix(i)
+       matrix[i...length].map(&:max).inject(0.0, &:+)
+     end
+
+     def worst_suffix(i)
+       matrix[i...length].map(&:min).inject(0.0, &:+)
+     end
+
+     def score_mean
+       pwm.each_position.inject(0.0){|mean, position| mean + background.mean(position) }
+     end
+
+     def score_variance
+       pwm.each_position.inject(0.0){|variance, position| variance + background.mean_square(position) - background.mean(position) **2 }
+     end
+
+     def threshold_gauss_estimation(pvalue)
+       sigma = Math.sqrt(score_variance)
+       n_ = Math.inverf(1 - 2 * pvalue) * Math.sqrt(2)
+       score_mean + n_ * sigma
+     end
+
+     def threshold(pvalue)
+       thresholds(pvalue){|_, thresh, _| return thresh }
+     end
+     def threshold_and_real_pvalue(pvalue)
+       thresholds(pvalue){|_, thresh, real_pv| return thresh, real_pv }
+     end
+     def weak_threshold(pvalue)
+       weak_thresholds(pvalue){|_, thresh, _| return thresh }
+     end
+     def weak_threshold_and_real_pvalue(pvalue)
+       weak_thresholds(pvalue){|_, thresh, real_pv| return thresh, real_pv }
+     end
+
+     def thresholds(*pvalues)
+       thresholds_by_pvalues(*pvalues).each do |pvalue,(thresholds, counts)|
+         threshold = thresholds.begin + 0.1 * (thresholds.end - thresholds.begin)
+         real_pvalue = counts.end.to_f / vocabulary_volume
+         yield pvalue, threshold, real_pvalue
+       end
+     end
+
+     # "weak" means that threshold has real pvalue not less than given pvalue, while usual threshold not greater
+     def weak_thresholds(*pvalues)
+       thresholds_by_pvalues(*pvalues).each do |pvalue,(thresholds, counts)|
+         threshold = thresholds.begin.to_f
+         real_pvalue = counts.begin.to_f / vocabulary_volume
+         yield pvalue, threshold, real_pvalue
+       end
+     end
+
+
+     def count_distribution_under_pvalue(max_pvalue)
+       cnt_distribution = {}
+       look_for_count = max_pvalue * vocabulary_volume
+       until cnt_distribution.inject(0.0){|sum,(score,count)| sum + count} >= look_for_count
+         begin
+           approximate_threshold = threshold_gauss_estimation(max_pvalue)
+         rescue
+           approximate_threshold = worst_score
+         end
+         cnt_distribution = count_distribution_after_threshold(approximate_threshold)
+         max_pvalue *=2 # if estimation counted too small amount of words - try to lower threshold estimation by doubling pvalue
+       end
+
+       cnt_distribution
+     end
+
+
+     # ret-value: hash {pvalue => [thresholds, counts]}
+     # thresholds = left_threshold .. right_threshold (left_threshold < right_threshold)
+     # counts = left_count .. right_count (left_count > right_count)
+     def thresholds_by_pvalues(*pvalues)
+       sorted_scores = count_distribution_under_pvalue(pvalues.max).sort.reverse
+       scores = sorted_scores.map{|score,count| score}
+       counts = sorted_scores.map{|score,count| count}
+       partial_sums = counts.partial_sums
+
+       results = {}
+
+       pvalue_counts = pvalues.sort.each_with_object({}){|pvalue, hsh| hsh[pvalue] = pvalue * vocabulary_volume }
+       pvalue_counts.map do |pvalue,look_for_count|
+         ind = partial_sums.index{|sum| sum >= look_for_count}
+         minscore, count_at_minscore = scores[ind], partial_sums[ind]
+         maxscore, count_at_maxscore = ind > 0 ? [ scores[ind-1], partial_sums[ind-1] ] : [ best_score + 1.0, 0.0 ]
+         results[pvalue] = [(minscore .. maxscore), (count_at_minscore .. count_at_maxscore)]
+       end
+
+       results
+     end
+
+     def count_distribution_after_threshold(threshold)
+       return @count_distribution.select{|score, count| score >= threshold} if @count_distribution
+       scores = { 0 => 1 }
+       length.times do |column|
+         scores.replace recalc_score_hash(scores, matrix[column], threshold - best_suffix(column + 1))
+         raise 'Hash overflow in PWM::ThresholdByPvalue#count_distribution_after_threshold' if max_hash_size && scores.size > max_hash_size
+       end
+       scores
+     end
+
+     def count_distribution
+       @count_distribution ||= count_distribution_after_threshold(worst_score)
+     end
+
+     def recalc_score_hash(scores, column, least_sufficient)
+       new_scores = Hash.new(0)
+       scores.each do |score, count|
+         4.times do |letter|
+           new_score = score + column[letter]
+           if new_score >= least_sufficient
+             new_scores[new_score] += count * background.counts[letter]
+           end
+         end
+       end
+       new_scores
+     end
+
+     def counts_by_thresholds(*thresholds)
+       scores = count_distribution_after_threshold(thresholds.min)
+       thresholds.inject({}){ |hsh, threshold|
+         hsh[threshold] = scores.inject(0.0){|sum,(score,count)| (score >= threshold) ? sum + count : sum}
+         hsh
+       }
+     end
+
+     def count_by_threshold(threshold)
+       counts_by_thresholds(threshold)[threshold]
+     end
+
+     def pvalue_by_threshold(threshold)
+       count_by_threshold(threshold) / vocabulary_volume
+     end
+   end
+ end
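
The new PWMCounting class makes the threshold machinery self-contained: threshold_gauss_estimation derives a starting cutoff from the score mean and variance (via the inverse error function added in support/inverf.rb), count_distribution_under_pvalue keeps doubling the target p-value (lowering the estimated threshold) until the partial score distribution covers enough words, and thresholds_by_pvalues converts the accumulated counts back into score thresholds. A minimal sketch of that flow, assuming pwm is a Bioinform PWM parsed elsewhere; the p-value is illustrative:

    require 'macroape'

    # `pwm` is assumed to be a Bioinform PWM object obtained elsewhere.
    counting = Macroape::PWMCounting.new(pwm, max_hash_size: 1_000_000)

    # Strong threshold: its real p-value does not exceed the requested one.
    threshold, real_pvalue = counting.threshold_and_real_pvalue(0.0005)

    # Weak threshold: its real p-value is not less than the requested one.
    weak_threshold = counting.weak_threshold(0.0005)

    # The other direction: recognized word count and p-value for a given score cutoff.
    count  = counting.count_by_threshold(threshold)
    pvalue = counting.pvalue_by_threshold(threshold)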