macroape 4.0.2 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +17 -17
- data/Gemfile +4 -4
- data/LICENSE +22 -22
- data/README.md +70 -70
- data/Rakefile.rb +49 -49
- data/TODO.txt +46 -46
- data/benchmark/benchmark_helper.rb +4 -4
- data/benchmark/similarity_benchmark.rb +52 -52
- data/bin/align_motifs +4 -4
- data/bin/eval_alignment +4 -4
- data/bin/eval_similarity +4 -4
- data/bin/find_pvalue +4 -4
- data/bin/find_threshold +4 -4
- data/bin/preprocess_collection +4 -4
- data/bin/scan_collection +4 -4
- data/lib/macroape.rb +14 -11
- data/lib/macroape/aligned_pair_intersection.rb +61 -62
- data/lib/macroape/cli.rb +191 -188
- data/lib/macroape/cli/align_motifs.rb +120 -100
- data/lib/macroape/cli/eval_alignment.rb +157 -156
- data/lib/macroape/cli/eval_similarity.rb +138 -137
- data/lib/macroape/cli/find_pvalue.rb +93 -87
- data/lib/macroape/cli/find_threshold.rb +103 -96
- data/lib/macroape/cli/preprocess_collection.rb +169 -161
- data/lib/macroape/cli/scan_collection.rb +171 -163
- data/lib/macroape/collection.rb +29 -0
- data/lib/macroape/motif_with_thresholds.rb +18 -0
- data/lib/macroape/pwm_compare.rb +39 -44
- data/lib/macroape/pwm_compare_aligned.rb +139 -130
- data/lib/macroape/{counting.rb → pwm_counting.rb} +175 -121
- data/lib/macroape/support/inverf.rb +13 -0
- data/lib/macroape/support/partial_sums.rb +17 -0
- data/lib/macroape/version.rb +4 -4
- data/macroape.gemspec +19 -19
- data/spec/count_distribution_spec.rb +112 -109
- data/spec/inverf_spec.rb +23 -0
- data/spec/partial_sums_spec.rb +28 -0
- data/spec/spec_helper.rb +11 -11
- data/test/align_motifs_test.rb +42 -43
- data/test/data/AHR_si.pwm +10 -10
- data/test/data/KLF3_f1.pcm +16 -16
- data/test/data/KLF3_f1.pwm +16 -16
- data/test/data/KLF4_f2.pcm +11 -11
- data/test/data/KLF4_f2.pwm +11 -11
- data/test/data/KLF4_f2_scan_results_all.txt +2 -2
- data/test/data/KLF4_f2_scan_results_default_cutoff.txt +1 -1
- data/test/data/KLF4_f2_scan_results_precise_mode.txt +2 -2
- data/test/data/SP1_f1.pcm +12 -12
- data/test/data/SP1_f1.pwm +12 -12
- data/test/data/SP1_f1_revcomp.pcm +12 -12
- data/test/data/SP1_f1_revcomp.pwm +12 -12
- data/test/data/medium_motif.pwm +8 -8
- data/test/data/short_motif.pwm +7 -7
- data/test/data/test_collection.yaml +231 -214
- data/test/data/test_collection/GABPA_f1.pwm +14 -14
- data/test/data/test_collection/KLF4_f2.pwm +10 -10
- data/test/data/test_collection/SP1_f1.pwm +12 -12
- data/test/data/test_collection_pcm/GABPA_f1.pcm +14 -14
- data/test/data/test_collection_pcm/KLF4_f2.pcm +11 -11
- data/test/data/test_collection_pcm/SP1_f1.pcm +12 -12
- data/test/data/test_collection_single_file.txt +38 -38
- data/test/data/test_collection_single_file_pcm.txt +37 -37
- data/test/data/test_collection_weak.yaml +231 -214
- data/test/eval_alignment_test.rb +90 -111
- data/test/eval_similarity_test.rb +105 -123
- data/test/find_pvalue_test.rb +34 -39
- data/test/find_threshold_test.rb +87 -91
- data/test/preprocess_collection_test.rb +56 -65
- data/test/scan_collection_test.rb +42 -48
- data/test/test_helper.rb +159 -160
- metadata +14 -10
- data/test/data/collection_pcm_without_thresholds.yaml +0 -188
- data/test/data/collection_without_thresholds.yaml +0 -188
@@ -1,130 +1,139 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
end
|
52
|
-
|
53
|
-
def
|
54
|
-
length.times.map do |pos|
|
55
|
-
if
|
56
|
-
|
57
|
-
else
|
58
|
-
'.'
|
59
|
-
end
|
60
|
-
end.join
|
61
|
-
end
|
62
|
-
|
63
|
-
def
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
end
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
if
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
1
|
+
require_relative 'aligned_pair_intersection'
|
2
|
+
|
3
|
+
module Macroape
|
4
|
+
class PWMCounting
  # Returns a new PWMCounting wrapping `pwm.left_augmented(n)`; background and
  # max_hash_size are carried over from the receiver.
  def left_augmented(n)
    derived_counting(pwm.left_augmented(n))
  end

  # Returns a new PWMCounting wrapping `pwm.right_augmented(n)`; background and
  # max_hash_size are carried over from the receiver.
  def right_augmented(n)
    derived_counting(pwm.right_augmented(n))
  end

  # Returns a new PWMCounting wrapping `pwm.reverse_complemented`; background
  # and max_hash_size are carried over from the receiver.
  def reverse_complemented
    derived_counting(pwm.reverse_complemented)
  end

  private

  # Builds a PWMCounting around an already transformed matrix, reusing the
  # receiver's counting settings.
  def derived_counting(transformed_pwm)
    PWMCounting.new(transformed_pwm, background: background, max_hash_size: max_hash_size)
  end
end
|
15
|
+
|
16
|
+
# A pair of motifs placed on a common alignment grid with a given shift and
# orientation of the second motif.
class PWMCompareAligned
  # Limit on the total size of the calculation hash. It protects against
  # excessive CPU use on unsuitable input. It is not read inside this class
  # body (presumably consumed by the pair-intersection code - confirm there).
  attr_accessor :max_pair_hash_size

  attr_reader :first, :second, :length, :shift, :orientation, :first_length, :second_length

  # first_unaligned and second_unaligned are PWMCounting objects, not PWMs.
  # shift is the signed offset between the motifs; orientation is compared
  # against :direct and :revcomp.
  def initialize(first_unaligned, second_unaligned, shift, orientation)
    @shift = shift
    @orientation = orientation

    @first_length = first_unaligned.length
    @second_length = second_unaligned.length
    @length = self.class.calculate_alignment_length(@first_length, @second_length, @shift)

    padded_first = first_unaligned
    padded_second = revcomp? ? second_unaligned.reverse_complemented : second_unaligned

    # A positive shift pads the second motif on the left; otherwise the first
    # motif is padded (by zero columns when the shift is zero).
    if shift > 0
      padded_second = padded_second.left_augmented(shift)
    else
      padded_first = padded_first.left_augmented(-shift)
    end

    # Pad both on the right up to the common alignment length.
    @first = padded_first.right_augmented(@length - padded_first.length)
    @second = padded_second.right_augmented(@length - padded_second.length)
  end

  def direct?
    orientation == :direct
  end

  def revcomp?
    orientation == :revcomp
  end

  # Number of alignment positions covered by both motifs.
  def overlap
    (0...length).count { |pos| first_overlaps?(pos) && second_overlaps?(pos) }
  end

  # One character per alignment position: '>' where the first motif lies,
  # '.' elsewhere.
  def first_pwm_alignment
    Array.new(length) { |pos| first_overlaps?(pos) ? '>' : '.' }.join
  end

  # One character per alignment position: '>' (direct) or '<' (otherwise)
  # where the second motif lies, '.' elsewhere.
  def second_pwm_alignment
    marker = direct? ? '>' : '<'
    Array.new(length) { |pos| second_overlaps?(pos) ? marker : '.' }.join
  end

  # Summary of the alignment geometry as a hash.
  def alignment_infos
    picture = "#{first_pwm_alignment}\n#{second_pwm_alignment}"
    {
      shift: shift,
      orientation: orientation,
      text: picture,
      overlap: overlap,
      alignment_length: length
    }
  end

  # Whether the first matrix covers the given position of the alignment.
  def first_overlaps?(pos)
    return false unless (0...length).cover?(pos)

    start = shift > 0 ? 0 : -shift
    (start...(start + first_length)).cover?(pos)
  end

  # Whether the second matrix covers the given position of the alignment.
  def second_overlaps?(pos)
    return false unless (0...length).cover?(pos)

    start = shift > 0 ? shift : 0
    (start...(start + second_length)).cover?(pos)
  end

  # Similarity of the two aligned motifs at the given score thresholds.
  # When either motif recognizes no words, similarity and tanimoto are
  # reported as -1 and the real p-values are omitted.
  def jaccard(first_threshold, second_threshold)
    first_count = first.count_by_threshold(first_threshold)
    second_count = second.count_by_threshold(second_threshold)

    if first_count == 0 || second_count == 0
      return {
        similarity: -1,
        tanimoto: -1,
        recognized_by_both: 0,
        recognized_by_first: first_count,
        recognized_by_second: second_count
      }
    end

    pair_counts = counts_for_two_matrices(first_threshold, second_threshold)
    # The two directional counts are combined by their geometric mean.
    common = Math.sqrt(pair_counts[0] * pair_counts[1])
    total = first_count + second_count - common
    similarity = common.to_f / total

    {
      similarity: similarity,
      tanimoto: 1.0 - similarity,
      recognized_by_both: common,
      recognized_by_first: first_count,
      recognized_by_second: second_count,
      real_pvalue_first: first_count / first.vocabulary_volume,
      real_pvalue_second: second_count / second.vocabulary_volume
    }
  end

  # jaccard with each motif's threshold taken from `threshold(pvalue)`.
  def jaccard_by_pvalue(pvalue)
    jaccard(first.threshold(pvalue), second.threshold(pvalue))
  end

  # jaccard with each motif's threshold taken from `weak_threshold(pvalue)`.
  def jaccard_by_weak_pvalue(pvalue)
    jaccard(first.weak_threshold(pvalue), second.weak_threshold(pvalue))
  end

  # Length of the grid that holds both motifs for the given shift.
  def self.calculate_alignment_length(first_len, second_len, shift)
    candidates = shift > 0 ? [first_len, second_len + shift] : [first_len - shift, second_len]
    candidates.max
  end
end
|
139
|
+
end
|
@@ -1,121 +1,175 @@
|
|
1
|
-
require 'bioinform'
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
end
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
end
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
def
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
1
|
+
require 'bioinform'
|
2
|
+
require_relative 'support/inverf'
|
3
|
+
require_relative 'support/partial_sums'
|
4
|
+
|
5
|
+
module Macroape
|
6
|
+
# Counts words recognized by a PWM at given score thresholds and converts
# between thresholds and p-values.
#
# The wrapped +pwm+ must respond to +matrix+, +length+ and +each_position+;
# +background+ must respond to +volume+, +counts+, +mean(position)+ and
# +mean_square(position)+. This class also relies on Math.inverf and
# Array#partial_sums, which are loaded from the support files.
class PWMCounting
  attr_reader :pwm, :background
  attr_accessor :max_hash_size

  def initialize(pwm, background: Bioinform::Background::Wordwise, max_hash_size: nil)
    @pwm = pwm
    @background = background
    @max_hash_size = max_hash_size
  end

  # Replaces the matrix and drops the memoized score distribution, which was
  # computed for the previous matrix.
  def pwm=(new_pwm)
    @count_distribution = nil
    @pwm = new_pwm
  end

  # Replaces the background and drops the memoized score distribution, which
  # was weighted by the previous background.
  def background=(new_background)
    @count_distribution = nil
    @background = new_background
  end

  def matrix
    pwm.matrix
  end

  # Total (background-weighted) number of words of the motif's length.
  def vocabulary_volume
    background.volume ** length
  end

  def length
    pwm.length
  end

  def best_score
    best_suffix(0)
  end

  def worst_score
    worst_suffix(0)
  end

  # Best score of the suffix made of columns i...length (0.0 when empty).
  # Plain inject is kept on purpose: Array#sum uses compensated summation for
  # floats and could change the result in the last bits.
  def best_suffix(i)
    matrix[i...length].map(&:max).inject(0.0, &:+)
  end

  # Worst score of the suffix made of columns i...length (0.0 when empty).
  def worst_suffix(i)
    matrix[i...length].map(&:min).inject(0.0, &:+)
  end

  # Sum of the per-position score means reported by the background.
  def score_mean
    pwm.each_position.inject(0.0) { |mean, position| mean + background.mean(position) }
  end

  # Sum of the per-position score variances (mean square minus squared mean).
  def score_variance
    pwm.each_position.inject(0.0) do |variance, position|
      variance + background.mean_square(position) - background.mean(position) ** 2
    end
  end

  # Rough threshold for the given p-value, treating the score as normally
  # distributed with score_mean and score_variance.
  # This is the only definition of the method: an earlier duplicate that
  # delegated to pwm was always overridden by this one and has been removed.
  def threshold_gauss_estimation(pvalue)
    sigma = Math.sqrt(score_variance)
    n_ = Math.inverf(1 - 2 * pvalue) * Math.sqrt(2)
    score_mean + n_ * sigma
  end

  def threshold(pvalue)
    thresholds(pvalue) { |_, thresh, _| return thresh }
  end

  def threshold_and_real_pvalue(pvalue)
    thresholds(pvalue) { |_, thresh, real_pv| return thresh, real_pv }
  end

  def weak_threshold(pvalue)
    weak_thresholds(pvalue) { |_, thresh, _| return thresh }
  end

  def weak_threshold_and_real_pvalue(pvalue)
    weak_thresholds(pvalue) { |_, thresh, real_pv| return thresh, real_pv }
  end

  # Yields [pvalue, threshold, real_pvalue] for each requested p-value.
  # The threshold is placed 10% of the way up the score interval found by
  # thresholds_by_pvalues; the real p-value comes from the count at the upper
  # end of that interval. Returns an Enumerator when no block is given.
  def thresholds(*pvalues)
    return enum_for(:thresholds, *pvalues) unless block_given?

    thresholds_by_pvalues(*pvalues).each do |pvalue, (score_range, count_range)|
      threshold = score_range.begin + 0.1 * (score_range.end - score_range.begin)
      real_pvalue = count_range.end.to_f / vocabulary_volume
      yield pvalue, threshold, real_pvalue
    end
  end

  # "weak" means that threshold has real pvalue not less than given pvalue,
  # while usual threshold not greater.
  # Returns an Enumerator when no block is given.
  def weak_thresholds(*pvalues)
    return enum_for(:weak_thresholds, *pvalues) unless block_given?

    thresholds_by_pvalues(*pvalues).each do |pvalue, (score_range, count_range)|
      threshold = score_range.begin.to_f
      real_pvalue = count_range.begin.to_f / vocabulary_volume
      yield pvalue, threshold, real_pvalue
    end
  end

  # Score distribution ({score => count}) of the top-scoring words, extended
  # until it holds at least max_pvalue * vocabulary_volume words or until the
  # whole distribution has been computed.
  def count_distribution_under_pvalue(max_pvalue)
    look_for_count = max_pvalue * vocabulary_volume
    lowest_score = worst_score
    cnt_distribution = {}

    until cnt_distribution.each_value.inject(0.0) { |sum, count| sum + count } >= look_for_count
      approximate_threshold =
        begin
          threshold_gauss_estimation(max_pvalue)
        rescue StandardError
          # The estimation fails for out-of-range p-values; fall back to the
          # complete distribution.
          lowest_score
        end
      # A NaN threshold would filter out every word; treat it as "no estimate".
      if approximate_threshold.respond_to?(:nan?) && approximate_threshold.nan?
        approximate_threshold = lowest_score
      end

      cnt_distribution = count_distribution_after_threshold(approximate_threshold)

      # Nothing more can be gained below the worst score: stop instead of
      # doubling forever when the requested count is unreachable.
      break if approximate_threshold <= lowest_score

      # If estimation counted too small amount of words - try to lower
      # threshold estimation by doubling pvalue.
      max_pvalue *= 2
    end

    cnt_distribution
  end

  # ret-value: hash {pvalue => [thresholds, counts]}
  # thresholds = left_threshold .. right_threshold (left_threshold < right_threshold)
  # counts = left_count .. right_count (left_count > right_count)
  #
  # Raises ArgumentError when the cumulative word count never reaches
  # pvalue * vocabulary_volume (for example a p-value above 1).
  def thresholds_by_pvalues(*pvalues)
    sorted_scores = count_distribution_under_pvalue(pvalues.max).sort.reverse
    scores = sorted_scores.map { |score, _count| score }
    counts = sorted_scores.map { |_score, count| count }
    partial_sums = counts.partial_sums

    pvalues.sort.each_with_object({}) do |pvalue, results|
      look_for_count = pvalue * vocabulary_volume
      ind = partial_sums.index { |sum| sum >= look_for_count }
      unless ind
        raise ArgumentError, "P-value #{pvalue} is unreachable: fewer than #{look_for_count} words are available"
      end

      minscore = scores[ind]
      count_at_minscore = partial_sums[ind]
      # Above the best score there are no words at all; use a sentinel score.
      maxscore, count_at_maxscore =
        ind > 0 ? [scores[ind - 1], partial_sums[ind - 1]] : [best_score + 1.0, 0.0]

      results[pvalue] = [(minscore..maxscore), (count_at_minscore..count_at_maxscore)]
    end
  end

  # Hash {score => count} of words scoring at least threshold. Counts are
  # weighted by background.counts. Served from the memoized full distribution
  # when one exists.
  # Raises RuntimeError when max_hash_size is set and exceeded.
  def count_distribution_after_threshold(threshold)
    return @count_distribution.select { |score, _count| score >= threshold } if @count_distribution

    scores = { 0 => 1 }
    length.times do |column|
      # A prefix can still reach the threshold only if the best possible
      # suffix makes up the difference.
      scores = recalc_score_hash(scores, matrix[column], threshold - best_suffix(column + 1))
      if max_hash_size && scores.size > max_hash_size
        raise 'Hash overflow in PWM::ThresholdByPvalue#count_distribution_after_threshold'
      end
    end
    scores
  end

  # Full score distribution, computed once and memoized.
  def count_distribution
    @count_distribution ||= count_distribution_after_threshold(worst_score)
  end

  # Extends every prefix score by each of the four letters of column, keeping
  # only scores that are at least least_sufficient.
  def recalc_score_hash(scores, column, least_sufficient)
    letter_counts = background.counts
    new_scores = Hash.new(0)
    scores.each do |score, count|
      4.times do |letter|
        new_score = score + column[letter]
        new_scores[new_score] += count * letter_counts[letter] if new_score >= least_sufficient
      end
    end
    new_scores
  end

  # Hash {threshold => number of words scoring at least that threshold}.
  def counts_by_thresholds(*thresholds)
    scores = count_distribution_after_threshold(thresholds.min)
    thresholds.each_with_object({}) do |threshold, hsh|
      hsh[threshold] = scores.inject(0.0) { |sum, (score, count)| score >= threshold ? sum + count : sum }
    end
  end

  def count_by_threshold(threshold)
    counts_by_thresholds(threshold)[threshold]
  end

  # Fraction of the vocabulary scoring at least threshold.
  def pvalue_by_threshold(threshold)
    count_by_threshold(threshold) / vocabulary_volume
  end
end
|
175
|
+
end
|