macroape 4.0.2 → 4.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +17 -17
- data/Gemfile +4 -4
- data/LICENSE +22 -22
- data/README.md +70 -70
- data/Rakefile.rb +49 -49
- data/TODO.txt +46 -46
- data/benchmark/benchmark_helper.rb +4 -4
- data/benchmark/similarity_benchmark.rb +52 -52
- data/bin/align_motifs +4 -4
- data/bin/eval_alignment +4 -4
- data/bin/eval_similarity +4 -4
- data/bin/find_pvalue +4 -4
- data/bin/find_threshold +4 -4
- data/bin/preprocess_collection +4 -4
- data/bin/scan_collection +4 -4
- data/lib/macroape.rb +14 -11
- data/lib/macroape/aligned_pair_intersection.rb +61 -62
- data/lib/macroape/cli.rb +191 -188
- data/lib/macroape/cli/align_motifs.rb +120 -100
- data/lib/macroape/cli/eval_alignment.rb +157 -156
- data/lib/macroape/cli/eval_similarity.rb +138 -137
- data/lib/macroape/cli/find_pvalue.rb +93 -87
- data/lib/macroape/cli/find_threshold.rb +103 -96
- data/lib/macroape/cli/preprocess_collection.rb +169 -161
- data/lib/macroape/cli/scan_collection.rb +171 -163
- data/lib/macroape/collection.rb +29 -0
- data/lib/macroape/motif_with_thresholds.rb +18 -0
- data/lib/macroape/pwm_compare.rb +39 -44
- data/lib/macroape/pwm_compare_aligned.rb +139 -130
- data/lib/macroape/{counting.rb → pwm_counting.rb} +175 -121
- data/lib/macroape/support/inverf.rb +13 -0
- data/lib/macroape/support/partial_sums.rb +17 -0
- data/lib/macroape/version.rb +4 -4
- data/macroape.gemspec +19 -19
- data/spec/count_distribution_spec.rb +112 -109
- data/spec/inverf_spec.rb +23 -0
- data/spec/partial_sums_spec.rb +28 -0
- data/spec/spec_helper.rb +11 -11
- data/test/align_motifs_test.rb +42 -43
- data/test/data/AHR_si.pwm +10 -10
- data/test/data/KLF3_f1.pcm +16 -16
- data/test/data/KLF3_f1.pwm +16 -16
- data/test/data/KLF4_f2.pcm +11 -11
- data/test/data/KLF4_f2.pwm +11 -11
- data/test/data/KLF4_f2_scan_results_all.txt +2 -2
- data/test/data/KLF4_f2_scan_results_default_cutoff.txt +1 -1
- data/test/data/KLF4_f2_scan_results_precise_mode.txt +2 -2
- data/test/data/SP1_f1.pcm +12 -12
- data/test/data/SP1_f1.pwm +12 -12
- data/test/data/SP1_f1_revcomp.pcm +12 -12
- data/test/data/SP1_f1_revcomp.pwm +12 -12
- data/test/data/medium_motif.pwm +8 -8
- data/test/data/short_motif.pwm +7 -7
- data/test/data/test_collection.yaml +231 -214
- data/test/data/test_collection/GABPA_f1.pwm +14 -14
- data/test/data/test_collection/KLF4_f2.pwm +10 -10
- data/test/data/test_collection/SP1_f1.pwm +12 -12
- data/test/data/test_collection_pcm/GABPA_f1.pcm +14 -14
- data/test/data/test_collection_pcm/KLF4_f2.pcm +11 -11
- data/test/data/test_collection_pcm/SP1_f1.pcm +12 -12
- data/test/data/test_collection_single_file.txt +38 -38
- data/test/data/test_collection_single_file_pcm.txt +37 -37
- data/test/data/test_collection_weak.yaml +231 -214
- data/test/eval_alignment_test.rb +90 -111
- data/test/eval_similarity_test.rb +105 -123
- data/test/find_pvalue_test.rb +34 -39
- data/test/find_threshold_test.rb +87 -91
- data/test/preprocess_collection_test.rb +56 -65
- data/test/scan_collection_test.rb +42 -48
- data/test/test_helper.rb +159 -160
- metadata +14 -10
- data/test/data/collection_pcm_without_thresholds.yaml +0 -188
- data/test/data/collection_without_thresholds.yaml +0 -188
@@ -1,130 +1,139 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
end
|
52
|
-
|
53
|
-
def
|
54
|
-
length.times.map do |pos|
|
55
|
-
if
|
56
|
-
|
57
|
-
else
|
58
|
-
'.'
|
59
|
-
end
|
60
|
-
end.join
|
61
|
-
end
|
62
|
-
|
63
|
-
def
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
end
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
if
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
1
|
+
require_relative 'aligned_pair_intersection'
|
2
|
+
|
3
|
+
module Macroape
|
4
|
+
# Alignment helpers for PWMCounting.
#
# Each public method asks the wrapped PWM for a transformed copy and wraps the
# result in a new PWMCounting that carries over this object's +background+
# and +max_hash_size+. The receiver itself is not modified by these methods.
class PWMCounting
  # @param n passed through unchanged to +pwm.left_augmented+
  # @return [PWMCounting] counting object around the left-augmented PWM
  def left_augmented(n)
    with_pwm(pwm.left_augmented(n))
  end

  # @param n passed through unchanged to +pwm.right_augmented+
  # @return [PWMCounting] counting object around the right-augmented PWM
  def right_augmented(n)
    with_pwm(pwm.right_augmented(n))
  end

  # @return [PWMCounting] counting object around +pwm.reverse_complemented+
  def reverse_complemented
    with_pwm(pwm.reverse_complemented)
  end

  private

  # Single place where the derived object is built, so all three transforms
  # are guaranteed to propagate exactly the same options.
  def with_pwm(transformed_pwm)
    PWMCounting.new(transformed_pwm, background: background, max_hash_size: max_hash_size)
  end
end
|
15
|
+
|
16
|
+
# A fixed alignment of two motifs (given as PWMCounting objects) at a given
# shift and orientation, padded to a common alignment length.
#
# Relies on +counts_for_two_matrices+, which is not defined in this class
# body (the file requires 'aligned_pair_intersection').
class PWMCompareAligned
  # Gets or sets the limit on the total size of the calculation hash: a
  # safeguard against spending excessive CPU on inappropriate input.
  # It is not read anywhere inside this class body.
  attr_accessor :max_pair_hash_size

  attr_reader :first, :second, :length, :shift, :orientation, :first_length, :second_length

  # first_unaligned and second_unaligned - PWMCounting objects, not PWMs
  #
  # @param shift [Integer] offset of the second motif relative to the first;
  #   when positive the second motif is left-padded by +shift+, otherwise the
  #   first motif is left-padded by +-shift+
  # @param orientation [Symbol] :direct or :revcomp; with :revcomp the second
  #   motif is reverse-complemented before padding
  def initialize(first_unaligned, second_unaligned, shift, orientation)
    @shift, @orientation = shift, orientation

    @first_length, @second_length = first_unaligned.length, second_unaligned.length
    @length = self.class.calculate_alignment_length(@first_length, @second_length, @shift)

    first, second = first_unaligned, second_unaligned
    second = second.reverse_complemented if revcomp?

    if shift > 0
      second = second.left_augmented(shift)
    else
      first = first.left_augmented(-shift)
    end

    # Pad both motifs on the right up to the common alignment length.
    @first = first.right_augmented(@length - first.length)
    @second = second.right_augmented(@length - second.length)
  end

  def direct?
    orientation == :direct
  end

  def revcomp?
    orientation == :revcomp
  end

  # Number of alignment positions covered by both motifs.
  def overlap
    length.times.count { |pos| first_overlaps?(pos) && second_overlaps?(pos) }
  end

  # One character per alignment position: '>' where the first motif lies,
  # '.' elsewhere.
  def first_pwm_alignment
    length.times.map { |pos| first_overlaps?(pos) ? '>' : '.' }.join
  end

  # One character per alignment position: '>' (direct) or '<' (any other
  # orientation) where the second motif lies, '.' elsewhere.
  def second_pwm_alignment
    marker = direct? ? '>' : '<'
    length.times.map { |pos| second_overlaps?(pos) ? marker : '.' }.join
  end

  # Summary hash with keys :shift, :orientation, :text (the two alignment
  # strings joined by a newline), :overlap and :alignment_length.
  def alignment_infos
    {shift: shift,
     orientation: orientation,
     text: "#{first_pwm_alignment}\n#{second_pwm_alignment}",
     overlap: overlap,
     alignment_length: length}
  end

  # whether first matrix overlap specified position of alignment
  def first_overlaps?(pos)
    return false unless pos >= 0 && pos < length
    # The first motif starts at 0 for a positive shift, at -shift otherwise.
    start = shift > 0 ? 0 : -shift
    pos >= start && pos < start + first_length
  end

  # whether second matrix overlap specified position of alignment
  def second_overlaps?(pos)
    return false unless pos >= 0 && pos < length
    # The second motif starts at shift for a positive shift, at 0 otherwise.
    start = shift > 0 ? shift : 0
    pos >= start && pos < start + second_length
  end

  # Jaccard-style similarity of the two aligned motifs at the given thresholds.
  #
  # Returns a hash with :similarity, :tanimoto (1.0 - similarity),
  # :recognized_by_both, :recognized_by_first, :recognized_by_second,
  # :real_pvalue_first and :real_pvalue_second.
  # When either motif recognizes nothing, :similarity and :tanimoto are -1
  # and :recognized_by_both is 0; the real P-value keys are still present so
  # the result has the same shape on every path.
  def jaccard(first_threshold, second_threshold)
    f = first.count_by_threshold(first_threshold)
    s = second.count_by_threshold(second_threshold)
    real_pvalues = { real_pvalue_first: f / first.vocabulary_volume,
                     real_pvalue_second: s / second.vocabulary_volume }

    if f == 0 || s == 0
      return { similarity: -1, tanimoto: -1, recognized_by_both: 0,
               recognized_by_first: f,
               recognized_by_second: s }.merge(real_pvalues)
    end

    # counts_for_two_matrices yields a pair of counts; their geometric mean
    # is used as the size of the intersection.
    pair_counts = counts_for_two_matrices(first_threshold, second_threshold)
    intersect = Math.sqrt(pair_counts[0] * pair_counts[1])
    union = f + s - intersect
    similarity = intersect.to_f / union
    { similarity: similarity, tanimoto: 1.0 - similarity, recognized_by_both: intersect,
      recognized_by_first: f, recognized_by_second: s }.merge(real_pvalues)
  end

  # jaccard at the thresholds each motif reports for +pvalue+ via #threshold.
  def jaccard_by_pvalue(pvalue)
    jaccard(first.threshold(pvalue), second.threshold(pvalue))
  end

  # jaccard at the thresholds each motif reports for +pvalue+ via #weak_threshold.
  def jaccard_by_weak_pvalue(pvalue)
    jaccard(first.weak_threshold(pvalue), second.weak_threshold(pvalue))
  end

  # Length of the alignment of motifs of the given lengths at +shift+.
  def self.calculate_alignment_length(first_len, second_len, shift)
    if shift > 0
      [first_len, second_len + shift].max
    else
      [first_len - shift, second_len].max
    end
  end
end
|
139
|
+
end
|
@@ -1,121 +1,175 @@
|
|
1
|
-
require 'bioinform'
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
end
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
end
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
def
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
1
|
+
require 'bioinform'
|
2
|
+
require_relative 'support/inverf'
|
3
|
+
require_relative 'support/partial_sums'
|
4
|
+
|
5
|
+
module Macroape
|
6
|
+
# Counts, for a PWM and a background model, how many words score at or above
# a threshold, and converts between thresholds and P-values.
#
# The wrapped +pwm+ must respond to +matrix+, +length+ and +each_position+;
# +background+ must respond to +volume+, +counts+, +mean+ and +mean_square+.
# Array#partial_sums and Math.inverf are expected to come from the
# 'support/partial_sums' and 'support/inverf' files required by this file.
class PWMCounting
  attr_accessor :pwm, :max_hash_size, :background

  # @param pwm the position weight matrix object to analyse
  # @param background background model (defaults to Bioinform::Background::Wordwise)
  # @param max_hash_size [Integer, nil] when set, score hashes larger than
  #   this make #count_distribution_after_threshold raise
  def initialize(pwm, background: Bioinform::Background::Wordwise, max_hash_size: nil)
    @pwm = pwm
    @background = background
    @max_hash_size = max_hash_size
  end

  def matrix
    pwm.matrix
  end

  # background.volume raised to the power of the motif length.
  def vocabulary_volume
    background.volume ** length
  end

  def length
    pwm.length
  end

  def best_score
    best_suffix(0)
  end

  def worst_score
    worst_suffix(0)
  end

  # best score of suffix s[i..l]
  def best_suffix(i)
    matrix[i...length].map(&:max).inject(0.0, &:+)
  end

  # worst score of suffix s[i..l]
  def worst_suffix(i)
    matrix[i...length].map(&:min).inject(0.0, &:+)
  end

  # Sum over positions of background.mean(position).
  def score_mean
    pwm.each_position.inject(0.0) { |mean, position| mean + background.mean(position) }
  end

  # Sum over positions of background.mean_square(position) - background.mean(position)**2.
  def score_variance
    pwm.each_position.inject(0.0) do |variance, position|
      variance + background.mean_square(position) - background.mean(position)**2
    end
  end

  # Threshold estimate: score_mean plus inverf(1 - 2 * pvalue) * sqrt(2)
  # standard deviations of the score.
  def threshold_gauss_estimation(pvalue)
    sigma = Math.sqrt(score_variance)
    n_ = Math.inverf(1 - 2 * pvalue) * Math.sqrt(2)
    score_mean + n_ * sigma
  end

  def threshold(pvalue)
    thresholds(pvalue) { |_, thresh, _| return thresh }
  end

  def threshold_and_real_pvalue(pvalue)
    thresholds(pvalue) { |_, thresh, real_pv| return thresh, real_pv }
  end

  def weak_threshold(pvalue)
    weak_thresholds(pvalue) { |_, thresh, _| return thresh }
  end

  def weak_threshold_and_real_pvalue(pvalue)
    weak_thresholds(pvalue) { |_, thresh, real_pv| return thresh, real_pv }
  end

  # Yields (pvalue, threshold, real_pvalue) for each requested P-value.
  # The threshold is placed 10% of the way from the lower to the upper bound
  # of the threshold range; real_pvalue is the count at the upper bound
  # divided by vocabulary_volume. Returns an Enumerator without a block.
  def thresholds(*pvalues)
    return enum_for(:thresholds, *pvalues) unless block_given?

    thresholds_by_pvalues(*pvalues).each do |pvalue, (thresholds, counts)|
      threshold = thresholds.begin + 0.1 * (thresholds.end - thresholds.begin)
      real_pvalue = counts.end.to_f / vocabulary_volume
      yield pvalue, threshold, real_pvalue
    end
  end

  # "weak" means that threshold has real pvalue not less than given pvalue, while usual threshold not greater
  # Returns an Enumerator without a block.
  def weak_thresholds(*pvalues)
    return enum_for(:weak_thresholds, *pvalues) unless block_given?

    thresholds_by_pvalues(*pvalues).each do |pvalue, (thresholds, counts)|
      threshold = thresholds.begin.to_f
      real_pvalue = counts.begin.to_f / vocabulary_volume
      yield pvalue, threshold, real_pvalue
    end
  end

  # Score distribution ({score => count}) reaching far enough down to hold
  # at least max_pvalue * vocabulary_volume words, when that is attainable.
  def count_distribution_under_pvalue(max_pvalue)
    cnt_distribution = {}
    look_for_count = max_pvalue * vocabulary_volume
    until cnt_distribution.inject(0.0) { |sum, (_score, count)| sum + count } >= look_for_count
      begin
        approximate_threshold = threshold_gauss_estimation(max_pvalue)
      rescue
        # estimation failed - fall back to the weakest possible threshold
        approximate_threshold = worst_score
      end
      cnt_distribution = count_distribution_after_threshold(approximate_threshold)
      # Once the threshold no longer exceeds the worst score (or is not
      # comparable at all), lowering it further cannot add words: stop
      # instead of doubling the pvalue forever.
      break unless approximate_threshold > worst_score
      max_pvalue *= 2 # if estimation counted too small amount of words - try to lower threshold estimation by doubling pvalue
    end

    cnt_distribution
  end

  # ret-value: hash {pvalue => [thresholds, counts]}
  # thresholds = left_threshold .. right_threshold (left_threshold < right_threshold)
  # counts = left_count .. right_count (left_count > right_count)
  #
  # Raises ArgumentError when the distribution never accumulates enough
  # words for a requested pvalue.
  def thresholds_by_pvalues(*pvalues)
    sorted_scores = count_distribution_under_pvalue(pvalues.max).sort.reverse
    scores = sorted_scores.map { |score, _count| score }
    counts = sorted_scores.map { |_score, count| count }
    partial_sums = counts.partial_sums

    pvalues.sort.each_with_object({}) do |pvalue, results|
      look_for_count = pvalue * vocabulary_volume
      ind = partial_sums.index { |sum| sum >= look_for_count }
      if ind.nil?
        raise ArgumentError, "P-value #{pvalue} is unreachable: score distribution holds fewer than #{look_for_count} words"
      end

      minscore, count_at_minscore = scores[ind], partial_sums[ind]
      # Above the very best score no word is counted, so use an artificial
      # upper bound of best_score + 1.0 with a zero count.
      maxscore, count_at_maxscore = ind > 0 ? [scores[ind - 1], partial_sums[ind - 1]] : [best_score + 1.0, 0.0]
      results[pvalue] = [(minscore..maxscore), (count_at_minscore..count_at_maxscore)]
    end
  end

  # {score => count} for scores not less than +threshold+.
  # Uses the memoized full distribution when it has already been computed;
  # otherwise builds the distribution column by column, dropping partial
  # scores that cannot reach the threshold even with the best suffix.
  def count_distribution_after_threshold(threshold)
    return @count_distribution.select { |score, _count| score >= threshold } if @count_distribution

    scores = { 0 => 1 }
    length.times do |column|
      scores.replace recalc_score_hash(scores, matrix[column], threshold - best_suffix(column + 1))
      raise 'Hash overflow in PWM::ThresholdByPvalue#count_distribution_after_threshold' if max_hash_size && scores.size > max_hash_size
    end
    scores
  end

  # Full score distribution, memoized.
  def count_distribution
    @count_distribution ||= count_distribution_after_threshold(worst_score)
  end

  # Extends every partial score by each of the 4 entries of +column+,
  # weighting counts by background.counts, and keeps only the sums that are
  # not less than +least_sufficient+.
  def recalc_score_hash(scores, column, least_sufficient)
    new_scores = Hash.new(0)
    scores.each do |score, count|
      4.times do |letter|
        new_score = score + column[letter]
        next unless new_score >= least_sufficient
        new_scores[new_score] += count * background.counts[letter]
      end
    end
    new_scores
  end

  # {threshold => number of words scoring not less than threshold}
  def counts_by_thresholds(*thresholds)
    scores = count_distribution_after_threshold(thresholds.min)
    thresholds.each_with_object({}) do |threshold, hsh|
      hsh[threshold] = scores.inject(0.0) { |sum, (score, count)| score >= threshold ? sum + count : sum }
    end
  end

  def count_by_threshold(threshold)
    counts_by_thresholds(threshold)[threshold]
  end

  def pvalue_by_threshold(threshold)
    count_by_threshold(threshold) / vocabulary_volume
  end
end
|
175
|
+
end
|