macroape 3.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +18 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +61 -0
- data/Rakefile +7 -0
- data/bin/eval_alignment +3 -0
- data/bin/eval_similarity +3 -0
- data/bin/find_pvalue +3 -0
- data/bin/find_threshold +3 -0
- data/bin/preprocess_collection +3 -0
- data/bin/scan_collection +3 -0
- data/lib/macroape/aligned_pair_intersection.rb +136 -0
- data/lib/macroape/aligned_pair_metrics.rb +24 -0
- data/lib/macroape/aligned_pair_transformations.rb +23 -0
- data/lib/macroape/collection.rb +15 -0
- data/lib/macroape/count_by_threshold.rb +34 -0
- data/lib/macroape/exec/eval_alignment.rb +141 -0
- data/lib/macroape/exec/eval_similarity.rb +107 -0
- data/lib/macroape/exec/find_pvalue.rb +80 -0
- data/lib/macroape/exec/find_threshold.rb +76 -0
- data/lib/macroape/exec/preprocess_collection.rb +94 -0
- data/lib/macroape/exec/scan_collection.rb +124 -0
- data/lib/macroape/extract_pwm.rb +32 -0
- data/lib/macroape/gauss_estimation.rb +30 -0
- data/lib/macroape/matrix_information.rb +29 -0
- data/lib/macroape/matrix_on_background.rb +16 -0
- data/lib/macroape/matrix_transformations.rb +29 -0
- data/lib/macroape/pair_metrics.rb +9 -0
- data/lib/macroape/pair_transformations.rb +28 -0
- data/lib/macroape/pwm_compare.rb +10 -0
- data/lib/macroape/pwm_compare_aligned.rb +13 -0
- data/lib/macroape/single_matrix.rb +45 -0
- data/lib/macroape/support.rb +34 -0
- data/lib/macroape/threshold_by_pvalue.rb +68 -0
- data/lib/macroape/version.rb +3 -0
- data/lib/macroape.rb +26 -0
- data/macroape.gemspec +17 -0
- data/test/data/AHR_si.pat +10 -0
- data/test/data/KLF4_f2.pat +11 -0
- data/test/data/KLF4_f2_scan_results_all.txt +4 -0
- data/test/data/KLF4_f2_scan_results_default_cutoff.txt +3 -0
- data/test/data/KLF4_f2_scan_results_precise_mode.txt +4 -0
- data/test/data/SP1_f1.pat +12 -0
- data/test/data/SP1_f1_revcomp.pat +12 -0
- data/test/data/test_collection/GABPA_f1.pat +14 -0
- data/test/data/test_collection/KLF4_f2.pat +11 -0
- data/test/data/test_collection/SP1_f1.pat +12 -0
- data/test/data/test_collection.yaml +186 -0
- data/test/macroape_test.rb +125 -0
- metadata +116 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2011-2012 Ilya Vorontsov, Ivan Kulakovskiy, Vsevolod Makeev
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
# Macroape
|
2
|
+
|
3
|
+
Macroape is an abbreviation for MAtrix CompaRisOn by Approximate P-value Estimation. It's a bioinformatics tool for evaluating a similarity measure between a pair of Position Weight Matrices. The approach and its application are described in the manual at https://docs.google.com/document/pub?id=1_jsxhMNzMzy4d2d_byAd3n6Szg5gEcqG_Sf7w9tEqWw
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'macroape'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install macroape
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
For more information, read the manual at https://docs.google.com/document/pub?id=1_jsxhMNzMzy4d2d_byAd3n6Szg5gEcqG_Sf7w9tEqWw (not the latest version, but a comprehensive description of the approach)
|
21
|
+
|
22
|
+
## Basic usage as a command-line tool
|
23
|
+
MacroAPE has 6 command-line tools:
|
24
|
+
|
25
|
+
### Tools for calculating thresholds and pvalues:
|
26
|
+
* find_threshold \<PWM file\> [-p \<pvalue\> (by default: 0.0005)]
|
27
|
+
* find_pvalue \<PWM file\> \<threshold\>
|
28
|
+
|
29
|
+
### Tools for evaluating Jaccard similarity measure in the best alignment and in certain alignment:
|
30
|
+
* eval_similarity \<first PWM file\> \<second PWM file\>
|
31
|
+
* eval_alignment \<first PWM file\> \<second PWM file\> \<shift of second matrix\> \<orientation of second matrix(direct|revcomp)\>
|
32
|
+
|
33
|
+
### Tools for looking through collection for the motifs most similar to a query motif
|
34
|
+
* preprocess_collection \<folder with motif files\> [-o \<collection output file\>]
|
35
|
+
* scan_collection \<query PWM file\> \<collection file\>
|
36
|
+
|
37
|
+
You can also use the -h option to print help for a tool in the console.
|
38
|
+
There are lots of different command-line options. The most useful option is -d <discretization=1|10|100|1000>. You can trade precision for speed by specifying a discretization. For more information, look through the manual.
|
39
|
+
|
40
|
+
## Basic usage in your code
|
41
|
+
require 'macroape'
|
42
|
+
background = [1,1,1,1]
|
43
|
+
discretization = 10
|
44
|
+
first_pwm_matrix = [[1,2,3,4], [1,2,3,4], [4,1,2,3,], [5,3,2,4], [4,1,2,3], [7,8,9,11]]
|
45
|
+
pwm_first = PWM::SingleMatrix.new(first_pwm_matrix).with_background(background).discrete(discretization)
|
46
|
+
pwm_second = PWM::SingleMatrix.load_pat('another_pwm.pat').with_background(background).discrete(discretization)
|
47
|
+
cmp = PWMCompare::PWMCompare.new(pwm_first, pwm_second)
|
48
|
+
first_threshold = pwm_first.threshold(pvalue)
|
49
|
+
second_threshold = pwm_second.threshold(pvalue)
|
50
|
+
similarity_info = cmp.jaccard(first_threshold, second_threshold)
|
51
|
+
puts "Jaccard similarity: #{similarity_info[:similarity]}"
|
52
|
+
|
53
|
+
## Contributing
|
54
|
+
|
55
|
+
1. Fork it
|
56
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
57
|
+
3. Commit your changes (`git commit -am 'Added some feature'`)
|
58
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
59
|
+
5. Create new Pull Request
|
60
|
+
|
61
|
+
Copyright (c) 2011-2012 Ilya Vorontsov, Ivan Kulakovskiy, Vsevolod Makeev
|
data/Rakefile
ADDED
data/bin/eval_alignment
ADDED
data/bin/eval_similarity
ADDED
data/bin/find_pvalue
ADDED
data/bin/find_threshold
ADDED
data/bin/scan_collection
ADDED
@@ -0,0 +1,136 @@
|
|
1
|
+
module PWMCompare
  # Counts the words that pass the thresholds of BOTH matrices of an aligned pair.
  #
  # The host object must respond to +first+ and +second+ (matrix objects
  # exposing +matrix+, +probabilities+, +sum_of_probabilities+, +best_suffix+
  # and +worst_suffix+) and to +length+ (number of aligned columns).
  #
  # All three counting routines walk the columns left to right and keep a
  # table of partial scores (score on first matrix => score on second matrix
  # => accumulated count). A prefix whose score can no longer drop below the
  # threshold is added to the result immediately (weighted by the number of
  # possible suffixes); a prefix that can no longer reach the threshold is
  # dropped.
  module AlignedPairIntersection

    # Dispatches to the cheapest routine for the backgrounds of the two matrices.
    #
    # Returns a two-element array: the count of common words measured on the
    # background of the first and of the second matrix respectively.
    def counts_for_two_matrices(threshold_first, threshold_second)
      if first.probabilities == second.probabilities
        if first.probabilities == [1,1,1,1]
          common_words_for_two_matrices(threshold_first, threshold_second)
        else
          counts_for_two_matrices_with_same_probabilities(threshold_first, threshold_second)
        end
      else
        counts_for_two_matrices_with_different_probabilities(threshold_first, threshold_second)
      end
    end

    # General case: each matrix has its own background, so two counts are
    # tracked for every pair of partial scores.
    #
    # Returns [count_on_first_probabilities, count_on_second_probabilities].
    # Raises RuntimeError when the score table outgrows MaxHashSize (if defined).
    def counts_for_two_matrices_with_different_probabilities(threshold_first, threshold_second)
      # scores_on_first_pwm, scores_on_second_pwm --> count_on_first_probabilities, count_on_second_probabilities
      scores = { 0 => { 0 => [1, 1] } }
      result_first = 0.0
      result_second = 0.0
      length.times do |column|
        # Weight of all possible suffixes following the current column.
        ending_weight_first = first.sum_of_probabilities ** (length - column - 1)
        ending_weight_second = second.sum_of_probabilities ** (length - column - 1)
        # Prefix score from which even the worst suffix still passes the threshold.
        already_enough_first = threshold_first - first.worst_suffix[column + 1]
        already_enough_second = threshold_second - second.worst_suffix[column + 1]
        # Prefix score below which even the best suffix cannot pass the threshold.
        least_sufficient_first = threshold_first - first.best_suffix[column + 1]
        least_sufficient_second = threshold_second - second.best_suffix[column + 1]

        new_scores = Hash.new { |h, k| h[k] = Hash.new { |h2, k2| h2[k2] = [0, 0] } }
        scores.each do |score_first, second_scores|
          second_scores.each do |score_second, count|
            4.times do |letter|
              new_score_first = score_first + first.matrix[column][letter]
              if new_score_first >= already_enough_first
                new_score_second = score_second + second.matrix[column][letter]
                if new_score_second >= already_enough_second
                  result_first += count[0] * first.probabilities[letter] * ending_weight_first
                  result_second += count[1] * second.probabilities[letter] * ending_weight_second
                elsif new_score_second >= least_sufficient_second
                  new_scores[new_score_first][new_score_second][0] += count[0] * first.probabilities[letter]
                  new_scores[new_score_first][new_score_second][1] += count[1] * second.probabilities[letter]
                end
              elsif new_score_first >= least_sufficient_first
                new_score_second = score_second + second.matrix[column][letter]
                if new_score_second >= least_sufficient_second
                  new_scores[new_score_first][new_score_second][0] += count[0] * first.probabilities[letter]
                  new_scores[new_score_first][new_score_second][1] += count[1] * second.probabilities[letter]
                end
              end
            end
          end
        end
        raise_on_hash_overflow(new_scores, 'counts_for_two_matrices_with_different_probabilities')
        scores = new_scores
      end
      [result_first, result_second]
    end

    # Both matrices share one (non-uniform) background, so a single count per
    # pair of partial scores is enough.
    #
    # Returns [result, result].
    # Raises RuntimeError when the score table outgrows MaxHashSize (if defined).
    def counts_for_two_matrices_with_same_probabilities(threshold_first, threshold_second)
      # scores_on_first_pwm, scores_on_second_pwm --> count on the shared probabilities
      scores = { 0 => { 0 => 1 } }
      result = 0.0
      probabilities = first.probabilities
      length.times do |column|
        ending_weight = first.sum_of_probabilities ** (length - column - 1)
        already_enough_first = threshold_first - first.worst_suffix[column + 1]
        already_enough_second = threshold_second - second.worst_suffix[column + 1]
        least_sufficient_first = threshold_first - first.best_suffix[column + 1]
        least_sufficient_second = threshold_second - second.best_suffix[column + 1]

        new_scores = Hash.new { |h, k| h[k] = Hash.new { |h2, k2| h2[k2] = 0 } }
        scores.each do |score_first, second_scores|
          second_scores.each do |score_second, count|
            4.times do |letter|
              new_score_first = score_first + first.matrix[column][letter]
              if new_score_first >= already_enough_first
                new_score_second = score_second + second.matrix[column][letter]
                if new_score_second >= already_enough_second
                  result += count * probabilities[letter] * ending_weight
                elsif new_score_second >= least_sufficient_second
                  new_scores[new_score_first][new_score_second] += count * probabilities[letter]
                end
              elsif new_score_first >= least_sufficient_first
                new_score_second = score_second + second.matrix[column][letter]
                if new_score_second >= least_sufficient_second
                  new_scores[new_score_first][new_score_second] += count * probabilities[letter]
                end
              end
            end
          end
        end
        raise_on_hash_overflow(new_scores, 'counts_for_two_matrices_with_same_probabilities')
        scores = new_scores
      end
      [result, result]
    end

    # Uniform background [1,1,1,1]: plain word counting with integer arithmetic.
    #
    # Returns [result, result].
    # Raises RuntimeError when the score table outgrows MaxHashSize (if defined).
    def common_words_for_two_matrices(threshold_first, threshold_second)
      # scores_on_first_pwm, scores_on_second_pwm --> number of words
      scores = { 0 => { 0 => 1 } }
      result = 0
      length.times do |column|
        ending_weight = 4 ** (length - column - 1)
        already_enough_first = threshold_first - first.worst_suffix[column + 1]
        already_enough_second = threshold_second - second.worst_suffix[column + 1]
        least_sufficient_first = threshold_first - first.best_suffix[column + 1]
        least_sufficient_second = threshold_second - second.best_suffix[column + 1]

        new_scores = Hash.new { |h, k| h[k] = Hash.new { |h2, k2| h2[k2] = 0 } }
        scores.each do |score_first, second_scores|
          second_scores.each do |score_second, count|
            4.times do |letter|
              new_score_first = score_first + first.matrix[column][letter]
              if new_score_first >= already_enough_first
                new_score_second = score_second + second.matrix[column][letter]
                if new_score_second >= already_enough_second
                  result += count * ending_weight
                elsif new_score_second >= least_sufficient_second
                  new_scores[new_score_first][new_score_second] += count
                end
              elsif new_score_first >= least_sufficient_first
                new_score_second = score_second + second.matrix[column][letter]
                if new_score_second >= least_sufficient_second
                  new_scores[new_score_first][new_score_second] += count
                end
              end
            end
          end
        end

        raise_on_hash_overflow(new_scores, 'common_words_for_two_matrices')
        scores = new_scores
      end
      [result, result]
    end

    private

    # Raises RuntimeError when the score table is considered too large.
    #
    # The limit is optional: when no MaxHashSize constant is visible from this
    # module the check is skipped, so the library also works when nothing has
    # configured a limit.
    def raise_on_hash_overflow(new_scores, method_name)
      return unless defined?(MaxHashSize)
      # NOTE(review): iterating a Hash yields [key, value] pairs, so every outer
      # entry contributes 2 here; the sum is twice the number of distinct
      # first-matrix scores, not the total number of stored score pairs.
      # Left unchanged on purpose: counting real entries would change when
      # existing callers start to see the overflow error.
      size_estimate = new_scores.inject(0) { |sum, hsh| sum + hsh.size }
      return unless size_estimate > MaxHashSize
      raise 'Hash overflow in PWMCompare::AlignedPairIntersection#' + method_name
    end

  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module PWMCompare
  # Similarity metrics for a pair of aligned matrices.
  module AlignedPairMetrics
    # Jaccard similarity of the word sets recognized by the two matrices at
    # the given thresholds.
    #
    # Returns a hash with :similarity, :tanimoto (1 - similarity),
    # :recognized_by_both, :recognized_by_first and :recognized_by_second.
    # When either matrix recognizes nothing, :similarity and :tanimoto are -1
    # and :recognized_by_both is 0.
    def jaccard(first_threshold, second_threshold)
      recognized_by_first = first.counts_by_thresholds(first_threshold).first
      recognized_by_second = second.counts_by_thresholds(second_threshold).first

      if recognized_by_first == 0 || recognized_by_second == 0
        return {
          similarity: -1,
          tanimoto: -1,
          recognized_by_both: 0,
          recognized_by_first: recognized_by_first,
          recognized_by_second: recognized_by_second
        }
      end

      # The intersection is measured once per background; combine the two
      # measurements with a geometric mean.
      on_first, on_second = counts_for_two_matrices(first_threshold, second_threshold)
      recognized_by_both = Math.sqrt(on_first * on_second)
      union = recognized_by_first + recognized_by_second - recognized_by_both
      similarity = recognized_by_both / union

      {
        similarity: similarity,
        tanimoto: 1.0 - similarity,
        recognized_by_both: recognized_by_both,
        recognized_by_first: recognized_by_first,
        recognized_by_second: recognized_by_second
      }
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module PWMCompare
  # Column-reordering transformations of an aligned pair of matrices.
  #
  # The host object must respond to +first+ and +second+; every method returns
  # a new PWMCompareAligned and leaves the receiver untouched.
  module AlignedPairTransformations

    #def discrete(rate)
    #  PWMCompareAligned.new(first.discrete(rate), second.discrete(rate))
    #end

    # Reorders the columns of both matrices simultaneously.
    #
    # The block receives a pair [column_of_first, column_of_second] and returns
    # the sort key. Backgrounds of both matrices are carried over.
    #
    # NOTE(review): assumes PWM::SingleMatrix.new(matrix).with_background(probabilities)
    # builds a matrix object, as the README and command-line tools use it - confirm
    # against single_matrix.rb.
    def sort_pair_of_matrices_by(&block)
      sorted_first, sorted_second = first.matrix.zip(second.matrix).sort_by(&block).transpose
      PWMCompareAligned.new(PWM::SingleMatrix.new(sorted_first).with_background(first.probabilities),
                            PWM::SingleMatrix.new(sorted_second).with_background(second.probabilities))
    end

    # Columns ordered by decreasing maximum of the first matrix's column.
    def sort_decreasing_max
      sort_pair_of_matrices_by { |col_pair| -col_pair[0].max }
    end

    # Columns ordered by increasing minimum of the first matrix's column.
    def sort_increasing_min
      sort_pair_of_matrices_by { |col_pair| col_pair[0].min }
    end

    # Applies the same column permutation to both matrices.
    def permute_columns(permutation_index)
      PWMCompareAligned.new(first.permute(permutation_index), second.permute(permutation_index))
    end

  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module PWM
  # A set of matrices stored together with the settings given at construction
  # (two discretization levels, a background and a list of P-values).
  # Matrices and their accompanying infos are indexed by the matrix name.
  class Collection
    attr_reader :rough_discretization, :precise_discretization, :background, :pvalues, :pwms, :infos

    # Stores the settings as given and starts with no matrices.
    def initialize(rough_discretization, precise_discretization, background, pvalues)
      @rough_discretization = rough_discretization
      @precise_discretization = precise_discretization
      @background = background
      @pvalues = pvalues
      @pwms = {}
      @infos = {}
    end

    # Registers +pwm+ and its +info+ under pwm.name, replacing any earlier
    # entry with the same name. Returns +info+.
    def add_pwm(pwm, info)
      key = pwm.name
      @pwms[key] = pwm
      @infos[key] = info
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module PWM
  # Counting of words scoring not lower than a threshold.
  #
  # The host object must provide +calculate_count_distribution_after_threshold+
  # (used here as a score => count mapping) and +number_of_words+.
  module CountByThreshold
    # For every given threshold returns the total count of scores that are
    # greater than or equal to it, as Floats, in the order the thresholds
    # were passed.
    #
    # The distribution is computed once, for the lowest threshold, and then
    # swept from the highest score down while thresholds are resolved from the
    # highest to the lowest.
    def counts_by_thresholds(*thresholds)
      distribution = calculate_count_distribution_after_threshold(thresholds.min)

      # [threshold, original position] pairs, ascending; the highest is last.
      pending = thresholds.each_with_index.sort_by { |threshold, _position| threshold }
      counts = Array.new(thresholds.size)
      accumulated = 0

      distribution.sort.reverse_each do |score, count|
        # Every threshold above the current score is already fully counted.
        until pending.empty? || score >= pending.last.first
          _threshold, position = pending.pop
          counts[position] = accumulated
        end
        accumulated += count
      end

      # Thresholds not above any score take the whole accumulated count.
      pending.each { |_threshold, position| counts[position] = accumulated }

      counts.map(&:to_f)
    end

    # Count of words scoring at least +threshold+, divided by +number_of_words+.
    def pvalue_by_threshold(threshold)
      counts_by_thresholds(threshold).first / number_of_words
    end
  end
end
|
@@ -0,0 +1,141 @@
|
|
1
|
+
# Command-line tool: prints the Jaccard similarity of two matrices placed in a
# given alignment (shift and orientation of the second matrix).
# Matrices are read from pat-files or from standard input ('.stdin').
help_string = %q{
Command-line format:
ruby eval_alignment.rb <1st matrix pat-file> <2nd matrix pat-file> <shift> <orientation(direct/revcomp)> [options]
or on windows
type <1st matrix pat-file> <2nd matrix pat-file> | ruby eval_alignment.rb .stdin .stdin <shift> <orientation(direct/revcomp)> [options]
or in linux
cat <1st matrix pat-file> <2nd matrix pat-file> | ruby eval_alignment.rb .stdin .stdin <shift> <orientation(direct/revcomp)> [options]

Options:
[-p <P-value>]
[-d <discretization level>]
[-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]

Output format:
<jaccard similarity coefficient>
<number of words recognized by both 1st and 2nd matrices | probability to draw a word recognized by both 1st and 2nd matrices> <length of the given alignment>
<aligned 1st matrix>
<aligned 2nd matrix>
<shift> <orientation>

Examples:
ruby eval_alignment.rb motifs/KLF4_f2.pat motifs/SP1_f1.pat -1 direct -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
or on windows
type motifs/SP1.pat | ruby eval_alignment.rb motifs/KLF4.pat .stdin 0 revcomp -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
or in linux
cat motifs/KLF4.pat motifs/SP1.pat | ruby eval_alignment.rb .stdin .stdin 3 direct -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
}

require 'macroape'

if ARGV.empty? || ARGV.include?('-h') || ARGV.include?('-help') || ARGV.include?('--help') || ARGV.include?('--h')
  STDERR.puts help_string
  exit
end

# Defaults, overridable from the command line.
pvalue = 0.0005
discretization = 10

first_background = [1,1,1,1]
second_background = [1,1,1,1]

begin
  # Positional arguments come first, options follow.
  first_file = ARGV.shift
  second_file = ARGV.shift

  shift = ARGV.shift
  orientation = ARGV.shift

  raise "You'd specify two input sources (each is filename or .stdin)" unless first_file and second_file
  raise 'You\'d specify shift' unless shift
  raise 'You\'d specify orientation' unless orientation

  shift = shift.to_i
  orientation = orientation.to_sym

  case orientation
  when :direct
    reverse = false
  when :revcomp
    reverse = true
  else
    raise 'Unknown orientation(direct/revcomp)'
  end

  # Unrecognized tokens are skipped silently.
  until ARGV.empty?
    case ARGV.shift
    when '-p'
      pvalue = ARGV.shift.to_f
    when '-d'
      discretization = ARGV.shift.to_f
    when '-m'
      PWM::MaxHashSize = ARGV.shift.to_f
    when '-md'
      PWMCompare::MaxHashSize = ARGV.shift.to_f
    when '-b'
      second_background = first_background = ARGV.shift(4).map(&:to_f)
    when '-b1'
      first_background = ARGV.shift(4).map(&:to_f)
    when '-b2'
      second_background = ARGV.shift(4).map(&:to_f)
    end
  end
  # A background must read the same in both directions; the same message is
  # reported for either matrix.
  raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background == first_background.reverse
  raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless second_background == second_background.reverse

  # Limits apply only when not already set through -m / -md.
  PWM::MaxHashSize = 1000000 unless defined? PWM::MaxHashSize
  PWMCompare::MaxHashSize = 1000 unless defined? PWMCompare::MaxHashSize

  # Standard input is copied into a pipe so that extract_pwm can take the
  # matrices from it one after another.
  if first_file == '.stdin' || second_file == '.stdin'
    r_stream, w_stream = IO.pipe
    STDIN.readlines.each{|line| w_stream.write(line)}
    w_stream.close
  end

  if first_file == '.stdin'
    r_stream, w_stream, extracted_pwm = extract_pwm(r_stream, w_stream)
    pwm_first = PWM::SingleMatrix.load_from_line_array(extracted_pwm).with_background(first_background).discrete(discretization)
  else
    raise "Error! File #{first_file} don't exist" unless File.exist?(first_file)
    pwm_first = PWM::SingleMatrix.load_pat(first_file).with_background(first_background).discrete(discretization)
  end

  if second_file == '.stdin'
    r_stream, w_stream, extracted_pwm = extract_pwm(r_stream, w_stream)
    pwm_second = PWM::SingleMatrix.load_from_line_array(extracted_pwm).with_background(second_background).discrete(discretization)
  else
    raise "Error! File #{second_file} don't exist" unless File.exist?(second_file)
    pwm_second = PWM::SingleMatrix.load_pat(second_file).with_background(second_background).discrete(discretization)
  end

  r_stream.close if first_file == '.stdin' || second_file == '.stdin'

  pwm_second = pwm_second.reverse_complement if reverse

  # Textual picture of the alignment: '.' marks a position not covered by the
  # matrix, '>' / '<' mark matrix positions in direct / reverse orientation.
  first_pwm_alignment = '.' * [-shift, 0].max + '>' * pwm_first.length
  second_pwm_alignment = '.' * [shift, 0].max + (orientation == :direct ? '>' : '<') * pwm_second.length
  overlap = [pwm_first.length + [-shift,0].max, pwm_second.length + [shift,0].max].min - shift.abs
  alignment_length = [first_pwm_alignment.length, second_pwm_alignment.length].max
  (first_pwm_alignment.length...alignment_length).each{|i| first_pwm_alignment[i] = '.'}
  (second_pwm_alignment.length...alignment_length).each{|i| second_pwm_alignment[i] = '.'}

  # A positive shift pads the second matrix on the left, a negative one pads the first.
  cmp = PWMCompare::PWMCompareAligned.new(pwm_first.left_augment([-shift,0].max),
                                          pwm_second.left_augment([shift,0].max))

  first_threshold = pwm_first.threshold(pvalue)
  second_threshold = pwm_second.threshold(pvalue)

  info = cmp.jaccard(first_threshold, second_threshold).merge(
    text: "#{first_pwm_alignment}\n#{second_pwm_alignment}",
    shift: shift,
    orientation: orientation,
    overlap: overlap,
    alignment_length: alignment_length)

  puts "#{info[:similarity]}\n#{info[:recognized_by_both]}\t#{info[:alignment_length]}\n#{info[:text]}\n#{info[:shift]}\t#{info[:orientation]}"

rescue => err
  STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
  # Report the failure to the calling shell instead of finishing with status 0.
  exit 1
end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
# Command-line tool: prints the Jaccard similarity of two matrices as reported
# by PWMCompare::PWMCompare#jaccard, together with the alignment it describes.
# Matrices are read from pat-files or from standard input ('.stdin').
help_string = %q{
Command-line format:
ruby eval_similarity.rb <1st matrix pat-file> <2nd matrix pat-file> [options]
or on windows
type <1st matrix pat-file> <2nd matrix pat-file> | ruby eval_similarity.rb .stdin .stdin [options]
or in linux
cat <1st matrix pat-file> <2nd matrix pat-file> | ruby eval_similarity.rb .stdin .stdin [options]

Options:
[-p <P-value>]
[-d <discretization level>]
[-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]

Output has format:
<jaccard similarity coefficient>
<number of words recognized by both 1st and 2nd matrices | probability to draw a word recognized by both 1st and 2nd matrices> <length of the optimal alignment>
<optimal alignment, the 1st matrix>
<optimal alignment, the 2nd matrix>
<shift> <orientation>

Examples:
ruby eval_similarity.rb motifs/KLF4.pat motifs/SP1.pat -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
or on windows
type motifs/SP1.pat | ruby eval_similarity.rb motifs/KLF4.pat .stdin -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
or in linux
cat motifs/KLF4.pat motifs/SP1.pat | ruby eval_similarity.rb .stdin .stdin -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
}

require 'macroape'

if ARGV.empty? || ARGV.include?('-h') || ARGV.include?('-help') || ARGV.include?('--help') || ARGV.include?('--h')
  STDERR.puts help_string
  exit
end

# Defaults, overridable from the command line.
pvalue = 0.0005
discretization = 10

first_background = [1,1,1,1]
second_background = [1,1,1,1]

begin
  # Positional arguments come first, options follow.
  first_file = ARGV.shift
  second_file = ARGV.shift
  raise "You'd specify two input sources (each is filename or .stdin)" unless first_file and second_file

  # Unrecognized tokens are skipped silently.
  until ARGV.empty?
    case ARGV.shift
    when '-p'
      pvalue = ARGV.shift.to_f
    when '-d'
      discretization = ARGV.shift.to_f
    when '-m'
      PWM::MaxHashSize = ARGV.shift.to_f
    when '-md'
      PWMCompare::MaxHashSize = ARGV.shift.to_f
    when '-b'
      second_background = first_background = ARGV.shift(4).map(&:to_f)
    when '-b1'
      first_background = ARGV.shift(4).map(&:to_f)
    when '-b2'
      second_background = ARGV.shift(4).map(&:to_f)
    end
  end
  # A background must read the same in both directions.
  raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background == first_background.reverse
  raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless second_background == second_background.reverse

  # Limits apply only when not already set through -m / -md.
  PWM::MaxHashSize = 1000000 unless defined? PWM::MaxHashSize
  PWMCompare::MaxHashSize = 1000 unless defined? PWMCompare::MaxHashSize

  # Standard input is copied into a pipe so that extract_pwm can take the
  # matrices from it one after another.
  if first_file == '.stdin' || second_file == '.stdin'
    r_stream, w_stream = IO.pipe
    STDIN.readlines.each{|line| w_stream.write(line)}
    w_stream.close
  end

  if first_file == '.stdin'
    r_stream, w_stream, extracted_pwm = extract_pwm(r_stream, w_stream)
    pwm_first = PWM::SingleMatrix.load_from_line_array(extracted_pwm).with_background(first_background).discrete(discretization)
  else
    raise "Error! File #{first_file} don't exist" unless File.exist?(first_file)
    pwm_first = PWM::SingleMatrix.load_pat(first_file).with_background(first_background).discrete(discretization)
  end

  if second_file == '.stdin'
    r_stream, w_stream, extracted_pwm = extract_pwm(r_stream, w_stream)
    pwm_second = PWM::SingleMatrix.load_from_line_array(extracted_pwm).with_background(second_background).discrete(discretization)
  else
    raise "Error! File #{second_file} don't exist" unless File.exist?(second_file)
    pwm_second = PWM::SingleMatrix.load_pat(second_file).with_background(second_background).discrete(discretization)
  end

  r_stream.close if first_file == '.stdin' || second_file == '.stdin'

  cmp = PWMCompare::PWMCompare.new(pwm_first, pwm_second)

  first_threshold = pwm_first.threshold(pvalue)
  second_threshold = pwm_second.threshold(pvalue)

  info = cmp.jaccard(first_threshold, second_threshold)

  puts "#{info[:similarity]}\n#{info[:recognized_by_both]}\t#{info[:alignment_length]}\n#{info[:text]}\n#{info[:shift]}\t#{info[:orientation]}"

rescue => err
  STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
  # Report the failure to the calling shell instead of finishing with status 0.
  exit 1
end
|