macroape 3.3.2 → 3.3.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +0 -1
- data/Rakefile.rb +65 -0
- data/TODO.txt +20 -0
- data/benchmark/similarity_benchmark.rb +56 -0
- data/lib/macroape.rb +1 -2
- data/lib/macroape/aligned_pair_intersection.rb +43 -116
- data/lib/macroape/collection.rb +4 -4
- data/lib/macroape/{threshold_by_pvalue.rb → counting.rb} +28 -18
- data/lib/macroape/exec/eval_alignment.rb +19 -22
- data/lib/macroape/exec/eval_similarity.rb +13 -13
- data/lib/macroape/exec/find_pvalue.rb +7 -7
- data/lib/macroape/exec/find_threshold.rb +8 -8
- data/lib/macroape/exec/preprocess_collection.rb +8 -8
- data/lib/macroape/exec/scan_collection.rb +16 -16
- data/lib/macroape/pwm_compare.rb +2 -3
- data/lib/macroape/pwm_compare_aligned.rb +34 -26
- data/lib/macroape/version.rb +1 -1
- data/spec/count_distribution_spec.rb +52 -0
- data/spec/spec_helper.rb +4 -0
- data/test/eval_alignment_similarity_test.rb +1 -0
- data/test/eval_similarity_test.rb +1 -0
- data/test/find_pvalue_test.rb +1 -0
- data/test/find_threshold_test.rb +1 -0
- data/test/preprocess_collection_test.rb +1 -0
- data/test/scan_collection_test.rb +1 -0
- data/test/test_helper.rb +4 -4
- metadata +10 -5
- data/Rakefile +0 -28
- data/lib/macroape/count_by_threshold.rb +0 -16
@@ -16,10 +16,10 @@ Output format:
|
|
16
16
|
<aligned 1st matrix>
|
17
17
|
<aligned 2nd matrix>
|
18
18
|
<shift> <orientation>
|
19
|
-
|
20
|
-
Examples:
|
19
|
+
|
20
|
+
Examples:
|
21
21
|
ruby eval_alignment.rb motifs/KLF4_f2.pat motifs/SP1_f1.pat -1 direct -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
|
22
|
-
or on windows
|
22
|
+
or on windows
|
23
23
|
type motifs/SP1.pat | ruby eval_alignment.rb motifs/KLF4.pat .stdin 0 revcomp -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
|
24
24
|
or in linux
|
25
25
|
cat motifs/KLF4.pat motifs/SP1.pat | ruby eval_alignment.rb .stdin .stdin 3 direct -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
|
@@ -38,20 +38,20 @@ discretization = 10
|
|
38
38
|
first_background = [1,1,1,1]
|
39
39
|
second_background = [1,1,1,1]
|
40
40
|
|
41
|
-
begin
|
41
|
+
begin
|
42
42
|
first_file = ARGV.shift
|
43
43
|
second_file = ARGV.shift
|
44
|
-
|
44
|
+
|
45
45
|
shift = ARGV.shift
|
46
46
|
orientation = ARGV.shift
|
47
|
-
|
47
|
+
|
48
48
|
raise "You'd specify two input sources (each is filename or .stdin)" unless first_file and second_file
|
49
49
|
raise 'You\'d specify shift' unless shift
|
50
50
|
raise 'You\'d specify orientation' unless orientation
|
51
|
-
|
51
|
+
|
52
52
|
shift = shift.to_i
|
53
53
|
orientation = orientation.to_sym
|
54
|
-
|
54
|
+
|
55
55
|
case orientation
|
56
56
|
when :direct
|
57
57
|
reverse = false
|
@@ -60,7 +60,7 @@ begin
|
|
60
60
|
else
|
61
61
|
raise 'Unknown orientation(direct/revcomp)'
|
62
62
|
end
|
63
|
-
|
63
|
+
|
64
64
|
|
65
65
|
until ARGV.empty?
|
66
66
|
case ARGV.shift
|
@@ -80,19 +80,19 @@ begin
|
|
80
80
|
second_background = ARGV.shift(4).map(&:to_f)
|
81
81
|
end
|
82
82
|
end
|
83
|
-
raise 'background should be symmetric' unless first_background == first_background.reverse
|
83
|
+
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background == first_background.reverse
|
84
84
|
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless second_background == second_background.reverse
|
85
85
|
|
86
|
-
|
86
|
+
|
87
87
|
Macroape::MaxHashSizeSingle = 1000000 unless defined? Macroape::MaxHashSizeSingle
|
88
88
|
Macroape::MaxHashSizeDouble = 1000 unless defined? Macroape::MaxHashSizeDouble
|
89
|
-
|
89
|
+
|
90
90
|
# if first_file == '.stdin' || second_file == '.stdin'
|
91
91
|
# r_stream, w_stream = IO.pipe
|
92
92
|
# STDIN.readlines.each{|line| w_stream.write(line)}
|
93
93
|
# w_stream.close
|
94
94
|
# end
|
95
|
-
|
95
|
+
|
96
96
|
if first_file == '.stdin'
|
97
97
|
# r_stream, w_stream, extracted_pwm = extract_pwm(r_stream, w_stream)
|
98
98
|
# pwm_first = Macroape::SingleMatrix.load_from_line_array(extracted_pwm).with_background(first_background).discrete(discretization)
|
@@ -100,7 +100,7 @@ begin
|
|
100
100
|
raise "Error! File #{first_file} don't exist" unless File.exist?(first_file)
|
101
101
|
pwm_first = Bioinform::PWM.new(File.read(first_file)).background(first_background).discrete(discretization)
|
102
102
|
end
|
103
|
-
|
103
|
+
|
104
104
|
if second_file == '.stdin'
|
105
105
|
# r_stream, w_stream, extracted_pwm = extract_pwm(r_stream, w_stream)
|
106
106
|
# pwm_second = Macroape::SingleMatrix.load_from_line_array(extracted_pwm).with_background(second_background).discrete(discretization)
|
@@ -108,21 +108,18 @@ begin
|
|
108
108
|
raise "Error! File #{second_file} don't exist" unless File.exist?(second_file)
|
109
109
|
pwm_second = Bioinform::PWM.new(File.read(second_file)).background(second_background).discrete(discretization)
|
110
110
|
end
|
111
|
-
|
111
|
+
|
112
112
|
# r_stream.close if first_file == '.stdin' || second_file == '.stdin'
|
113
|
-
|
114
|
-
|
115
|
-
pwm_second.reverse_complement! if reverse
|
116
|
-
|
113
|
+
|
117
114
|
cmp = Macroape::PWMCompareAligned.new(pwm_first, pwm_second, shift, orientation)
|
118
|
-
|
115
|
+
|
119
116
|
first_threshold = pwm_first.threshold(pvalue)
|
120
117
|
second_threshold = pwm_second.threshold(pvalue)
|
121
118
|
|
122
119
|
info = cmp.alignment_infos.merge( cmp.jaccard(first_threshold, second_threshold) )
|
123
|
-
|
120
|
+
|
124
121
|
puts "#{info[:similarity]}\n#{info[:recognized_by_both]}\t#{info[:alignment_length]}\n#{info[:text]}\n#{info[:shift]}\t#{info[:orientation]}"
|
125
|
-
|
122
|
+
|
126
123
|
rescue => err
|
127
124
|
STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
|
128
125
|
end
|
@@ -17,10 +17,10 @@ Output has format:
|
|
17
17
|
<optimal alignment, the 1st matrix>
|
18
18
|
<optimal alignment, the 2nd matrix>
|
19
19
|
<shift> <orientation>
|
20
|
-
|
21
|
-
Examples:
|
20
|
+
|
21
|
+
Examples:
|
22
22
|
ruby eval_similarity.rb motifs/KLF4.pat motifs/SP1.pat -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
|
23
|
-
or on windows
|
23
|
+
or on windows
|
24
24
|
type motifs/SP1.pat | ruby eval_similarity.rb motifs/KLF4.pat .stdin -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
|
25
25
|
or in linux
|
26
26
|
cat motifs/KLF4.pat motifs/SP1.pat | ruby eval_similarity.rb .stdin .stdin -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
|
@@ -40,7 +40,7 @@ discretization = 10
|
|
40
40
|
first_background = [1,1,1,1]
|
41
41
|
second_background = [1,1,1,1]
|
42
42
|
|
43
|
-
begin
|
43
|
+
begin
|
44
44
|
first_file = ARGV.shift
|
45
45
|
second_file = ARGV.shift
|
46
46
|
raise "You'd specify two input sources (each is filename or .stdin)" unless first_file and second_file
|
@@ -60,22 +60,22 @@ begin
|
|
60
60
|
when '-b1'
|
61
61
|
first_background = ARGV.shift(4).map(&:to_f)
|
62
62
|
when '-b2'
|
63
|
-
second_background = ARGV.shift(4).map(&:to_f)
|
63
|
+
second_background = ARGV.shift(4).map(&:to_f)
|
64
64
|
end
|
65
65
|
end
|
66
66
|
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background == first_background.reverse
|
67
67
|
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless second_background == second_background.reverse
|
68
|
-
|
68
|
+
|
69
69
|
Macroape::MaxHashSizeSingle = 1000000 unless defined? Macroape::MaxHashSizeSingle
|
70
70
|
Macroape::MaxHashSizeDouble = 1000 unless defined? Macroape::MaxHashSizeDouble
|
71
|
-
|
72
|
-
|
71
|
+
|
72
|
+
|
73
73
|
# if first_file == '.stdin' || second_file == '.stdin'
|
74
74
|
# r_stream, w_stream = IO.pipe
|
75
75
|
# STDIN.readlines.each{|line| w_stream.write(line)}
|
76
76
|
# w_stream.close
|
77
77
|
# end
|
78
|
-
|
78
|
+
|
79
79
|
if first_file == '.stdin'
|
80
80
|
# r_stream, w_stream, extracted_pwm = extract_pwm(r_stream, w_stream)
|
81
81
|
# pwm_first = Macroape::SingleMatrix.load_from_line_array(extracted_pwm).with_background(first_background).discrete(discretization)
|
@@ -83,7 +83,7 @@ begin
|
|
83
83
|
raise "Error! File #{first_file} don't exist" unless File.exist?(first_file)
|
84
84
|
pwm_first = Bioinform::PWM.new(File.read(first_file)).background(first_background).discrete(discretization)
|
85
85
|
end
|
86
|
-
|
86
|
+
|
87
87
|
if second_file == '.stdin'
|
88
88
|
# r_stream, w_stream, extracted_pwm = extract_pwm(r_stream, w_stream)
|
89
89
|
# pwm_second = Macroape::SingleMatrix.load_from_line_array(extracted_pwm).with_background(second_background).discrete(discretization)
|
@@ -91,7 +91,7 @@ begin
|
|
91
91
|
raise "Error! File #{second_file} don't exist" unless File.exist?(second_file)
|
92
92
|
pwm_second = Bioinform::PWM.new(File.read(second_file)).background(second_background).discrete(discretization)
|
93
93
|
end
|
94
|
-
|
94
|
+
|
95
95
|
r_stream.close if first_file == '.stdin' || second_file == '.stdin'
|
96
96
|
|
97
97
|
cmp = Macroape::PWMCompare.new(pwm_first, pwm_second)
|
@@ -100,9 +100,9 @@ begin
|
|
100
100
|
second_threshold = pwm_second.threshold(pvalue)
|
101
101
|
|
102
102
|
info = cmp.jaccard(first_threshold, second_threshold)
|
103
|
-
|
103
|
+
|
104
104
|
puts "#{info[:similarity]}\n#{info[:recognized_by_both]}\t#{info[:alignment_length]}\n#{info[:text]}\n#{info[:shift]}\t#{info[:orientation]}"
|
105
|
-
|
105
|
+
|
106
106
|
rescue => err
|
107
107
|
STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
|
108
108
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
help_string = %q{
|
2
2
|
Command-line format:
|
3
3
|
ruby find_pvalue.rb <pat-file> <threshold list> [options]
|
4
|
-
or in linux
|
4
|
+
or in linux
|
5
5
|
cat <pat-file> | ruby find_pvalue.rb .stdin <threshold> [options]
|
6
6
|
or on windows
|
7
7
|
type <pat-file> | ruby find_pvalue.rb .stdin <threshold> [options]
|
@@ -16,7 +16,7 @@ Output format:
|
|
16
16
|
threshold_3 count_3 pvalue_3
|
17
17
|
The results are printed out in the same order as in the given threshold list.
|
18
18
|
|
19
|
-
Examples:
|
19
|
+
Examples:
|
20
20
|
ruby find_pvalue.rb motifs/KLF4.pat 7.32 -d 1000 -b 0.2 0.3 0.2 0.3
|
21
21
|
or on windows
|
22
22
|
type motifs/KLF4.pat | ruby find_pvalue.rb .stdin 7.32 4.31 5.42
|
@@ -35,7 +35,7 @@ end
|
|
35
35
|
discretization = 10000
|
36
36
|
background = [1,1,1,1]
|
37
37
|
thresholds = []
|
38
|
-
begin
|
38
|
+
begin
|
39
39
|
filename = ARGV.shift
|
40
40
|
|
41
41
|
loop do
|
@@ -46,10 +46,10 @@ begin
|
|
46
46
|
raise StopIteration
|
47
47
|
end
|
48
48
|
end
|
49
|
-
|
49
|
+
|
50
50
|
raise "No input. You'd specify input source: filename or .stdin" unless filename
|
51
51
|
raise 'You should specify at least one threshold' if thresholds.empty?
|
52
|
-
|
52
|
+
|
53
53
|
until ARGV.empty?
|
54
54
|
case ARGV.shift
|
55
55
|
when '-b'
|
@@ -61,8 +61,8 @@ begin
|
|
61
61
|
end
|
62
62
|
end
|
63
63
|
Macroape::MaxHashSizeSingle = 1000000 unless defined? Macroape::MaxHashSizeSingle
|
64
|
-
|
65
|
-
|
64
|
+
|
65
|
+
|
66
66
|
if filename == '.stdin'
|
67
67
|
# TODO
|
68
68
|
else
|
@@ -14,9 +14,9 @@ Options:
|
|
14
14
|
Output format:
|
15
15
|
requested_pvalue_1 threshold_1 achieved_pvalue_1
|
16
16
|
requested_pvalue_2 threshold_2 achieved_pvalue_2
|
17
|
-
|
18
|
-
|
19
|
-
Example:
|
17
|
+
|
18
|
+
|
19
|
+
Example:
|
20
20
|
ruby find_threshold.rb motifs/KLF4.pat -p 0.001 0.0001 0.0005 -d 1000 -b 0.4 0.3 0.2 0.1
|
21
21
|
}
|
22
22
|
|
@@ -35,7 +35,7 @@ discretization = 10000
|
|
35
35
|
begin
|
36
36
|
filename = ARGV.shift
|
37
37
|
raise "No input. You'd specify input source: filename or .stdin" unless filename
|
38
|
-
|
38
|
+
|
39
39
|
pvalues = []
|
40
40
|
until ARGV.empty?
|
41
41
|
case ARGV.shift
|
@@ -57,18 +57,18 @@ begin
|
|
57
57
|
end
|
58
58
|
end
|
59
59
|
pvalues = default_pvalues if pvalues.empty?
|
60
|
-
|
60
|
+
|
61
61
|
Macroape::MaxHashSizeSingle = 1000000 unless defined? Macroape::MaxHashSizeSingle
|
62
|
-
|
62
|
+
|
63
63
|
if filename == '.stdin'
|
64
64
|
## TODO
|
65
65
|
else
|
66
66
|
raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
|
67
67
|
pwm = Bioinform::PWM.new( File.read(filename) )
|
68
68
|
end
|
69
|
-
|
69
|
+
|
70
70
|
pwm.background(background)
|
71
|
-
|
71
|
+
|
72
72
|
pwm.discrete(discretization).thresholds(*pvalues) do |pvalue, threshold, real_pvalue|
|
73
73
|
puts "#{pvalue}\t#{threshold / discretization}\t#{real_pvalue}"
|
74
74
|
end
|
@@ -10,7 +10,7 @@ Options:
|
|
10
10
|
[--silent] - don't show current progress information during scan (by default this information's written into stderr)
|
11
11
|
|
12
12
|
The tool stores preprocessed Macroape collection to the specified YAML-file.
|
13
|
-
|
13
|
+
|
14
14
|
Example:
|
15
15
|
ruby preprocess_collection.rb ./motifs -p 0.001 0.0005 0.0001 -d 1 10 -b 0.2 0.3 0.2 0.3 -o collection.yaml
|
16
16
|
}
|
@@ -34,7 +34,7 @@ begin
|
|
34
34
|
folder = ARGV.shift
|
35
35
|
raise "No input. You'd specify folder with pat-files" unless folder
|
36
36
|
raise "Error! Folder #{folder} doesn't exist" unless Dir.exist?(folder)
|
37
|
-
|
37
|
+
|
38
38
|
pvalues = []
|
39
39
|
silent = false
|
40
40
|
until ARGV.empty?
|
@@ -64,7 +64,7 @@ begin
|
|
64
64
|
end
|
65
65
|
end
|
66
66
|
pvalues = default_pvalues if pvalues.empty?
|
67
|
-
|
67
|
+
|
68
68
|
Macroape::MaxHashSizeSingle = 1000000 unless defined? Macroape::MaxHashSizeSingle
|
69
69
|
Macroape::MaxHashSizeDouble = 1000 unless defined? Macroape::MaxHashSizeDouble
|
70
70
|
|
@@ -75,14 +75,14 @@ begin
|
|
75
75
|
STDERR.puts filename unless silent
|
76
76
|
pwm = Bioinform::PWM.new(File.read(filename))
|
77
77
|
pwm.name ||= File.basename(filename, File.extname(filename))
|
78
|
-
|
79
|
-
# When support of onefile collections is introduced - then here should be check if name exists.
|
78
|
+
|
79
|
+
# When support of onefile collections is introduced - then here should be check if name exists.
|
80
80
|
# Otherwise it should skip motif and tell you about this
|
81
81
|
# Also two command line options to fail on skipping or to skip silently should be included
|
82
|
-
|
82
|
+
|
83
83
|
info = {rough: {}, precise: {}}
|
84
84
|
pwm.background(background)
|
85
|
-
|
85
|
+
|
86
86
|
pwm.discrete(rough_discretization).thresholds(*pvalues) do |pvalue, threshold, real_pvalue|
|
87
87
|
info[:rough][pvalue] = threshold / rough_discretization
|
88
88
|
end
|
@@ -90,7 +90,7 @@ begin
|
|
90
90
|
pwm.discrete(precise_discretization).thresholds(*pvalues) do |pvalue, threshold, real_pvalue|
|
91
91
|
info[:precise][pvalue] = threshold / precise_discretization
|
92
92
|
end
|
93
|
-
|
93
|
+
|
94
94
|
collection.add_pwm(pwm, info)
|
95
95
|
end
|
96
96
|
File.open(output_file,'w') do |f|
|
@@ -11,13 +11,13 @@ Options:
|
|
11
11
|
[-c <similarity cutoff (minimal similarity to be included in output)> ] or [--all], '-c 0.05' by default
|
12
12
|
[--precise [<level, minimal similarity to check on a more precise discretization level on the second pass>]], off by default, '--precise 0.01' if level is not set
|
13
13
|
[--silent] - don't show current progress information during scan (by default this information's written into stderr)
|
14
|
-
|
14
|
+
|
15
15
|
Output format:
|
16
16
|
<name> <similarity jaccard index> <shift> <overlap> <orientation> * [in case that result calculated on the second pass(in precise mode)]
|
17
|
-
Attention! Name can contain whitespace characters.
|
17
|
+
Attention! Name can contain whitespace characters.
|
18
18
|
Attention! The shift and orientation are reported for the collection matrix relative to the query matrix.
|
19
|
-
|
20
|
-
Example:
|
19
|
+
|
20
|
+
Example:
|
21
21
|
ruby scan_collection.rb motifs/KLF4.pat collection.yaml -p 0.005
|
22
22
|
or in linux
|
23
23
|
cat motifs/KLF4.pat | ruby scan_collection.rb .stdin collection.yaml -p 0.005 --precise 0.03
|
@@ -38,7 +38,7 @@ begin
|
|
38
38
|
raise "No input. You'd specify input source for pat: filename or .stdin" unless filename
|
39
39
|
raise "No input. You'd specify input file with collection" unless collection_file
|
40
40
|
raise "Collection file #{collection_file} doesn't exist" unless File.exist?(collection_file)
|
41
|
-
|
41
|
+
|
42
42
|
pvalue = 0.0005
|
43
43
|
cutoff = 0.05 # minimal similarity to output
|
44
44
|
collection = YAML.load_file(collection_file)
|
@@ -65,7 +65,7 @@ begin
|
|
65
65
|
silent = true
|
66
66
|
when '--precise'
|
67
67
|
precision_mode = :precise
|
68
|
-
begin
|
68
|
+
begin
|
69
69
|
Float(ARGV.first)
|
70
70
|
minimal_similarity = ARGV.shift.to_f
|
71
71
|
rescue
|
@@ -75,26 +75,26 @@ begin
|
|
75
75
|
end
|
76
76
|
Macroape::MaxHashSizeSingle = 1000000 unless defined? Macroape::MaxHashSizeSingle
|
77
77
|
Macroape::MaxHashSizeDouble = 1000 unless defined? Macroape::MaxHashSizeDouble
|
78
|
-
|
78
|
+
|
79
79
|
raise "Thresholds for pvalue #{pvalue} aren't presented in collection (#{collection.pvalues.join(', ')}). Use one of listed pvalues or recalculate the collection with needed pvalue" unless collection.pvalues.include? pvalue
|
80
|
-
|
80
|
+
|
81
81
|
if filename == '.stdin'
|
82
82
|
# query_pwm = Macroape::SingleMatrix.load_from_stdin(STDIN)
|
83
83
|
else
|
84
84
|
raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
|
85
85
|
query_pwm = Bioinform::PWM.new(File.read(filename))
|
86
86
|
end
|
87
|
-
|
88
|
-
|
87
|
+
|
88
|
+
|
89
89
|
query_pwm_rough = query_pwm.background(background_query).discrete(collection.rough_discretization)
|
90
90
|
query_pwm_precise = query_pwm.background(background_query).discrete(collection.precise_discretization)
|
91
|
-
|
91
|
+
|
92
92
|
threshold = query_pwm_rough.threshold(pvalue)
|
93
93
|
threshold_precise = query_pwm_precise.threshold(pvalue)
|
94
|
-
|
94
|
+
|
95
95
|
similarities = {}
|
96
96
|
precision_file_mode = {}
|
97
|
-
|
97
|
+
|
98
98
|
collection.pwms.each_key do |name|
|
99
99
|
pwm = collection.pwms[name]
|
100
100
|
pwm_info = collection.infos[name]
|
@@ -102,7 +102,7 @@ begin
|
|
102
102
|
cmp = Macroape::PWMCompare.new(query_pwm_rough, pwm.background(collection.background).discrete(collection.rough_discretization))
|
103
103
|
info = cmp.jaccard(threshold, pwm_info[:rough][pvalue] * collection.rough_discretization)
|
104
104
|
precision_file_mode[name] = :rough
|
105
|
-
|
105
|
+
|
106
106
|
if precision_mode == :precise and info[:similarity] >= minimal_similarity
|
107
107
|
cmp = Macroape::PWMCompare.new(query_pwm_precise, pwm.background(collection.background).discrete(collection.precise_discretization))
|
108
108
|
info = cmp.jaccard(threshold_precise, pwm_info[:precise][pvalue] * collection.precise_discretization)
|
@@ -110,7 +110,7 @@ begin
|
|
110
110
|
end
|
111
111
|
similarities[name] = info
|
112
112
|
end
|
113
|
-
|
113
|
+
|
114
114
|
puts "#pwm\tsimilarity\tshift\toverlap\torientation"
|
115
115
|
similarities.sort_by do |name, info|
|
116
116
|
info[:similarity]
|
@@ -118,7 +118,7 @@ begin
|
|
118
118
|
precision_text = (precision_file_mode[name] == :precise) ? "\t*" : ""
|
119
119
|
puts "#{name}\t#{info[:similarity]}\t#{info[:shift]}\t#{info[:overlap]}\t#{info[:orientation]}#{precision_text}" if info[:similarity] >= cutoff
|
120
120
|
end
|
121
|
-
|
121
|
+
|
122
122
|
rescue => err
|
123
123
|
STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
|
124
124
|
end
|
data/lib/macroape/pwm_compare.rb
CHANGED
@@ -13,12 +13,11 @@ module Macroape
|
|
13
13
|
end
|
14
14
|
|
15
15
|
def each_alignment
|
16
|
-
second_rc = second.reverse_complement
|
17
16
|
(-second.length..first.length).to_a.product([:direct,:revcomp]) do |shift, orientation|
|
18
|
-
yield PWMCompareAligned.new(first,
|
17
|
+
yield PWMCompareAligned.new(first, second, shift, orientation)
|
19
18
|
end
|
20
19
|
end
|
21
|
-
|
20
|
+
|
22
21
|
include Enumerable
|
23
22
|
alias_method :each, :each_alignment
|
24
23
|
alias_method :map_each_alignment, :map
|
@@ -2,31 +2,37 @@ require 'macroape/aligned_pair_intersection'
|
|
2
2
|
|
3
3
|
module Macroape
|
4
4
|
class PWMCompareAligned
|
5
|
-
attr_reader :first, :second, :length, :shift, :orientation, :
|
6
|
-
def initialize(
|
7
|
-
@unaligned_first, @unaligned_second = first, second
|
5
|
+
attr_reader :first, :second, :length, :shift, :orientation, :first_length, :second_length
|
6
|
+
def initialize(first_unaligned, second_unaligned, shift, orientation)
|
8
7
|
@shift, @orientation = shift, orientation
|
8
|
+
|
9
|
+
@first_length, @second_length = first_unaligned.length, second_unaligned.length
|
10
|
+
@length = self.class.calculate_alignment_length(@first_length, @second_length, @shift)
|
11
|
+
|
12
|
+
first, second = first_unaligned, second_unaligned
|
13
|
+
second = second.reverse_complement if revcomp?
|
14
|
+
|
9
15
|
if shift > 0
|
10
|
-
|
16
|
+
second = second.left_augment(shift)
|
11
17
|
else
|
12
|
-
first
|
18
|
+
first = first.left_augment(-shift)
|
13
19
|
end
|
14
|
-
|
20
|
+
|
15
21
|
@first = first.right_augment(@length - first.length)
|
16
22
|
@second = second.right_augment(@length - second.length)
|
17
23
|
end
|
18
|
-
|
24
|
+
|
19
25
|
def direct?
|
20
26
|
orientation == :direct
|
21
27
|
end
|
22
28
|
def revcomp?
|
23
29
|
orientation == :revcomp
|
24
30
|
end
|
25
|
-
|
31
|
+
|
26
32
|
def overlap
|
27
33
|
length.times.count{|pos| first_overlaps?(pos) && second_overlaps?(pos) }
|
28
34
|
end
|
29
|
-
|
35
|
+
|
30
36
|
def first_pwm_alignment
|
31
37
|
length.times.map do |pos|
|
32
38
|
if first_overlaps?(pos)
|
@@ -36,7 +42,7 @@ module Macroape
|
|
36
42
|
end
|
37
43
|
end.join
|
38
44
|
end
|
39
|
-
|
45
|
+
|
40
46
|
def second_pwm_alignment
|
41
47
|
length.times.map do |pos|
|
42
48
|
if second_overlaps?(pos)
|
@@ -46,7 +52,7 @@ module Macroape
|
|
46
52
|
end
|
47
53
|
end.join
|
48
54
|
end
|
49
|
-
|
55
|
+
|
50
56
|
def alignment_infos
|
51
57
|
{shift: shift,
|
52
58
|
orientation: orientation,
|
@@ -54,15 +60,8 @@ module Macroape
|
|
54
60
|
overlap: overlap,
|
55
61
|
alignment_length: length}
|
56
62
|
end
|
57
|
-
|
58
|
-
|
59
|
-
unaligned_first.length
|
60
|
-
end
|
61
|
-
def second_length
|
62
|
-
unaligned_second.length
|
63
|
-
end
|
64
|
-
|
65
|
-
# whether first matrix overlap specified position
|
63
|
+
|
64
|
+
# whether first matrix overlap specified position of alignment
|
66
65
|
def first_overlaps?(pos)
|
67
66
|
return false unless pos >= 0 && pos < length
|
68
67
|
if shift > 0
|
@@ -71,7 +70,7 @@ module Macroape
|
|
71
70
|
pos >= -shift && pos < -shift + first_length
|
72
71
|
end
|
73
72
|
end
|
74
|
-
|
73
|
+
|
75
74
|
def second_overlaps?(pos)
|
76
75
|
return false unless pos >= 0 && pos < length
|
77
76
|
if shift > 0
|
@@ -80,8 +79,8 @@ module Macroape
|
|
80
79
|
pos < second_length
|
81
80
|
end
|
82
81
|
end
|
83
|
-
|
84
|
-
=begin
|
82
|
+
|
83
|
+
=begin
|
85
84
|
def discrete(rate)
|
86
85
|
PWMCompareAligned.new(first.discrete(rate), second.discrete(rate))
|
87
86
|
end
|
@@ -104,13 +103,13 @@ module Macroape
|
|
104
103
|
def jaccard(first_threshold, second_threshold)
|
105
104
|
f = first.counts_by_thresholds(first_threshold).first
|
106
105
|
s = second.counts_by_thresholds(second_threshold).first
|
107
|
-
if f == 0
|
106
|
+
if f == 0 || s == 0
|
108
107
|
return {similarity: -1, tanimoto: -1, recognized_by_both: 0,
|
109
108
|
recognized_by_first: f,
|
110
109
|
recognized_by_second: s,
|
111
110
|
}
|
112
111
|
end
|
113
|
-
|
112
|
+
|
114
113
|
intersect = counts_for_two_matrices(first_threshold, second_threshold)
|
115
114
|
intersect = Math.sqrt(intersect[0] * intersect[1])
|
116
115
|
union = f + s - intersect
|
@@ -118,6 +117,15 @@ module Macroape
|
|
118
117
|
{ similarity: similarity, tanimoto: 1.0 - similarity, recognized_by_both: intersect,
|
119
118
|
recognized_by_first: f, recognized_by_second: s }
|
120
119
|
end
|
121
|
-
|
120
|
+
|
121
|
+
def self.calculate_alignment_length(first_len, second_len, shift)
|
122
|
+
if shift > 0
|
123
|
+
[first_len, second_len + shift].max
|
124
|
+
else
|
125
|
+
[first_len - shift, second_len].max
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
122
129
|
end
|
130
|
+
|
123
131
|
end
|