macroape 3.3.2 → 3.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +0 -1
- data/Rakefile.rb +65 -0
- data/TODO.txt +20 -0
- data/benchmark/similarity_benchmark.rb +56 -0
- data/lib/macroape.rb +1 -2
- data/lib/macroape/aligned_pair_intersection.rb +43 -116
- data/lib/macroape/collection.rb +4 -4
- data/lib/macroape/{threshold_by_pvalue.rb → counting.rb} +28 -18
- data/lib/macroape/exec/eval_alignment.rb +19 -22
- data/lib/macroape/exec/eval_similarity.rb +13 -13
- data/lib/macroape/exec/find_pvalue.rb +7 -7
- data/lib/macroape/exec/find_threshold.rb +8 -8
- data/lib/macroape/exec/preprocess_collection.rb +8 -8
- data/lib/macroape/exec/scan_collection.rb +16 -16
- data/lib/macroape/pwm_compare.rb +2 -3
- data/lib/macroape/pwm_compare_aligned.rb +34 -26
- data/lib/macroape/version.rb +1 -1
- data/spec/count_distribution_spec.rb +52 -0
- data/spec/spec_helper.rb +4 -0
- data/test/eval_alignment_similarity_test.rb +1 -0
- data/test/eval_similarity_test.rb +1 -0
- data/test/find_pvalue_test.rb +1 -0
- data/test/find_threshold_test.rb +1 -0
- data/test/preprocess_collection_test.rb +1 -0
- data/test/scan_collection_test.rb +1 -0
- data/test/test_helper.rb +4 -4
- metadata +10 -5
- data/Rakefile +0 -28
- data/lib/macroape/count_by_threshold.rb +0 -16
@@ -16,10 +16,10 @@ Output format:
|
|
16
16
|
<aligned 1st matrix>
|
17
17
|
<aligned 2nd matrix>
|
18
18
|
<shift> <orientation>
|
19
|
-
|
20
|
-
Examples:
|
19
|
+
|
20
|
+
Examples:
|
21
21
|
ruby eval_alignment.rb motifs/KLF4_f2.pat motifs/SP1_f1.pat -1 direct -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
|
22
|
-
or on windows
|
22
|
+
or on windows
|
23
23
|
type motifs/SP1.pat | ruby eval_alignment.rb motifs/KLF4.pat .stdin 0 revcomp -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
|
24
24
|
or in linux
|
25
25
|
cat motifs/KLF4.pat motifs/SP1.pat | ruby eval_alignment.rb .stdin .stdin 3 direct -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
|
@@ -38,20 +38,20 @@ discretization = 10
|
|
38
38
|
first_background = [1,1,1,1]
|
39
39
|
second_background = [1,1,1,1]
|
40
40
|
|
41
|
-
begin
|
41
|
+
begin
|
42
42
|
first_file = ARGV.shift
|
43
43
|
second_file = ARGV.shift
|
44
|
-
|
44
|
+
|
45
45
|
shift = ARGV.shift
|
46
46
|
orientation = ARGV.shift
|
47
|
-
|
47
|
+
|
48
48
|
raise "You'd specify two input sources (each is filename or .stdin)" unless first_file and second_file
|
49
49
|
raise 'You\'d specify shift' unless shift
|
50
50
|
raise 'You\'d specify orientation' unless orientation
|
51
|
-
|
51
|
+
|
52
52
|
shift = shift.to_i
|
53
53
|
orientation = orientation.to_sym
|
54
|
-
|
54
|
+
|
55
55
|
case orientation
|
56
56
|
when :direct
|
57
57
|
reverse = false
|
@@ -60,7 +60,7 @@ begin
|
|
60
60
|
else
|
61
61
|
raise 'Unknown orientation(direct/revcomp)'
|
62
62
|
end
|
63
|
-
|
63
|
+
|
64
64
|
|
65
65
|
until ARGV.empty?
|
66
66
|
case ARGV.shift
|
@@ -80,19 +80,19 @@ begin
|
|
80
80
|
second_background = ARGV.shift(4).map(&:to_f)
|
81
81
|
end
|
82
82
|
end
|
83
|
-
raise 'background should be symmetric' unless first_background == first_background.reverse
|
83
|
+
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background == first_background.reverse
|
84
84
|
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless second_background == second_background.reverse
|
85
85
|
|
86
|
-
|
86
|
+
|
87
87
|
Macroape::MaxHashSizeSingle = 1000000 unless defined? Macroape::MaxHashSizeSingle
|
88
88
|
Macroape::MaxHashSizeDouble = 1000 unless defined? Macroape::MaxHashSizeDouble
|
89
|
-
|
89
|
+
|
90
90
|
# if first_file == '.stdin' || second_file == '.stdin'
|
91
91
|
# r_stream, w_stream = IO.pipe
|
92
92
|
# STDIN.readlines.each{|line| w_stream.write(line)}
|
93
93
|
# w_stream.close
|
94
94
|
# end
|
95
|
-
|
95
|
+
|
96
96
|
if first_file == '.stdin'
|
97
97
|
# r_stream, w_stream, extracted_pwm = extract_pwm(r_stream, w_stream)
|
98
98
|
# pwm_first = Macroape::SingleMatrix.load_from_line_array(extracted_pwm).with_background(first_background).discrete(discretization)
|
@@ -100,7 +100,7 @@ begin
|
|
100
100
|
raise "Error! File #{first_file} don't exist" unless File.exist?(first_file)
|
101
101
|
pwm_first = Bioinform::PWM.new(File.read(first_file)).background(first_background).discrete(discretization)
|
102
102
|
end
|
103
|
-
|
103
|
+
|
104
104
|
if second_file == '.stdin'
|
105
105
|
# r_stream, w_stream, extracted_pwm = extract_pwm(r_stream, w_stream)
|
106
106
|
# pwm_second = Macroape::SingleMatrix.load_from_line_array(extracted_pwm).with_background(second_background).discrete(discretization)
|
@@ -108,21 +108,18 @@ begin
|
|
108
108
|
raise "Error! File #{second_file} don't exist" unless File.exist?(second_file)
|
109
109
|
pwm_second = Bioinform::PWM.new(File.read(second_file)).background(second_background).discrete(discretization)
|
110
110
|
end
|
111
|
-
|
111
|
+
|
112
112
|
# r_stream.close if first_file == '.stdin' || second_file == '.stdin'
|
113
|
-
|
114
|
-
|
115
|
-
pwm_second.reverse_complement! if reverse
|
116
|
-
|
113
|
+
|
117
114
|
cmp = Macroape::PWMCompareAligned.new(pwm_first, pwm_second, shift, orientation)
|
118
|
-
|
115
|
+
|
119
116
|
first_threshold = pwm_first.threshold(pvalue)
|
120
117
|
second_threshold = pwm_second.threshold(pvalue)
|
121
118
|
|
122
119
|
info = cmp.alignment_infos.merge( cmp.jaccard(first_threshold, second_threshold) )
|
123
|
-
|
120
|
+
|
124
121
|
puts "#{info[:similarity]}\n#{info[:recognized_by_both]}\t#{info[:alignment_length]}\n#{info[:text]}\n#{info[:shift]}\t#{info[:orientation]}"
|
125
|
-
|
122
|
+
|
126
123
|
rescue => err
|
127
124
|
STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
|
128
125
|
end
|
@@ -17,10 +17,10 @@ Output has format:
|
|
17
17
|
<optimal alignment, the 1st matrix>
|
18
18
|
<optimal alignment, the 2nd matrix>
|
19
19
|
<shift> <orientation>
|
20
|
-
|
21
|
-
Examples:
|
20
|
+
|
21
|
+
Examples:
|
22
22
|
ruby eval_similarity.rb motifs/KLF4.pat motifs/SP1.pat -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
|
23
|
-
or on windows
|
23
|
+
or on windows
|
24
24
|
type motifs/SP1.pat | ruby eval_similarity.rb motifs/KLF4.pat .stdin -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
|
25
25
|
or in linux
|
26
26
|
cat motifs/KLF4.pat motifs/SP1.pat | ruby eval_similarity.rb .stdin .stdin -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
|
@@ -40,7 +40,7 @@ discretization = 10
|
|
40
40
|
first_background = [1,1,1,1]
|
41
41
|
second_background = [1,1,1,1]
|
42
42
|
|
43
|
-
begin
|
43
|
+
begin
|
44
44
|
first_file = ARGV.shift
|
45
45
|
second_file = ARGV.shift
|
46
46
|
raise "You'd specify two input sources (each is filename or .stdin)" unless first_file and second_file
|
@@ -60,22 +60,22 @@ begin
|
|
60
60
|
when '-b1'
|
61
61
|
first_background = ARGV.shift(4).map(&:to_f)
|
62
62
|
when '-b2'
|
63
|
-
second_background = ARGV.shift(4).map(&:to_f)
|
63
|
+
second_background = ARGV.shift(4).map(&:to_f)
|
64
64
|
end
|
65
65
|
end
|
66
66
|
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background == first_background.reverse
|
67
67
|
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless second_background == second_background.reverse
|
68
|
-
|
68
|
+
|
69
69
|
Macroape::MaxHashSizeSingle = 1000000 unless defined? Macroape::MaxHashSizeSingle
|
70
70
|
Macroape::MaxHashSizeDouble = 1000 unless defined? Macroape::MaxHashSizeDouble
|
71
|
-
|
72
|
-
|
71
|
+
|
72
|
+
|
73
73
|
# if first_file == '.stdin' || second_file == '.stdin'
|
74
74
|
# r_stream, w_stream = IO.pipe
|
75
75
|
# STDIN.readlines.each{|line| w_stream.write(line)}
|
76
76
|
# w_stream.close
|
77
77
|
# end
|
78
|
-
|
78
|
+
|
79
79
|
if first_file == '.stdin'
|
80
80
|
# r_stream, w_stream, extracted_pwm = extract_pwm(r_stream, w_stream)
|
81
81
|
# pwm_first = Macroape::SingleMatrix.load_from_line_array(extracted_pwm).with_background(first_background).discrete(discretization)
|
@@ -83,7 +83,7 @@ begin
|
|
83
83
|
raise "Error! File #{first_file} don't exist" unless File.exist?(first_file)
|
84
84
|
pwm_first = Bioinform::PWM.new(File.read(first_file)).background(first_background).discrete(discretization)
|
85
85
|
end
|
86
|
-
|
86
|
+
|
87
87
|
if second_file == '.stdin'
|
88
88
|
# r_stream, w_stream, extracted_pwm = extract_pwm(r_stream, w_stream)
|
89
89
|
# pwm_second = Macroape::SingleMatrix.load_from_line_array(extracted_pwm).with_background(second_background).discrete(discretization)
|
@@ -91,7 +91,7 @@ begin
|
|
91
91
|
raise "Error! File #{second_file} don't exist" unless File.exist?(second_file)
|
92
92
|
pwm_second = Bioinform::PWM.new(File.read(second_file)).background(second_background).discrete(discretization)
|
93
93
|
end
|
94
|
-
|
94
|
+
|
95
95
|
r_stream.close if first_file == '.stdin' || second_file == '.stdin'
|
96
96
|
|
97
97
|
cmp = Macroape::PWMCompare.new(pwm_first, pwm_second)
|
@@ -100,9 +100,9 @@ begin
|
|
100
100
|
second_threshold = pwm_second.threshold(pvalue)
|
101
101
|
|
102
102
|
info = cmp.jaccard(first_threshold, second_threshold)
|
103
|
-
|
103
|
+
|
104
104
|
puts "#{info[:similarity]}\n#{info[:recognized_by_both]}\t#{info[:alignment_length]}\n#{info[:text]}\n#{info[:shift]}\t#{info[:orientation]}"
|
105
|
-
|
105
|
+
|
106
106
|
rescue => err
|
107
107
|
STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
|
108
108
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
help_string = %q{
|
2
2
|
Command-line format:
|
3
3
|
ruby find_pvalue.rb <pat-file> <threshold list> [options]
|
4
|
-
or in linux
|
4
|
+
or in linux
|
5
5
|
cat <pat-file> | ruby find_pvalue.rb .stdin <threshold> [options]
|
6
6
|
or on windows
|
7
7
|
type <pat-file> | ruby find_pvalue.rb .stdin <threshold> [options]
|
@@ -16,7 +16,7 @@ Output format:
|
|
16
16
|
threshold_3 count_3 pvalue_3
|
17
17
|
The results are printed out in the same order as in the given threshold list.
|
18
18
|
|
19
|
-
Examples:
|
19
|
+
Examples:
|
20
20
|
ruby find_pvalue.rb motifs/KLF4.pat 7.32 -d 1000 -b 0.2 0.3 0.2 0.3
|
21
21
|
or on windows
|
22
22
|
type motifs/KLF4.pat | ruby find_pvalue.rb .stdin 7.32 4.31 5.42
|
@@ -35,7 +35,7 @@ end
|
|
35
35
|
discretization = 10000
|
36
36
|
background = [1,1,1,1]
|
37
37
|
thresholds = []
|
38
|
-
begin
|
38
|
+
begin
|
39
39
|
filename = ARGV.shift
|
40
40
|
|
41
41
|
loop do
|
@@ -46,10 +46,10 @@ begin
|
|
46
46
|
raise StopIteration
|
47
47
|
end
|
48
48
|
end
|
49
|
-
|
49
|
+
|
50
50
|
raise "No input. You'd specify input source: filename or .stdin" unless filename
|
51
51
|
raise 'You should specify at least one threshold' if thresholds.empty?
|
52
|
-
|
52
|
+
|
53
53
|
until ARGV.empty?
|
54
54
|
case ARGV.shift
|
55
55
|
when '-b'
|
@@ -61,8 +61,8 @@ begin
|
|
61
61
|
end
|
62
62
|
end
|
63
63
|
Macroape::MaxHashSizeSingle = 1000000 unless defined? Macroape::MaxHashSizeSingle
|
64
|
-
|
65
|
-
|
64
|
+
|
65
|
+
|
66
66
|
if filename == '.stdin'
|
67
67
|
# TODO
|
68
68
|
else
|
@@ -14,9 +14,9 @@ Options:
|
|
14
14
|
Output format:
|
15
15
|
requested_pvalue_1 threshold_1 achieved_pvalue_1
|
16
16
|
requested_pvalue_2 threshold_2 achieved_pvalue_2
|
17
|
-
|
18
|
-
|
19
|
-
Example:
|
17
|
+
|
18
|
+
|
19
|
+
Example:
|
20
20
|
ruby find_threshold.rb motifs/KLF4.pat -p 0.001 0.0001 0.0005 -d 1000 -b 0.4 0.3 0.2 0.1
|
21
21
|
}
|
22
22
|
|
@@ -35,7 +35,7 @@ discretization = 10000
|
|
35
35
|
begin
|
36
36
|
filename = ARGV.shift
|
37
37
|
raise "No input. You'd specify input source: filename or .stdin" unless filename
|
38
|
-
|
38
|
+
|
39
39
|
pvalues = []
|
40
40
|
until ARGV.empty?
|
41
41
|
case ARGV.shift
|
@@ -57,18 +57,18 @@ begin
|
|
57
57
|
end
|
58
58
|
end
|
59
59
|
pvalues = default_pvalues if pvalues.empty?
|
60
|
-
|
60
|
+
|
61
61
|
Macroape::MaxHashSizeSingle = 1000000 unless defined? Macroape::MaxHashSizeSingle
|
62
|
-
|
62
|
+
|
63
63
|
if filename == '.stdin'
|
64
64
|
## TODO
|
65
65
|
else
|
66
66
|
raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
|
67
67
|
pwm = Bioinform::PWM.new( File.read(filename) )
|
68
68
|
end
|
69
|
-
|
69
|
+
|
70
70
|
pwm.background(background)
|
71
|
-
|
71
|
+
|
72
72
|
pwm.discrete(discretization).thresholds(*pvalues) do |pvalue, threshold, real_pvalue|
|
73
73
|
puts "#{pvalue}\t#{threshold / discretization}\t#{real_pvalue}"
|
74
74
|
end
|
@@ -10,7 +10,7 @@ Options:
|
|
10
10
|
[--silent] - don't show current progress information during scan (by default this information's written into stderr)
|
11
11
|
|
12
12
|
The tool stores preprocessed Macroape collection to the specified YAML-file.
|
13
|
-
|
13
|
+
|
14
14
|
Example:
|
15
15
|
ruby preprocess_collection.rb ./motifs -p 0.001 0.0005 0.0001 -d 1 10 -b 0.2 0.3 0.2 0.3 -o collection.yaml
|
16
16
|
}
|
@@ -34,7 +34,7 @@ begin
|
|
34
34
|
folder = ARGV.shift
|
35
35
|
raise "No input. You'd specify folder with pat-files" unless folder
|
36
36
|
raise "Error! Folder #{folder} doesn't exist" unless Dir.exist?(folder)
|
37
|
-
|
37
|
+
|
38
38
|
pvalues = []
|
39
39
|
silent = false
|
40
40
|
until ARGV.empty?
|
@@ -64,7 +64,7 @@ begin
|
|
64
64
|
end
|
65
65
|
end
|
66
66
|
pvalues = default_pvalues if pvalues.empty?
|
67
|
-
|
67
|
+
|
68
68
|
Macroape::MaxHashSizeSingle = 1000000 unless defined? Macroape::MaxHashSizeSingle
|
69
69
|
Macroape::MaxHashSizeDouble = 1000 unless defined? Macroape::MaxHashSizeDouble
|
70
70
|
|
@@ -75,14 +75,14 @@ begin
|
|
75
75
|
STDERR.puts filename unless silent
|
76
76
|
pwm = Bioinform::PWM.new(File.read(filename))
|
77
77
|
pwm.name ||= File.basename(filename, File.extname(filename))
|
78
|
-
|
79
|
-
# When support of onefile collections is introduced - then here should be check if name exists.
|
78
|
+
|
79
|
+
# When support of onefile collections is introduced - then here should be check if name exists.
|
80
80
|
# Otherwise it should skip motif and tell you about this
|
81
81
|
# Also two command line options to fail on skipping or to skip silently should be included
|
82
|
-
|
82
|
+
|
83
83
|
info = {rough: {}, precise: {}}
|
84
84
|
pwm.background(background)
|
85
|
-
|
85
|
+
|
86
86
|
pwm.discrete(rough_discretization).thresholds(*pvalues) do |pvalue, threshold, real_pvalue|
|
87
87
|
info[:rough][pvalue] = threshold / rough_discretization
|
88
88
|
end
|
@@ -90,7 +90,7 @@ begin
|
|
90
90
|
pwm.discrete(precise_discretization).thresholds(*pvalues) do |pvalue, threshold, real_pvalue|
|
91
91
|
info[:precise][pvalue] = threshold / precise_discretization
|
92
92
|
end
|
93
|
-
|
93
|
+
|
94
94
|
collection.add_pwm(pwm, info)
|
95
95
|
end
|
96
96
|
File.open(output_file,'w') do |f|
|
@@ -11,13 +11,13 @@ Options:
|
|
11
11
|
[-c <similarity cutoff (minimal similarity to be included in output)> ] or [--all], '-c 0.05' by default
|
12
12
|
[--precise [<level, minimal similarity to check on a more precise discretization level on the second pass>]], off by default, '--precise 0.01' if level is not set
|
13
13
|
[--silent] - don't show current progress information during scan (by default this information's written into stderr)
|
14
|
-
|
14
|
+
|
15
15
|
Output format:
|
16
16
|
<name> <similarity jaccard index> <shift> <overlap> <orientation> * [in case that result calculated on the second pass(in precise mode)]
|
17
|
-
Attention! Name can contain whitespace characters.
|
17
|
+
Attention! Name can contain whitespace characters.
|
18
18
|
Attention! The shift and orientation are reported for the collection matrix relative to the query matrix.
|
19
|
-
|
20
|
-
Example:
|
19
|
+
|
20
|
+
Example:
|
21
21
|
ruby scan_collection.rb motifs/KLF4.pat collection.yaml -p 0.005
|
22
22
|
or in linux
|
23
23
|
cat motifs/KLF4.pat | ruby scan_collection.rb .stdin collection.yaml -p 0.005 --precise 0.03
|
@@ -38,7 +38,7 @@ begin
|
|
38
38
|
raise "No input. You'd specify input source for pat: filename or .stdin" unless filename
|
39
39
|
raise "No input. You'd specify input file with collection" unless collection_file
|
40
40
|
raise "Collection file #{collection_file} doesn't exist" unless File.exist?(collection_file)
|
41
|
-
|
41
|
+
|
42
42
|
pvalue = 0.0005
|
43
43
|
cutoff = 0.05 # minimal similarity to output
|
44
44
|
collection = YAML.load_file(collection_file)
|
@@ -65,7 +65,7 @@ begin
|
|
65
65
|
silent = true
|
66
66
|
when '--precise'
|
67
67
|
precision_mode = :precise
|
68
|
-
begin
|
68
|
+
begin
|
69
69
|
Float(ARGV.first)
|
70
70
|
minimal_similarity = ARGV.shift.to_f
|
71
71
|
rescue
|
@@ -75,26 +75,26 @@ begin
|
|
75
75
|
end
|
76
76
|
Macroape::MaxHashSizeSingle = 1000000 unless defined? Macroape::MaxHashSizeSingle
|
77
77
|
Macroape::MaxHashSizeDouble = 1000 unless defined? Macroape::MaxHashSizeDouble
|
78
|
-
|
78
|
+
|
79
79
|
raise "Thresholds for pvalue #{pvalue} aren't presented in collection (#{collection.pvalues.join(', ')}). Use one of listed pvalues or recalculate the collection with needed pvalue" unless collection.pvalues.include? pvalue
|
80
|
-
|
80
|
+
|
81
81
|
if filename == '.stdin'
|
82
82
|
# query_pwm = Macroape::SingleMatrix.load_from_stdin(STDIN)
|
83
83
|
else
|
84
84
|
raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
|
85
85
|
query_pwm = Bioinform::PWM.new(File.read(filename))
|
86
86
|
end
|
87
|
-
|
88
|
-
|
87
|
+
|
88
|
+
|
89
89
|
query_pwm_rough = query_pwm.background(background_query).discrete(collection.rough_discretization)
|
90
90
|
query_pwm_precise = query_pwm.background(background_query).discrete(collection.precise_discretization)
|
91
|
-
|
91
|
+
|
92
92
|
threshold = query_pwm_rough.threshold(pvalue)
|
93
93
|
threshold_precise = query_pwm_precise.threshold(pvalue)
|
94
|
-
|
94
|
+
|
95
95
|
similarities = {}
|
96
96
|
precision_file_mode = {}
|
97
|
-
|
97
|
+
|
98
98
|
collection.pwms.each_key do |name|
|
99
99
|
pwm = collection.pwms[name]
|
100
100
|
pwm_info = collection.infos[name]
|
@@ -102,7 +102,7 @@ begin
|
|
102
102
|
cmp = Macroape::PWMCompare.new(query_pwm_rough, pwm.background(collection.background).discrete(collection.rough_discretization))
|
103
103
|
info = cmp.jaccard(threshold, pwm_info[:rough][pvalue] * collection.rough_discretization)
|
104
104
|
precision_file_mode[name] = :rough
|
105
|
-
|
105
|
+
|
106
106
|
if precision_mode == :precise and info[:similarity] >= minimal_similarity
|
107
107
|
cmp = Macroape::PWMCompare.new(query_pwm_precise, pwm.background(collection.background).discrete(collection.precise_discretization))
|
108
108
|
info = cmp.jaccard(threshold_precise, pwm_info[:precise][pvalue] * collection.precise_discretization)
|
@@ -110,7 +110,7 @@ begin
|
|
110
110
|
end
|
111
111
|
similarities[name] = info
|
112
112
|
end
|
113
|
-
|
113
|
+
|
114
114
|
puts "#pwm\tsimilarity\tshift\toverlap\torientation"
|
115
115
|
similarities.sort_by do |name, info|
|
116
116
|
info[:similarity]
|
@@ -118,7 +118,7 @@ begin
|
|
118
118
|
precision_text = (precision_file_mode[name] == :precise) ? "\t*" : ""
|
119
119
|
puts "#{name}\t#{info[:similarity]}\t#{info[:shift]}\t#{info[:overlap]}\t#{info[:orientation]}#{precision_text}" if info[:similarity] >= cutoff
|
120
120
|
end
|
121
|
-
|
121
|
+
|
122
122
|
rescue => err
|
123
123
|
STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
|
124
124
|
end
|
data/lib/macroape/pwm_compare.rb
CHANGED
@@ -13,12 +13,11 @@ module Macroape
|
|
13
13
|
end
|
14
14
|
|
15
15
|
def each_alignment
|
16
|
-
second_rc = second.reverse_complement
|
17
16
|
(-second.length..first.length).to_a.product([:direct,:revcomp]) do |shift, orientation|
|
18
|
-
yield PWMCompareAligned.new(first,
|
17
|
+
yield PWMCompareAligned.new(first, second, shift, orientation)
|
19
18
|
end
|
20
19
|
end
|
21
|
-
|
20
|
+
|
22
21
|
include Enumerable
|
23
22
|
alias_method :each, :each_alignment
|
24
23
|
alias_method :map_each_alignment, :map
|
@@ -2,31 +2,37 @@ require 'macroape/aligned_pair_intersection'
|
|
2
2
|
|
3
3
|
module Macroape
|
4
4
|
class PWMCompareAligned
|
5
|
-
attr_reader :first, :second, :length, :shift, :orientation, :
|
6
|
-
def initialize(
|
7
|
-
@unaligned_first, @unaligned_second = first, second
|
5
|
+
attr_reader :first, :second, :length, :shift, :orientation, :first_length, :second_length
|
6
|
+
def initialize(first_unaligned, second_unaligned, shift, orientation)
|
8
7
|
@shift, @orientation = shift, orientation
|
8
|
+
|
9
|
+
@first_length, @second_length = first_unaligned.length, second_unaligned.length
|
10
|
+
@length = self.class.calculate_alignment_length(@first_length, @second_length, @shift)
|
11
|
+
|
12
|
+
first, second = first_unaligned, second_unaligned
|
13
|
+
second = second.reverse_complement if revcomp?
|
14
|
+
|
9
15
|
if shift > 0
|
10
|
-
|
16
|
+
second = second.left_augment(shift)
|
11
17
|
else
|
12
|
-
first
|
18
|
+
first = first.left_augment(-shift)
|
13
19
|
end
|
14
|
-
|
20
|
+
|
15
21
|
@first = first.right_augment(@length - first.length)
|
16
22
|
@second = second.right_augment(@length - second.length)
|
17
23
|
end
|
18
|
-
|
24
|
+
|
19
25
|
def direct?
|
20
26
|
orientation == :direct
|
21
27
|
end
|
22
28
|
def revcomp?
|
23
29
|
orientation == :revcomp
|
24
30
|
end
|
25
|
-
|
31
|
+
|
26
32
|
def overlap
|
27
33
|
length.times.count{|pos| first_overlaps?(pos) && second_overlaps?(pos) }
|
28
34
|
end
|
29
|
-
|
35
|
+
|
30
36
|
def first_pwm_alignment
|
31
37
|
length.times.map do |pos|
|
32
38
|
if first_overlaps?(pos)
|
@@ -36,7 +42,7 @@ module Macroape
|
|
36
42
|
end
|
37
43
|
end.join
|
38
44
|
end
|
39
|
-
|
45
|
+
|
40
46
|
def second_pwm_alignment
|
41
47
|
length.times.map do |pos|
|
42
48
|
if second_overlaps?(pos)
|
@@ -46,7 +52,7 @@ module Macroape
|
|
46
52
|
end
|
47
53
|
end.join
|
48
54
|
end
|
49
|
-
|
55
|
+
|
50
56
|
def alignment_infos
|
51
57
|
{shift: shift,
|
52
58
|
orientation: orientation,
|
@@ -54,15 +60,8 @@ module Macroape
|
|
54
60
|
overlap: overlap,
|
55
61
|
alignment_length: length}
|
56
62
|
end
|
57
|
-
|
58
|
-
|
59
|
-
unaligned_first.length
|
60
|
-
end
|
61
|
-
def second_length
|
62
|
-
unaligned_second.length
|
63
|
-
end
|
64
|
-
|
65
|
-
# whether first matrix overlap specified position
|
63
|
+
|
64
|
+
# whether first matrix overlap specified position of alignment
|
66
65
|
def first_overlaps?(pos)
|
67
66
|
return false unless pos >= 0 && pos < length
|
68
67
|
if shift > 0
|
@@ -71,7 +70,7 @@ module Macroape
|
|
71
70
|
pos >= -shift && pos < -shift + first_length
|
72
71
|
end
|
73
72
|
end
|
74
|
-
|
73
|
+
|
75
74
|
def second_overlaps?(pos)
|
76
75
|
return false unless pos >= 0 && pos < length
|
77
76
|
if shift > 0
|
@@ -80,8 +79,8 @@ module Macroape
|
|
80
79
|
pos < second_length
|
81
80
|
end
|
82
81
|
end
|
83
|
-
|
84
|
-
=begin
|
82
|
+
|
83
|
+
=begin
|
85
84
|
def discrete(rate)
|
86
85
|
PWMCompareAligned.new(first.discrete(rate), second.discrete(rate))
|
87
86
|
end
|
@@ -104,13 +103,13 @@ module Macroape
|
|
104
103
|
def jaccard(first_threshold, second_threshold)
|
105
104
|
f = first.counts_by_thresholds(first_threshold).first
|
106
105
|
s = second.counts_by_thresholds(second_threshold).first
|
107
|
-
if f == 0
|
106
|
+
if f == 0 || s == 0
|
108
107
|
return {similarity: -1, tanimoto: -1, recognized_by_both: 0,
|
109
108
|
recognized_by_first: f,
|
110
109
|
recognized_by_second: s,
|
111
110
|
}
|
112
111
|
end
|
113
|
-
|
112
|
+
|
114
113
|
intersect = counts_for_two_matrices(first_threshold, second_threshold)
|
115
114
|
intersect = Math.sqrt(intersect[0] * intersect[1])
|
116
115
|
union = f + s - intersect
|
@@ -118,6 +117,15 @@ module Macroape
|
|
118
117
|
{ similarity: similarity, tanimoto: 1.0 - similarity, recognized_by_both: intersect,
|
119
118
|
recognized_by_first: f, recognized_by_second: s }
|
120
119
|
end
|
121
|
-
|
120
|
+
|
121
|
+
def self.calculate_alignment_length(first_len, second_len, shift)
|
122
|
+
if shift > 0
|
123
|
+
[first_len, second_len + shift].max
|
124
|
+
else
|
125
|
+
[first_len - shift, second_len].max
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
122
129
|
end
|
130
|
+
|
123
131
|
end
|