macroape 3.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +18 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +61 -0
- data/Rakefile +7 -0
- data/bin/eval_alignment +3 -0
- data/bin/eval_similarity +3 -0
- data/bin/find_pvalue +3 -0
- data/bin/find_threshold +3 -0
- data/bin/preprocess_collection +3 -0
- data/bin/scan_collection +3 -0
- data/lib/macroape/aligned_pair_intersection.rb +136 -0
- data/lib/macroape/aligned_pair_metrics.rb +24 -0
- data/lib/macroape/aligned_pair_transformations.rb +23 -0
- data/lib/macroape/collection.rb +15 -0
- data/lib/macroape/count_by_threshold.rb +34 -0
- data/lib/macroape/exec/eval_alignment.rb +141 -0
- data/lib/macroape/exec/eval_similarity.rb +107 -0
- data/lib/macroape/exec/find_pvalue.rb +80 -0
- data/lib/macroape/exec/find_threshold.rb +76 -0
- data/lib/macroape/exec/preprocess_collection.rb +94 -0
- data/lib/macroape/exec/scan_collection.rb +124 -0
- data/lib/macroape/extract_pwm.rb +32 -0
- data/lib/macroape/gauss_estimation.rb +30 -0
- data/lib/macroape/matrix_information.rb +29 -0
- data/lib/macroape/matrix_on_background.rb +16 -0
- data/lib/macroape/matrix_transformations.rb +29 -0
- data/lib/macroape/pair_metrics.rb +9 -0
- data/lib/macroape/pair_transformations.rb +28 -0
- data/lib/macroape/pwm_compare.rb +10 -0
- data/lib/macroape/pwm_compare_aligned.rb +13 -0
- data/lib/macroape/single_matrix.rb +45 -0
- data/lib/macroape/support.rb +34 -0
- data/lib/macroape/threshold_by_pvalue.rb +68 -0
- data/lib/macroape/version.rb +3 -0
- data/lib/macroape.rb +26 -0
- data/macroape.gemspec +17 -0
- data/test/data/AHR_si.pat +10 -0
- data/test/data/KLF4_f2.pat +11 -0
- data/test/data/KLF4_f2_scan_results_all.txt +4 -0
- data/test/data/KLF4_f2_scan_results_default_cutoff.txt +3 -0
- data/test/data/KLF4_f2_scan_results_precise_mode.txt +4 -0
- data/test/data/SP1_f1.pat +12 -0
- data/test/data/SP1_f1_revcomp.pat +12 -0
- data/test/data/test_collection/GABPA_f1.pat +14 -0
- data/test/data/test_collection/KLF4_f2.pat +11 -0
- data/test/data/test_collection/SP1_f1.pat +12 -0
- data/test/data/test_collection.yaml +186 -0
- data/test/macroape_test.rb +125 -0
- metadata +116 -0
@@ -0,0 +1,80 @@
|
|
1
|
+
# Command-line tool: for every threshold given on the command line prints
# "threshold<TAB>count<TAB>pvalue", where pvalue = count / pwm.number_of_words.
# Help goes to stderr; on any error a short message with a backtrace excerpt is
# written to stderr and the process exits with status 1.
help_string = %q{
Command-line format:
ruby find_pvalue.rb <pat-file> <threshold list> [options]
or in linux
cat <pat-file> | ruby find_pvalue.rb .stdin <threshold> [options]
or on windows
type <pat-file> | ruby find_pvalue.rb .stdin <threshold> [options]

Options:
[-d <discretization level>]
[-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]

Output format:
threshold_1 count_1 pvalue_1
threshold_2 count_2 pvalue_2
threshold_3 count_3 pvalue_3
The results are printed out in the same order as in the given threshold list.

Examples:
ruby find_pvalue.rb motifs/KLF4.pat 7.32 -d 1000 -b 0.2 0.3 0.2 0.3
or on windows
type motifs/KLF4.pat | ruby find_pvalue.rb .stdin 7.32 4.31 5.42
or in linux
cat motifs/KLF4.pat | ruby find_pvalue.rb .stdin 7.32 4.31 5.42
}

require 'macroape'

if ARGV.empty? || %w[-h -help --help --h].any? { |flag| ARGV.include?(flag) }
  STDERR.puts help_string
  exit
end

discretization = 10000
background = [1,1,1,1]
thresholds = []

begin
  filename = ARGV.shift

  # True when the argument looks like a number (nil means ARGV is exhausted).
  numeric = lambda do |arg|
    begin
      Float(arg)
      true
    rescue ArgumentError, TypeError
      false
    end
  end

  # Every leading numeric argument is a threshold; the first non-numeric one starts the options.
  thresholds << ARGV.shift.to_f while numeric.call(ARGV.first)

  raise "No input. You'd specify input source: filename or .stdin" unless filename
  raise 'You should specify at least one threshold' if thresholds.empty?

  until ARGV.empty?
    case ARGV.shift
    when '-b'
      background = ARGV.shift(4).map(&:to_f)
      # A truncated list would otherwise be used as a malformed background.
      raise 'background (-b) should be specified by 4 numbers' unless background.size == 4
    when '-d'
      discretization = ARGV.shift.to_f
    when '-m'
      PWM::MaxHashSize = ARGV.shift.to_f
    end
  end
  PWM::MaxHashSize = 1000000 unless defined? PWM::MaxHashSize

  if filename == '.stdin'
    pwm = PWM::SingleMatrix.load_from_stdin(STDIN)
  else
    raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
    pwm = PWM::SingleMatrix.load_pat(filename)
  end
  pwm = pwm.with_background(background)

  # Thresholds are rescaled by the discretization rate before counting.
  scaled_thresholds = thresholds.map { |threshold| threshold * discretization }
  counts = pwm.discrete(discretization).counts_by_thresholds(*scaled_thresholds)
  pvalues = counts.map { |count| count.to_f / pwm.number_of_words }

  pvalues.zip(thresholds, counts).each do |pvalue, threshold, count|
    puts "#{threshold}\t#{count}\t#{pvalue}"
  end
rescue => err
  STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
  # Signal failure to the calling shell or script.
  exit 1
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
# Command-line tool: for every requested P-value prints
# "requested_pvalue<TAB>threshold<TAB>achieved_pvalue" using the values yielded by
# pwm.discrete(discretization).thresholds; the threshold is divided back by the
# discretization rate before printing.
# Help goes to stderr; on any error a short message with a backtrace excerpt is
# written to stderr and the process exits with status 1.
help_string = %q{
Command-line format::
ruby find_threshold.rb <pat-file> [options]
or in linux
cat <pat-file> | ruby find_threshold.rb .stdin [options]
or on windows
type <pat-file> | ruby find_threshold.rb .stdin [options]

Options:
[-p <list of P-values>]
[-d <discretization level>]
[-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]

Output format:
requested_pvalue_1 threshold_1 achieved_pvalue_1
requested_pvalue_2 threshold_2 achieved_pvalue_2


Example:
ruby find_threshold.rb motifs/KLF4.pat -p 0.001 0.0001 0.0005 -d 1000 -b 0.4 0.3 0.2 0.1
}

require 'macroape'

if ARGV.empty? || %w[-h -help --help --h].any? { |flag| ARGV.include?(flag) }
  STDERR.puts help_string
  exit
end

background = [1,1,1,1]
default_pvalues = [0.0005]
discretization = 10000

begin
  filename = ARGV.shift
  raise "No input. You'd specify input source: filename or .stdin" unless filename

  # True when the argument looks like a number (nil means ARGV is exhausted).
  numeric = lambda do |arg|
    begin
      Float(arg)
      true
    rescue ArgumentError, TypeError
      false
    end
  end

  pvalues = []
  until ARGV.empty?
    case ARGV.shift
    when '-b'
      background = ARGV.shift(4).map(&:to_f)
      # A truncated list would otherwise be used as a malformed background.
      raise 'background (-b) should be specified by 4 numbers' unless background.size == 4
    when '-m'
      PWM::MaxHashSize = ARGV.shift.to_f
    when '-p'
      # Consume every numeric argument following -p.
      pvalues << ARGV.shift.to_f while numeric.call(ARGV.first)
    when '-d'
      discretization = ARGV.shift.to_f
    end
  end
  pvalues = default_pvalues if pvalues.empty?

  PWM::MaxHashSize = 1000000 unless defined? PWM::MaxHashSize

  if filename == '.stdin'
    pwm = PWM::SingleMatrix.load_from_stdin(STDIN)
  else
    raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
    pwm = PWM::SingleMatrix.load_pat(filename)
  end

  pwm = pwm.with_background(background)

  pwm.discrete(discretization).thresholds(*pvalues) do |pvalue, threshold, real_pvalue|
    puts "#{pvalue}\t#{threshold / discretization}\t#{real_pvalue}"
  end
rescue => err
  STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
  # Signal failure to the calling shell or script.
  exit 1
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
# Command-line tool: loads every file of a folder as a PWM, obtains thresholds
# for the requested P-values on the rough and on the precise discretization
# (by running find_threshold.rb from this script's directory as a child process)
# and stores the resulting PWM::Collection as YAML.
# Help goes to stderr; on any error a short message with a backtrace excerpt is
# written to stderr and the process exits with status 1.
help_string = %q{
Command-line format:
ruby preprocess_collection.rb <folder with PWMs> [options]

Options:
[-p <list of P-values>]
[-d <rough discretization> <precise discretization>]
[-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]
[-o <output file>]
[--silent] - don't show current progress information during scan (by default this information's written into stderr)

The tool stores preprocessed PWM collection to the specified YAML-file.

Example:
ruby preprocess_collection.rb ./motifs -p 0.001 0.0005 0.0001 -d 1 10 -b 0.2 0.3 0.2 0.3 -o collection.yaml
}

require 'macroape'

if ARGV.empty? || %w[-h -help --help --h].any? { |flag| ARGV.include?(flag) }
  STDERR.puts help_string
  exit
end

default_pvalues = [0.0005]
background = [1,1,1,1]
rough_discretization = 1
precise_discretization = 10
output_file = 'collection.yaml'

begin
  folder = ARGV.shift
  raise "No input. You'd specify folder with pat-files" unless folder
  raise "Error! Folder #{folder} doesn't exist" unless Dir.exist?(folder)

  # True when the argument looks like a number (nil means ARGV is exhausted).
  numeric = lambda do |arg|
    begin
      Float(arg)
      true
    rescue ArgumentError, TypeError
      false
    end
  end

  pvalues = []
  silent = false
  until ARGV.empty?
    case ARGV.shift
    when '-b'
      background = ARGV.shift(4).map(&:to_f)
      # A truncated list would otherwise be used as a malformed background.
      raise 'background (-b) should be specified by 4 numbers' unless background.size == 4
      raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless background == background.reverse
    when '-p'
      # Consume every numeric argument following -p.
      pvalues << ARGV.shift.to_f while numeric.call(ARGV.first)
    when '-d'
      # The smaller of the two values is always used as the rough discretization.
      rough_discretization, precise_discretization = ARGV.shift(2).map(&:to_f).sort
    when '-o'
      output_file = ARGV.shift
    when '-m'
      PWM::MaxHashSize = ARGV.shift.to_f
    when '-md'
      PWMCompare::MaxHashSize = ARGV.shift.to_f
    when '--silent'
      silent = true
    end
  end
  pvalues = default_pvalues if pvalues.empty?

  PWM::MaxHashSize = 1000000 unless defined? PWM::MaxHashSize
  PWMCompare::MaxHashSize = 1000 unless defined? PWMCompare::MaxHashSize

  collection = PWM::Collection.new(rough_discretization, precise_discretization, background, pvalues)

  find_threshold_script = File.join(File.dirname(__FILE__), 'find_threshold.rb')
  discretizations = {rough: rough_discretization, precise: precise_discretization}

  Dir.glob(File.join(folder,'*')) do |filename|
    # Sub-directories cannot be read as pat-files; skip them instead of aborting the whole run.
    next unless File.file?(filename)

    STDERR.puts filename unless silent
    pwm = PWM::SingleMatrix.load_pat(filename)
    info = {rough: {}, precise: {}}

    discretizations.each do |mode, discretization|
      # The argument-vector form of IO.popen starts the child without a shell, so file
      # names containing spaces or shell metacharacters are passed through intact.
      command = ['ruby', find_threshold_script, filename,
                 '-p', *pvalues.map(&:to_s),
                 '-b', *background.map(&:to_s),
                 '-d', discretization.to_s]
      IO.popen(command, &:read).split("\n").each do |line|
        # Each output line holds: requested pvalue, threshold, achieved pvalue.
        pvalue, threshold, _real_pvalue = line.split.map(&:to_f)
        info[mode][pvalue] = threshold
      end
    end

    collection.add_pwm(pwm, info)
  end

  File.open(output_file,'w') do |f|
    f.puts(collection.to_yaml)
  end
rescue => err
  STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
  # Signal failure to the calling shell or script.
  exit 1
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
# Command-line tool: compares a query PWM against every PWM of a preprocessed
# collection (YAML file) and prints, sorted by decreasing similarity,
# "name<TAB>similarity<TAB>shift<TAB>overlap<TAB>orientation" for each PWM whose
# similarity is not below the cutoff. Results recalculated on the precise
# discretization are marked with a trailing "<TAB>*".
# Help goes to stderr; on any error a short message with a backtrace excerpt is
# written to stderr and the process exits with status 1.
help_string = %q{
Command-line format:
ruby scan_collection.rb <pat-file> <collection> [options]
or in linux
cat <pat-file> | ruby scan_collection.rb .stdin <collection> [options]
or on windows
type <pat-file> | ruby scan_collection.rb .stdin <collection> [options]

Options:
[-p <P-value>]
[-c <similarity cutoff (minimal similarity to be included in output)> ] or [--all], '-c 0.05' by default
[--precise [<level, minimal similarity to check on a more precise discretization level on the second pass>]], off by default, '--precise 0.01' if level is not set
[--silent] - don't show current progress information during scan (by default this information's written into stderr)

Output format:
<name> <similarity jaccard index> <shift> <overlap> <orientation> * [in case that result calculated on the second pass(in precise mode)]
Attention! Name can contain whitespace characters.
Attention! The shift and orientation are reported for the collection matrix relative to the query matrix.

Example:
ruby scan_collection.rb motifs/KLF4.pat collection.yaml -p 0.005
or in linux
cat motifs/KLF4.pat | ruby scan_collection.rb .stdin collection.yaml -p 0.005 --precise 0.03
}

require 'macroape'

if ARGV.empty? || %w[-h -help --help --h].any? { |flag| ARGV.include?(flag) }
  STDERR.puts help_string
  exit
end

begin
  filename = ARGV.shift
  collection_file = ARGV.shift
  raise "No input. You'd specify input source for pat: filename or .stdin" unless filename
  raise "No input. You'd specify input file with collection" unless collection_file
  raise "Collection file #{collection_file} doesn't exist" unless File.exist?(collection_file)

  pvalue = 0.0005
  cutoff = 0.05 # minimal similarity to output
  # NOTE(review): YAML.load_file deserializes the objects stored in the file; only
  # collection files from trusted sources should be passed here.
  collection = YAML.load_file(collection_file)
  background_query = collection.background

  silent = false
  precision_mode = :rough
  until ARGV.empty?
    case ARGV.shift
    when '-bq'
      background_query = ARGV.shift(4).map(&:to_f)
      raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless background_query == background_query.reverse
    when '-p'
      pvalue = ARGV.shift.to_f
    when '-m'
      PWM::MaxHashSize = ARGV.shift.to_f
    when '-md'
      PWMCompare::MaxHashSize = ARGV.shift.to_f
    when '-c'
      cutoff = ARGV.shift.to_f
    when '--all'
      cutoff = 0.0
    when '--silent'
      silent = true
    when '--precise'
      precision_mode = :precise
      begin
        # An optional numeric level may follow --precise.
        Float(ARGV.first)
        minimal_similarity = ARGV.shift.to_f
      rescue ArgumentError, TypeError
        # NOTE(review): the help text announces 0.01 as the default level while 0.05
        # is used here - confirm which one is intended.
        minimal_similarity = 0.05
      end
    end
  end
  PWM::MaxHashSize = 1000000 unless defined? PWM::MaxHashSize
  PWMCompare::MaxHashSize = 1000 unless defined? PWMCompare::MaxHashSize

  raise "Thresholds for pvalue #{pvalue} aren't presented in collection (#{collection.pvalues.join(', ')}). Use one of listed pvalues or recalculate the collection with needed pvalue" unless collection.pvalues.include? pvalue

  if filename == '.stdin'
    query_pwm = PWM::SingleMatrix.load_from_stdin(STDIN)
  else
    raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
    query_pwm = PWM::SingleMatrix.load_pat(filename)
  end

  query_on_background = query_pwm.with_background(background_query)
  query_pwm_rough = query_on_background.discrete(collection.rough_discretization)
  query_pwm_precise = query_on_background.discrete(collection.precise_discretization)

  threshold = query_pwm_rough.threshold(pvalue)
  threshold_precise = query_pwm_precise.threshold(pvalue)

  similarities = {}
  precision_file_mode = {}
  unnamed_index = 0

  collection.pwms.each do |key, pwm|
    pwm_info = collection.infos[key]
    STDERR.puts pwm.name unless silent

    # First pass: comparison on the rough discretization.
    collection_pwm = pwm.with_background(collection.background)
    cmp = PWMCompare::PWMCompare.new(query_pwm_rough, collection_pwm.discrete(collection.rough_discretization))
    info = cmp.jaccard(threshold, pwm_info[:rough][pvalue] * collection.rough_discretization)
    name = pwm.name || "Unnamed #{unnamed_index += 1}"
    precision_file_mode[name] = :rough

    # Second pass: similar enough pairs are recalculated on the precise discretization.
    if precision_mode == :precise && info[:similarity] >= minimal_similarity
      cmp = PWMCompare::PWMCompare.new(query_pwm_precise, collection_pwm.discrete(collection.precise_discretization))
      info = cmp.jaccard(threshold_precise, pwm_info[:precise][pvalue] * collection.precise_discretization)
      precision_file_mode[name] = :precise
    end
    similarities[name] = info
  end

  puts "#pwm\tsimilarity\tshift\toverlap\torientation"
  ranked = similarities.sort_by { |_name, info| info[:similarity] }.reverse
  ranked.each do |name, info|
    next unless info[:similarity] >= cutoff
    precision_text = (precision_file_mode[name] == :precise) ? "\t*" : ""
    puts "#{name}\t#{info[:similarity]}\t#{info[:shift]}\t#{info[:overlap]}\t#{info[:orientation]}#{precision_text}"
  end

rescue => err
  STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
  # Signal failure to the calling shell or script.
  exit 1
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# Extracts the first PWM from a stream that may hold several PWMs one after another.
#
# r_stream, w_stream - supposed to be the two ends of a pipe. All data is read
# from r_stream, the first PWM is extracted and the remaining lines are pushed
# into a freshly created pipe:
#
#   ... --> w_stream --> r_stream --> data --> extracted pwm
#                                       |
#                                       v
#           new_r_stream <-- new_w_stream (remaining lines)
#
# Returns [new_r_stream, new_w_stream, extracted_pwm] where extracted_pwm is an
# array of lines: a header line followed by every consecutive line that consists
# of at least 4 numbers. new_w_stream is already closed, new_r_stream is ready
# for the next call.
# Returns [r_stream, w_stream, nil] (streams untouched) when the input is empty
# or holds nothing but empty lines.
def extract_pwm(r_stream, w_stream)
  lines = r_stream.readlines
  return [r_stream, w_stream, nil] if lines.empty?

  # Skip leading empty lines. The input may run out while skipping, so the next
  # line is checked for nil BEFORE it is stripped.
  header = lines.shift
  while header.chomp == ''
    candidate = lines.shift
    return [r_stream, w_stream, nil] unless candidate
    header = candidate.strip
  end
  extracted_pwm = [header]

  r_stream.close

  # A matrix row is a line of at least 4 tokens, each of them a number.
  matrix_row = lambda do |line|
    tokens = line.split
    begin
      tokens.each { |token| Float(token) }
      tokens.size >= 4
    rescue ArgumentError, TypeError
      false
    end
  end

  # The first line that is not a matrix row (empty line, name of the next PWM)
  # finishes the current PWM and stays in the remaining data.
  extracted_pwm << lines.shift while !lines.empty? && matrix_row.call(lines.first)

  new_r_stream, new_w_stream = IO.pipe
  lines.each { |one_line| new_w_stream.write(one_line) }
  new_w_stream.close

  [new_r_stream, new_w_stream, extracted_pwm]
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module PWM
  # Estimation of score thresholds from the mean and variance of PWM scores.
  # Expects the including object to provide +matrix+ (columns of 4 scores),
  # +probabilities+ (4 background weights) and +sum_of_probabilities+.
  module GaussEstimation
    # Sum over columns of the column score averaged with the normalized background.
    def score_mean
      frequencies = probabilities.map { |weight| weight.to_f / sum_of_probabilities }
      total = 0.0
      matrix.each do |column|
        column_mean = 0.0
        4.times { |letter| column_mean += column[letter] * frequencies[letter] }
        total += column_mean
      end
      total
    end

    # Sum over columns of (mean of squared scores - squared mean score),
    # both taken with the normalized background.
    def score_variance
      frequencies = probabilities.map { |weight| weight.to_f / sum_of_probabilities }
      total = 0.0
      matrix.each do |column|
        second_moment = 0.0
        4.times { |letter| second_moment += column[letter]**2 * frequencies[letter] }
        first_moment = 0.0
        4.times { |letter| first_moment += column[letter] * frequencies[letter] }
        total = total + second_moment - first_moment**2
      end
      total
    end

    # Threshold estimated as score_mean + n * sigma, where sigma is the square
    # root of score_variance and n = inverf2(1 - 2 * pvalue) * sqrt(2).
    def threshold_gauss_estimation(pvalue)
      sigma = Math.sqrt(score_variance)
      deviations = inverf2(1 - 2 * pvalue) * Math.sqrt(2)
      score_mean + deviations * sigma
    end

    # Closed-form approximation of the inverse error function; odd in x.
    def inverf2(x)
      sign = x < 0 ? -1 : 1
      magnitude = x.abs
      a = 8 / (3*Math::PI) * (Math::PI-3) / (4-Math::PI)
      log_term = Math.log(1 - magnitude * magnitude)
      part0 = ( 2/(Math::PI*a) + log_term / 2 )**2
      part = -2 / (Math::PI * a) - log_term/2 + Math.sqrt(-1/a * log_term + part0)
      sign * Math.sqrt(part)
    end
  end

end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module PWM
  # Memoized summary values of a matrix. Expects the including object to
  # provide +matrix+ - an array of columns.
  module MatrixInformation
    # Number of columns.
    def length
      @length = matrix.length unless @length
      @length
    end

    # Sum of the maximal elements of all columns.
    def best_score
      @best_score = matrix.map(&:max).inject(0, :+) unless @best_score
      @best_score
    end

    # Sum of the minimal elements of all columns.
    def worst_score
      @worst_score = matrix.map(&:min).inject(0, :+) unless @worst_score
      @worst_score
    end

    # Array of length + 1 elements: element i is the best score of the suffix
    # starting at column i, the last element is 0.
    def best_suffix
      @best_suffix ||= begin
        suffixes = Array.new(length + 1, 0)
        (length - 1).downto(0) do |pos|
          suffixes[pos] = matrix[pos].max + suffixes[pos + 1]
        end
        suffixes
      end
    end

    # Array of length + 1 elements: element i is the worst score of the suffix
    # starting at column i, the last element is 0.
    def worst_suffix
      @worst_suffix ||= begin
        suffixes = Array.new(length + 1, 0)
        (length - 1).downto(0) do |pos|
          suffixes[pos] = matrix[pos].min + suffixes[pos + 1]
        end
        suffixes
      end
    end

    # Drops every memoized value; returns self to allow chaining.
    def refresh_infos
      @length = nil
      @best_score = nil
      @worst_score = nil
      @best_suffix = nil
      @worst_suffix = nil
      self
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module PWM
  # A matrix together with background weights of the 4 letters.
  class MatrixOnBackground < SingleMatrix
    # Background weights given at construction.
    attr_reader :probabilities

    # matrix     - passed to SingleMatrix#initialize
    # background - array of background weights, stored as +probabilities+
    def initialize(matrix,background)
      super(matrix)
      @probabilities = background
    end

    # Sum of the background weights.
    # Deliberately not memoized: instances are also produced by copying instance
    # variables from another object and replacing @probabilities afterwards
    # (SingleMatrix#with_background), so a cached sum could belong to the
    # previous background. Summing 4 numbers is cheap.
    def sum_of_probabilities
      probabilities.inject(0.0, &:+)
    end

    # sum_of_probabilities raised to the power of the matrix length.
    def number_of_words
      sum_of_probabilities ** length
    end

    include GaussEstimation, ThresholdByPvalue, CountByThreshold
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module PWM
  # Transformations producing a modified copy of a matrix; the receiver is never
  # changed. Expects the including object to provide +matrix+ (array of columns),
  # +length+ and +refresh_infos+.
  module MatrixTransformations
    # Columns in reverse order, each column reversed as well.
    def reverse_complement
      clone_and_transform( matrix.reverse.map(&:reverse) ).refresh_infos
    end

    # Prepends n columns of zeros.
    def left_augment(n)
      clone_and_transform( zero_columns(n) + matrix ).refresh_infos
    end

    # Appends n columns of zeros.
    def right_augment(n)
      clone_and_transform( matrix + zero_columns(n) ).refresh_infos
    end

    # Makes worst score == 0 by subtracting the column minimum from every element of the column.
    def shift_to_zero
      clone_and_transform( matrix.map { |col| col.map { |letter| letter - col.min } } ).refresh_infos
    end

    # Multiplies every element by rate and rounds it up.
    def discrete(rate)
      clone_and_transform( matrix.map { |col| col.map { |letter| (letter * rate).ceil } } ).refresh_infos
    end

    # Splits into two matrices: the first length_of_first_part columns and the rest.
    def split(length_of_first_part)
      head = clone_and_transform( matrix.first(length_of_first_part) ).refresh_infos
      tail = clone_and_transform( matrix.last(length - length_of_first_part) ).refresh_infos
      [head, tail]
    end

    # Rearranges columns: position i of the result holds column permutation_index[i].
    def permute_columns(permutation_index)
      clone_and_transform( permutation_index.map { |col| matrix[col] } ).refresh_infos
    end

    # Shallow copy of the receiver with @matrix replaced by new_matrix.
    def clone_and_transform(new_matrix)
      self.dup.instance_eval { @matrix = new_matrix; self }
    end

    private

    # n independent zero columns. Array.new with a block creates a separate Array
    # per column; `[[0.0]*4] * n` would repeat one and the same column object, so
    # an in-place change of one padding column would show up in all of them.
    def zero_columns(n)
      Array.new(n) { [0.0] * 4 }
    end
  end
end
|
@@ -0,0 +1,9 @@
|
|
1
|
+
module PWMCompare
  # Metrics of a pair of matrices taken over all their alignments. Expects the
  # including object to provide +map_each_align+.
  module PairMetrics
    # Calls jaccard(threshold_first, threshold_second) on every alignment, merges
    # the alignment info into each result and returns the merged hash having the
    # greatest :similarity.
    def jaccard(threshold_first, threshold_second)
      results = map_each_align do |aligned_pair, alignment_info|
        metrics = aligned_pair.jaccard(threshold_first, threshold_second)
        metrics.merge(alignment_info)
      end
      results.max_by { |result| result[:similarity] }
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module PWMCompare
  # Enumeration of all mutual alignments of two matrices. Expects the including
  # object to provide +first+ and +second+.
  module PairTransformations

    # Yields every alignment of +second+ against +first+: shifts from
    # -second.length to first.length, each one in :direct and :revcomp orientation.
    #
    # The block receives two arguments:
    #   * a PWMCompareAligned built from both matrices left-augmented so that they
    #     start at the same position;
    #   * a Hash with keys :text (two-line picture of the alignment, '.' for
    #     padding, '>' / '<' for direct / reverse complement positions), :shift,
    #     :orientation, :overlap and :alignment_length. It is passed as an explicit
    #     Hash so the block receives it as a positional argument.
    #
    # Returns an Enumerator when called without a block.
    def each
      return enum_for(:each) unless block_given?

      second_rc = second.reverse_complement
      (-second.length..first.length).to_a.product([:direct,:revcomp]) do |shift, orientation|
        # A negative shift moves the first matrix to the right, a positive one - the second.
        first_offset = [-shift, 0].max
        second_offset = [shift, 0].max
        second_mark = (orientation == :direct) ? '>' : '<'

        alignment_length = [first_offset + first.length, second_offset + second.length].max
        first_pwm_alignment = ('.' * first_offset + '>' * first.length).ljust(alignment_length, '.')
        second_pwm_alignment = ('.' * second_offset + second_mark * second.length).ljust(alignment_length, '.')
        overlap = [first.length + first_offset, second.length + second_offset].min - shift.abs

        oriented_second = (orientation == :direct) ? second : second_rc
        aligned = PWMCompareAligned.new(first.left_augment(first_offset),
                                        oriented_second.left_augment(second_offset))
        yield(aligned,
              {text: "#{first_pwm_alignment}\n#{second_pwm_alignment}",
               shift: shift,
               orientation: orientation,
               overlap: overlap,
               alignment_length: alignment_length})
      end
    end
    include Enumerable
    alias :each_align :each
    alias :map_each_align :map
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module PWMCompare
  # A pair of matrices brought to a common length.
  class PWMCompareAligned
    attr_reader :first, :second, :length

    # Both matrices are right-augmented up to the length of the longer one.
    def initialize(first, second)
      common_length = [first.length, second.length].max
      @length = common_length
      @first = first.right_augment(common_length - first.length)
      @second = second.right_augment(common_length - second.length)
    end

    include AlignedPairTransformations, AlignedPairMetrics, AlignedPairIntersection

  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module PWM
  # A named matrix stored as an array of columns, 4 numbers per column.
  class SingleMatrix
    attr_reader :matrix
    attr_accessor :name

    def initialize(matrix)
      @matrix = matrix
    end
    include MatrixTransformations, MatrixInformation

    # Builds a matrix from text lines.
    #
    # lines - array of strings. When the first line is not purely numeric it is
    #         treated as a header holding the name (an optional leading "> " is
    #         dropped) and overrides +name+. Lines without any token are skipped.
    #         When the first data line does not hold exactly 4 numbers the data
    #         is transposed.
    # name  - name to use when the input has no header line.
    #
    # Raises RuntimeError when there is no data or the result is not made of
    # 4-element columns.
    def self.build_matrix(lines, name = nil)
      if lines.nil? || lines.empty?
        raise "PWM::SingleMatrix.build_matrix can't create matrix using this input"
      end

      pwm_name = name
      begin
        lines.first.split.each { |x| Float(x) }
        start_line = 0
      rescue ArgumentError
        # The first line is not numeric, hence it is a header with the name.
        start_line = 1
        pwm_name = lines.first.chomp.match(/(?:>\s)?(.*)$/)[1]
      end

      # Empty lines (e.g. a trailing newline at the end of a file) carry no data.
      rows = lines[start_line..-1].map(&:split).reject(&:empty?)
      raise "PWM::SingleMatrix.build_matrix can't create matrix using this input" if rows.empty?

      rows = rows.map { |row| row.map(&:to_f) }
      rows = rows.transpose unless rows.first.length == 4
      pwm = SingleMatrix.new(rows)
      raise "PWM::SingleMatrix.build_matrix can't create matrix using this input" unless pwm.matrix.all?{|l| l.length == 4}
      pwm.name = pwm_name
      pwm
    end

    # Reads all lines of input_stream and builds a matrix from them.
    def self.load_from_stdin(input_stream, name = nil)
      build_matrix(input_stream.readlines, name)
    end

    # Builds a matrix from an array of lines.
    def self.load_from_line_array(lines, name = nil)
      build_matrix(lines, name)
    end

    # Builds a matrix from a file; the file name without directory and extension
    # is used as the name unless the file has its own header line.
    def self.load_pat(filename)
      build_matrix(File.readlines(filename), File.basename_wo_ext(filename))
    end

    # Returns a deep copy of the matrix converted to MatrixOnBackground with the
    # given background.
    def with_background(background)
      type_cast(MatrixOnBackground) do
        @probabilities = background
        # type_cast copies every instance variable of the receiver; a sum cached
        # for a previous background must not survive the change of background.
        remove_instance_variable(:@sum_of_probabilities) if instance_variable_defined?(:@sum_of_probabilities)
      end.depth_dup
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Kernel
  # Creates an object of new_class (without calling its initialize) carrying the
  # same instance variables as the receiver. The values are shared, not copied.
  # An optional block is evaluated in the context of the new object.
  def type_cast(new_class, &block)
    new_obj = new_class.allocate
    instance_variables.each do |varname|
      new_obj.instance_variable_set(varname, instance_variable_get(varname))
    end
    new_obj.instance_eval(&block) if block_given?
    new_obj
  end

  # Deep copy: duplicates the receiver, the elements of Arrays, the values of
  # Hashes and the values of instance variables, recursively.
  # Objects that cannot be duplicated are returned as they are.
  def depth_dup
    begin
      new_obj = self.dup
    rescue StandardError
      return self
    end

    # Elements of containers are not instance variables, so they need their own
    # pass; otherwise nested arrays (e.g. matrix columns) would stay shared
    # between the original and the copy.
    case new_obj
    when Array
      new_obj.map!(&:depth_dup)
    when Hash
      new_obj.each_key { |key| new_obj[key] = new_obj[key].depth_dup }
    end

    new_obj.instance_variables.each do |varname|
      begin
        new_obj.instance_variable_set(varname, new_obj.instance_variable_get(varname).depth_dup)
      rescue StandardError
        # Best effort: a variable that cannot be copied keeps the shared value.
      end
    end
    new_obj
  end

end
|
27
|
+
|
28
|
+
# Returns filename (directory part included) without its extension.
def File.filename_wo_ext(filename)
  extension_size = File.extname(filename).length
  filename[0, filename.length - extension_size]
end
|
31
|
+
|
32
|
+
# Returns the last component of filename without its extension.
def File.basename_wo_ext(filename)
  base = File.basename(filename)
  extension_size = File.extname(filename).length
  base[0, base.length - extension_size]
end
|