macroape 3.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +18 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +61 -0
- data/Rakefile +7 -0
- data/bin/eval_alignment +3 -0
- data/bin/eval_similarity +3 -0
- data/bin/find_pvalue +3 -0
- data/bin/find_threshold +3 -0
- data/bin/preprocess_collection +3 -0
- data/bin/scan_collection +3 -0
- data/lib/macroape/aligned_pair_intersection.rb +136 -0
- data/lib/macroape/aligned_pair_metrics.rb +24 -0
- data/lib/macroape/aligned_pair_transformations.rb +23 -0
- data/lib/macroape/collection.rb +15 -0
- data/lib/macroape/count_by_threshold.rb +34 -0
- data/lib/macroape/exec/eval_alignment.rb +141 -0
- data/lib/macroape/exec/eval_similarity.rb +107 -0
- data/lib/macroape/exec/find_pvalue.rb +80 -0
- data/lib/macroape/exec/find_threshold.rb +76 -0
- data/lib/macroape/exec/preprocess_collection.rb +94 -0
- data/lib/macroape/exec/scan_collection.rb +124 -0
- data/lib/macroape/extract_pwm.rb +32 -0
- data/lib/macroape/gauss_estimation.rb +30 -0
- data/lib/macroape/matrix_information.rb +29 -0
- data/lib/macroape/matrix_on_background.rb +16 -0
- data/lib/macroape/matrix_transformations.rb +29 -0
- data/lib/macroape/pair_metrics.rb +9 -0
- data/lib/macroape/pair_transformations.rb +28 -0
- data/lib/macroape/pwm_compare.rb +10 -0
- data/lib/macroape/pwm_compare_aligned.rb +13 -0
- data/lib/macroape/single_matrix.rb +45 -0
- data/lib/macroape/support.rb +34 -0
- data/lib/macroape/threshold_by_pvalue.rb +68 -0
- data/lib/macroape/version.rb +3 -0
- data/lib/macroape.rb +26 -0
- data/macroape.gemspec +17 -0
- data/test/data/AHR_si.pat +10 -0
- data/test/data/KLF4_f2.pat +11 -0
- data/test/data/KLF4_f2_scan_results_all.txt +4 -0
- data/test/data/KLF4_f2_scan_results_default_cutoff.txt +3 -0
- data/test/data/KLF4_f2_scan_results_precise_mode.txt +4 -0
- data/test/data/SP1_f1.pat +12 -0
- data/test/data/SP1_f1_revcomp.pat +12 -0
- data/test/data/test_collection/GABPA_f1.pat +14 -0
- data/test/data/test_collection/KLF4_f2.pat +11 -0
- data/test/data/test_collection/SP1_f1.pat +12 -0
- data/test/data/test_collection.yaml +186 -0
- data/test/macroape_test.rb +125 -0
- metadata +116 -0
@@ -0,0 +1,80 @@
|
|
1
|
+
# Command-line tool: for every given score threshold prints the number of
# words scoring at least that threshold on the PWM and the matching P-value.
help_string = %q{
Command-line format:
ruby find_pvalue.rb <pat-file> <threshold list> [options]
or in linux
cat <pat-file> | ruby find_pvalue.rb .stdin <threshold> [options]
or on windows
type <pat-file> | ruby find_pvalue.rb .stdin <threshold> [options]

Options:
[-d <discretization level>]
[-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]

Output format:
threshold_1 count_1 pvalue_1
threshold_2 count_2 pvalue_2
threshold_3 count_3 pvalue_3
The results are printed out in the same order as in the given threshold list.

Examples:
ruby find_pvalue.rb motifs/KLF4.pat 7.32 -d 1000 -b 0.2 0.3 0.2 0.3
or on windows
type motifs/KLF4.pat | ruby find_pvalue.rb .stdin 7.32 4.31 5.42
or in linux
cat motifs/KLF4.pat | ruby find_pvalue.rb .stdin 7.32 4.31 5.42
}

require 'macroape'

if ARGV.empty? || %w[-h -help --help --h].any? { |flag| ARGV.include?(flag) }
  STDERR.puts help_string
  exit
end

# True when +arg+ is a string that Kernel#Float accepts (nil is not numeric).
numeric = lambda do |arg|
  begin
    Float(arg)
    true
  rescue ArgumentError, TypeError
    false
  end
end

# Takes the next command-line argument as the numeric value of option +flag+.
# Unlike String#to_f, a missing or malformed value is reported instead of
# silently becoming 0.0.
numeric_option = lambda do |flag|
  value = ARGV.shift
  raise "Option #{flag} requires a numeric argument, got #{value.inspect}" unless numeric.call(value)
  Float(value)
end

discretization = 10000
background = [1,1,1,1]
thresholds = []
begin
  filename = ARGV.shift

  # Leading numeric arguments form the threshold list; the first non-numeric
  # argument (or the end of ARGV) starts the options.
  thresholds << Float(ARGV.shift) while numeric.call(ARGV.first)

  raise "No input. You'd specify input source: filename or .stdin" unless filename
  raise 'You should specify at least one threshold' if thresholds.empty?

  until ARGV.empty?
    case ARGV.shift
    when '-b'
      values = ARGV.shift(4)
      unless values.size == 4 && values.all?(&numeric)
        raise "Option -b requires 4 numeric background probabilities, got #{values.inspect}"
      end
      background = values.map { |value| Float(value) }
    when '-d'
      discretization = numeric_option.call('-d')
    when '-m'
      PWM::MaxHashSize = numeric_option.call('-m')
    end
  end
  PWM::MaxHashSize = 1000000 unless defined? PWM::MaxHashSize

  if filename == '.stdin'
    pwm = PWM::SingleMatrix.load_from_stdin(STDIN)
  else
    raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
    pwm = PWM::SingleMatrix.load_pat(filename)
  end
  pwm = pwm.with_background(background)

  # Thresholds are scaled by the discretization rate because the counts are
  # computed on the discretized matrix.
  scaled_thresholds = thresholds.map { |threshold| threshold * discretization }
  counts = pwm.discrete(discretization).counts_by_thresholds(*scaled_thresholds)
  pvalues = counts.map { |count| count.to_f / pwm.number_of_words }
  pvalues.zip(thresholds, counts).each do |pvalue, threshold, count|
    puts "#{threshold}\t#{count}\t#{pvalue}"
  end
rescue => err
  STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
  # Signal the failure to the calling shell/script instead of exiting with 0.
  exit 1
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
# Command-line tool: for every requested P-value prints the score threshold
# of the PWM and the P-value actually achieved at that threshold.
help_string = %q{
Command-line format:
ruby find_threshold.rb <pat-file> [options]
or in linux
cat <pat-file> | ruby find_threshold.rb .stdin [options]
or on windows
type <pat-file> | ruby find_threshold.rb .stdin [options]

Options:
[-p <list of P-values>]
[-d <discretization level>]
[-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]

Output format:
requested_pvalue_1 threshold_1 achieved_pvalue_1
requested_pvalue_2 threshold_2 achieved_pvalue_2


Example:
ruby find_threshold.rb motifs/KLF4.pat -p 0.001 0.0001 0.0005 -d 1000 -b 0.4 0.3 0.2 0.1
}

require 'macroape'

if ARGV.empty? || %w[-h -help --help --h].any? { |flag| ARGV.include?(flag) }
  STDERR.puts help_string
  exit
end

# True when +arg+ is a string that Kernel#Float accepts (nil is not numeric).
numeric = lambda do |arg|
  begin
    Float(arg)
    true
  rescue ArgumentError, TypeError
    false
  end
end

# Takes the next command-line argument as the numeric value of option +flag+.
# Unlike String#to_f, a missing or malformed value is reported instead of
# silently becoming 0.0.
numeric_option = lambda do |flag|
  value = ARGV.shift
  raise "Option #{flag} requires a numeric argument, got #{value.inspect}" unless numeric.call(value)
  Float(value)
end

background = [1,1,1,1]
default_pvalues = [0.0005]
discretization = 10000

begin
  filename = ARGV.shift
  raise "No input. You'd specify input source: filename or .stdin" unless filename

  pvalues = []
  until ARGV.empty?
    case ARGV.shift
    when '-b'
      values = ARGV.shift(4)
      unless values.size == 4 && values.all?(&numeric)
        raise "Option -b requires 4 numeric background probabilities, got #{values.inspect}"
      end
      background = values.map { |value| Float(value) }
    when '-m'
      PWM::MaxHashSize = numeric_option.call('-m')
    when '-p'
      # The P-value list runs until the first non-numeric argument.
      pvalues << Float(ARGV.shift) while numeric.call(ARGV.first)
    when '-d'
      discretization = numeric_option.call('-d')
    end
  end
  pvalues = default_pvalues if pvalues.empty?

  PWM::MaxHashSize = 1000000 unless defined? PWM::MaxHashSize

  if filename == '.stdin'
    pwm = PWM::SingleMatrix.load_from_stdin(STDIN)
  else
    raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
    pwm = PWM::SingleMatrix.load_pat(filename)
  end

  pwm = pwm.with_background(background)

  # Thresholds come back on the discretized scale; dividing by the rate
  # reports them on the scale of the original matrix.
  pwm.discrete(discretization).thresholds(*pvalues) do |pvalue, threshold, real_pvalue|
    puts "#{pvalue}\t#{threshold / discretization}\t#{real_pvalue}"
  end
rescue => err
  STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
  # Signal the failure to the calling shell/script instead of exiting with 0.
  exit 1
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
# Command-line tool: computes rough and precise thresholds for every PWM in a
# folder and stores the resulting collection in a YAML file.
help_string = %q{
Command-line format:
ruby preprocess_collection.rb <folder with PWMs> [options]

Options:
[-p <list of P-values>]
[-d <rough discretization> <precise discretization>]
[-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]
[-o <output file>]
[--silent] - don't show current progress information during scan (by default this information's written into stderr)

The tool stores preprocessed PWM collection to the specified YAML-file.

Example:
ruby preprocess_collection.rb ./motifs -p 0.001 0.0005 0.0001 -d 1 10 -b 0.2 0.3 0.2 0.3 -o collection.yaml
}

require 'macroape'

if ARGV.empty? || %w[-h -help --help --h].any? { |flag| ARGV.include?(flag) }
  STDERR.puts help_string
  exit
end

# True when +arg+ is a string that Kernel#Float accepts (nil is not numeric).
numeric = lambda do |arg|
  begin
    Float(arg)
    true
  rescue ArgumentError, TypeError
    false
  end
end

# Takes the next +count+ command-line arguments as numeric values of option
# +flag+; a missing or malformed value is reported instead of becoming 0.0.
numeric_options = lambda do |flag, count|
  values = ARGV.shift(count)
  unless values.size == count && values.all?(&numeric)
    raise "Option #{flag} requires #{count} numeric argument(s), got #{values.inspect}"
  end
  values.map { |value| Float(value) }
end

default_pvalues = [0.0005]
background = [1,1,1,1]
rough_discretization = 1
precise_discretization = 10
output_file = 'collection.yaml'

begin
  folder = ARGV.shift
  raise "No input. You'd specify folder with pat-files" unless folder
  raise "Error! Folder #{folder} doesn't exist" unless Dir.exist?(folder)

  pvalues = []
  silent = false
  until ARGV.empty?
    case ARGV.shift
    when '-b'
      background = numeric_options.call('-b', 4)
      raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless background == background.reverse
    when '-p'
      # The P-value list runs until the first non-numeric argument.
      pvalues << Float(ARGV.shift) while numeric.call(ARGV.first)
    when '-d'
      # The smaller rate is always the rough one, whatever order was given.
      rough_discretization, precise_discretization = numeric_options.call('-d', 2).sort
    when '-o'
      output_file = ARGV.shift
    when '-m'
      PWM::MaxHashSize = numeric_options.call('-m', 1).first
    when '-md'
      PWMCompare::MaxHashSize = numeric_options.call('-md', 1).first
    when '--silent'
      silent = true
    end
  end
  pvalues = default_pvalues if pvalues.empty?

  PWM::MaxHashSize = 1000000 unless defined? PWM::MaxHashSize
  PWMCompare::MaxHashSize = 1000 unless defined? PWMCompare::MaxHashSize

  collection = PWM::Collection.new(rough_discretization, precise_discretization, background, pvalues)

  find_threshold_script = File.join(File.dirname(__FILE__), 'find_threshold.rb')

  # Runs find_threshold.rb on +pat_file+ with the given discretization and
  # returns {requested pvalue => threshold}. The command is passed as an
  # argument array, so no shell is involved and file names containing spaces
  # or shell metacharacters are handled safely.
  thresholds_for = lambda do |pat_file, discretization|
    command = ['ruby', find_threshold_script, pat_file,
               '-p', *pvalues.map(&:to_s),
               '-b', *background.map(&:to_s),
               '-d', discretization.to_s]
    report = IO.popen(command) { |pipe| pipe.read }
    thresholds = {}
    report.split("\n").each do |line|
      pvalue, threshold, _real_pvalue = line.split.map(&:to_f)
      thresholds[pvalue] = threshold
    end
    # An empty report means the helper failed; do not store an unusable entry.
    raise "find_threshold.rb produced no thresholds for #{pat_file}" if thresholds.empty?
    thresholds
  end

  Dir.glob(File.join(folder,'*')) do |filename|
    # Sub-directories cannot be parsed as pat-files; skip them.
    next unless File.file?(filename)
    STDERR.puts filename unless silent
    pwm = PWM::SingleMatrix.load_pat(filename)
    info = {
      rough: thresholds_for.call(filename, rough_discretization),
      precise: thresholds_for.call(filename, precise_discretization)
    }
    collection.add_pwm(pwm, info)
  end

  File.open(output_file,'w') do |f|
    f.puts(collection.to_yaml)
  end
rescue => err
  STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
  # Signal the failure to the calling shell/script instead of exiting with 0.
  exit 1
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
# Command-line tool: compares a query PWM with every PWM of a preprocessed
# collection and prints the most similar ones (Jaccard index), best first.
help_string = %q{
Command-line format:
ruby scan_collection.rb <pat-file> <collection> [options]
or in linux
cat <pat-file> | ruby scan_collection.rb .stdin <collection> [options]
or on windows
type <pat-file> | ruby scan_collection.rb .stdin <collection> [options]

Options:
[-p <P-value>]
[-c <similarity cutoff (minimal similarity to be included in output)> ] or [--all], '-c 0.05' by default
[--precise [<level, minimal similarity to check on a more precise discretization level on the second pass>]], off by default, '--precise 0.01' if level is not set
[--silent] - don't show current progress information during scan (by default this information's written into stderr)

Output format:
<name> <similarity jaccard index> <shift> <overlap> <orientation> * [in case that result calculated on the second pass(in precise mode)]
Attention! Name can contain whitespace characters.
Attention! The shift and orientation are reported for the collection matrix relative to the query matrix.

Example:
ruby scan_collection.rb motifs/KLF4.pat collection.yaml -p 0.005
or in linux
cat motifs/KLF4.pat | ruby scan_collection.rb .stdin collection.yaml -p 0.005 --precise 0.03
}

require 'macroape'

if ARGV.empty? || %w[-h -help --help --h].any? { |flag| ARGV.include?(flag) }
  STDERR.puts help_string
  exit
end

# True when +arg+ is a string that Kernel#Float accepts (nil is not numeric).
numeric = lambda do |arg|
  begin
    Float(arg)
    true
  rescue ArgumentError, TypeError
    false
  end
end

# Takes the next command-line argument as the numeric value of option +flag+.
# Unlike String#to_f, a missing or malformed value is reported instead of
# silently becoming 0.0.
numeric_option = lambda do |flag|
  value = ARGV.shift
  raise "Option #{flag} requires a numeric argument, got #{value.inspect}" unless numeric.call(value)
  Float(value)
end

begin
  filename = ARGV.shift
  collection_file = ARGV.shift
  raise "No input. You'd specify input source for pat: filename or .stdin" unless filename
  raise "No input. You'd specify input file with collection" unless collection_file
  raise "Collection file #{collection_file} doesn't exist" unless File.exist?(collection_file)

  pvalue = 0.0005
  cutoff = 0.05 # minimal similarity to output
  # The collection is a YAML dump of library objects. Psych 4 made
  # YAML.load_file a safe load that rejects such objects, so the explicit
  # unsafe loader is used where it exists.
  # SECURITY: this instantiates arbitrary objects - only load collection
  # files that come from a trusted source.
  collection = if YAML.respond_to?(:unsafe_load_file)
                 YAML.unsafe_load_file(collection_file)
               else
                 YAML.load_file(collection_file)
               end
  background_query = collection.background

  silent = false
  precision_mode = :rough
  until ARGV.empty?
    case ARGV.shift
    when '-bq'
      values = ARGV.shift(4)
      unless values.size == 4 && values.all?(&numeric)
        raise "Option -bq requires 4 numeric background probabilities, got #{values.inspect}"
      end
      background_query = values.map { |value| Float(value) }
      raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless background_query == background_query.reverse
    when '-p'
      pvalue = numeric_option.call('-p')
    when '-m'
      PWM::MaxHashSize = numeric_option.call('-m')
    when '-md'
      PWMCompare::MaxHashSize = numeric_option.call('-md')
    when '-c'
      cutoff = numeric_option.call('-c')
    when '--all'
      cutoff = 0.0
    when '--silent'
      silent = true
    when '--precise'
      precision_mode = :precise
      # The level is optional: it is consumed only when the next argument is numeric.
      # NOTE(review): the help text promises 0.01 as the default level while
      # the code uses 0.05 - confirm which one is intended.
      minimal_similarity = numeric.call(ARGV.first) ? Float(ARGV.shift) : 0.05
    end
  end
  PWM::MaxHashSize = 1000000 unless defined? PWM::MaxHashSize
  PWMCompare::MaxHashSize = 1000 unless defined? PWMCompare::MaxHashSize

  raise "Thresholds for pvalue #{pvalue} aren't presented in collection (#{collection.pvalues.join(', ')}). Use one of listed pvalues or recalculate the collection with needed pvalue" unless collection.pvalues.include? pvalue

  if filename == '.stdin'
    query_pwm = PWM::SingleMatrix.load_from_stdin(STDIN)
  else
    raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
    query_pwm = PWM::SingleMatrix.load_pat(filename)
  end

  query_pwm_rough = query_pwm.with_background(background_query).discrete(collection.rough_discretization)
  query_pwm_precise = query_pwm.with_background(background_query).discrete(collection.precise_discretization)

  threshold = query_pwm_rough.threshold(pvalue)
  threshold_precise = query_pwm_precise.threshold(pvalue)

  similarities = {}
  precision_file_mode = {}
  unnamed_index = 0

  collection.pwms.each_key do |key|
    pwm = collection.pwms[key]
    pwm_info = collection.infos[key]
    STDERR.puts pwm.name unless silent

    # First pass: compare on the rough discretization.
    cmp = PWMCompare::PWMCompare.new(query_pwm_rough, pwm.with_background(collection.background).discrete(collection.rough_discretization))
    info = cmp.jaccard(threshold, pwm_info[:rough][pvalue] * collection.rough_discretization)
    name = pwm.name || "Unnamed #{unnamed_index += 1}"
    precision_file_mode[name] = :rough

    # Second pass: promising matches are recomputed on the precise discretization.
    if precision_mode == :precise && info[:similarity] >= minimal_similarity
      cmp = PWMCompare::PWMCompare.new(query_pwm_precise, pwm.with_background(collection.background).discrete(collection.precise_discretization))
      info = cmp.jaccard(threshold_precise, pwm_info[:precise][pvalue] * collection.precise_discretization)
      precision_file_mode[name] = :precise
    end
    similarities[name] = info
  end

  puts "#pwm\tsimilarity\tshift\toverlap\torientation"
  ranked = similarities.sort_by { |_name, info| info[:similarity] }.reverse
  ranked.each do |name, info|
    next unless info[:similarity] >= cutoff
    # Results refined on the second pass are marked with a trailing asterisk column.
    precision_text = (precision_file_mode[name] == :precise) ? "\t*" : ""
    puts "#{name}\t#{info[:similarity]}\t#{info[:shift]}\t#{info[:overlap]}\t#{info[:orientation]}#{precision_text}"
  end

rescue => err
  STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
  # Signal the failure to the calling shell/script instead of exiting with 0.
  exit 1
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# Extracts the first PWM from a stream that may hold several of them.
#
# r_stream, w_stream are supposed to be the two ends of a pipe. Everything is
# read from r_stream, the first PWM is cut off, and the remaining data is
# pushed into a fresh pipe so the next call can continue from there:
#
#   ... --> w_stream --> r_stream --> data
#              ^                        |
#              |                        v
#   ... <-- w_stream <-- ... --> extracted pwm
#
# Returns [new_r_stream, new_w_stream, extracted_pwm] where extracted_pwm is
# an array of lines: the first non-blank line (treated as the header) followed
# by every consecutive line made of at least 4 numeric fields. new_w_stream is
# already closed. When the input is empty or contains only blank lines the
# result is [r_stream, w_stream, nil] (r_stream is left open in that case).
#
# NOTE(review): the remaining data is written into the new pipe before anyone
# reads from it, so a remainder larger than the OS pipe buffer would block.
def extract_pwm(r_stream, w_stream)
  lines = r_stream.readlines
  return [r_stream, w_stream, nil] if lines.empty?

  # Skip leading blank lines. A header found after skipping is stripped; a
  # header on the very first line is kept verbatim.
  header = lines.shift
  while header.chomp == ''
    # Only blank lines were present: there is no PWM to extract.
    return [r_stream, w_stream, nil] if lines.empty?
    header = lines.shift.strip
  end
  extracted_pwm = [header]

  r_stream.close

  numeric = lambda do |cell|
    begin
      Float(cell)
      true
    rescue ArgumentError
      false
    end
  end
  # A matrix row has at least 4 fields and all of them are numbers; anything
  # else (blank line, name of the next PWM) ends the current PWM.
  matrix_row = lambda do |text|
    cells = text.split
    cells.size >= 4 && cells.all?(&numeric)
  end

  extracted_pwm << lines.shift while !lines.empty? && matrix_row.call(lines.first)

  new_r_stream, new_w_stream = IO.pipe
  lines.each { |one_line| new_w_stream.write(one_line) }
  new_w_stream.close

  [new_r_stream, new_w_stream, extracted_pwm]
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module PWM
  # Normal-distribution estimates of the score distribution of a matrix.
  # The including class must provide +matrix+ (columns of 4 scores),
  # +probabilities+ (4 background weights) and +sum_of_probabilities+.
  module GaussEstimation
    # Expected score of a random word under the normalized background.
    def score_mean
      weights = probabilities.map { |weight| weight.to_f / sum_of_probabilities }
      total = 0.0
      matrix.each do |column|
        column_mean = 0.0
        4.times { |letter| column_mean += column[letter] * weights[letter] }
        total += column_mean
      end
      total
    end

    # Variance of the score of a random word: the sum over columns of
    # E[x^2] - (E[x])^2 under the normalized background.
    def score_variance
      weights = probabilities.map { |weight| weight.to_f / sum_of_probabilities }
      total = 0.0
      matrix.each do |column|
        second_moment = 0.0
        4.times { |letter| second_moment += column[letter]**2 * weights[letter] }
        first_moment = 0.0
        4.times { |letter| first_moment += column[letter] * weights[letter] }
        total = total + second_moment - first_moment**2
      end
      total
    end

    # Threshold estimated from the mean and the standard deviation of scores:
    # mean + inverf2(1 - 2 * pvalue) * sqrt(2) * sigma.
    def threshold_gauss_estimation(pvalue)
      sigma = Math.sqrt(score_variance)
      deviations = inverf2(1 - 2 * pvalue) * Math.sqrt(2)
      score_mean + deviations * sigma
    end

    # Closed-form approximation of the inverse error function; the sign of
    # the argument is carried over to the result.
    def inverf2(x)
      sign = x < 0 ? -1 : 1
      magnitude = x.abs
      a = 8 / (3 * Math::PI) * (Math::PI - 3) / (4 - Math::PI)
      log_term = Math.log(1 - magnitude * magnitude)
      offset = 2 / (Math::PI * a) + log_term / 2
      radicand = -1 / a * log_term + offset**2
      sign * Math.sqrt(-2 / (Math::PI * a) - log_term / 2 + Math.sqrt(radicand))
    end
  end

end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module PWM
  # Memoized summary values derived from +matrix+ (an array of score columns).
  # Call #refresh_infos after the matrix has been replaced.
  module MatrixInformation
    # Number of columns.
    def length
      @length ||= matrix.length
    end

    # Highest reachable score: the sum of the column maxima.
    def best_score
      @best_score ||= matrix.map(&:max).inject(0) { |sum, score| sum + score }
    end

    # Lowest reachable score: the sum of the column minima.
    def worst_score
      @worst_score ||= matrix.map(&:min).inject(0) { |sum, score| sum + score }
    end

    # best_suffix[i] is the best score of the suffix starting at column i;
    # the extra last element (empty suffix) is 0.
    def best_suffix
      @best_suffix ||= begin
        suffix = Array.new(length + 1, 0)
        (length - 1).downto(0) { |pos| suffix[pos] = matrix[pos].max + suffix[pos + 1] }
        suffix
      end
    end

    # worst_suffix[i] is the worst score of the suffix starting at column i;
    # the extra last element (empty suffix) is 0.
    def worst_suffix
      @worst_suffix ||= begin
        suffix = Array.new(length + 1, 0)
        (length - 1).downto(0) { |pos| suffix[pos] = matrix[pos].min + suffix[pos + 1] }
        suffix
      end
    end

    # Drops every memoized value; returns self so calls can be chained.
    def refresh_infos
      @length = nil
      @best_score = nil
      @worst_score = nil
      @best_suffix = nil
      @worst_suffix = nil
      self
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module PWM
  # A matrix together with the background weights of the 4 letters.
  class MatrixOnBackground < SingleMatrix
    # Background weights, one per letter.
    attr_reader :probabilities

    def initialize(matrix,background)
      super(matrix)
      @probabilities = background
    end

    # Sum of the background weights.
    # Deliberately not memoized: SingleMatrix#with_background copies every
    # instance variable of the source object, so a cached sum would survive a
    # change of background and silently corrupt number_of_words and the
    # P-values derived from it. Summing 4 numbers is cheap.
    def sum_of_probabilities
      probabilities.inject(0.0, &:+)
    end

    # Total weight of all words of the matrix length.
    def number_of_words
      sum_of_probabilities ** length
    end

    include GaussEstimation, ThresholdByPvalue, CountByThreshold
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module PWM
  # Non-destructive matrix transformations. Every method returns a modified
  # copy of the receiver and leaves the receiver untouched. The including
  # class must provide +matrix+, +length+ and +refresh_infos+.
  module MatrixTransformations
    # Matrix of the reverse complementary strand: the order of the columns
    # and the order of the letters inside each column are both reversed.
    def reverse_complement
      clone_and_transform( matrix.reverse.map(&:reverse) ).refresh_infos
    end

    # Prepends +n+ all-zero columns.
    def left_augment(n)
      # Array.new with a block creates a distinct array for every padding
      # column; [[0.0]*4]*n would repeat one shared array n times.
      clone_and_transform( Array.new(n) { [0.0] * 4 } + matrix ).refresh_infos
    end

    # Appends +n+ all-zero columns.
    def right_augment(n)
      clone_and_transform( matrix + Array.new(n) { [0.0] * 4 } ).refresh_infos
    end

    # Makes the worst score equal to 0 by shifting the scores of each column.
    def shift_to_zero
      shifted = matrix.map do |col|
        lowest = col.min # computed once per column, not once per cell
        col.map { |letter| letter - lowest }
      end
      clone_and_transform(shifted).refresh_infos
    end

    # Multiplies every score by +rate+ and rounds it up to an integer.
    def discrete(rate)
      clone_and_transform( matrix.map{|col| col.map{|letter| (letter * rate).ceil}} ).refresh_infos
    end

    # Splits the matrix into two: the first +length_of_first_part+ columns
    # and the remaining ones.
    def split(length_of_first_part)
      head = clone_and_transform(matrix.first(length_of_first_part)).refresh_infos
      tail = clone_and_transform(matrix.last(length - length_of_first_part)).refresh_infos
      [head, tail]
    end

    # Reorders columns: column i of the result is column permutation_index[i]
    # of the receiver.
    def permute_columns(permutation_index)
      clone_and_transform( permutation_index.map{|col| matrix[col]} ).refresh_infos
    end

    # Shallow copy of the receiver with @matrix replaced by +new_matrix+.
    def clone_and_transform(new_matrix)
      self.dup.instance_eval{ @matrix = new_matrix; self }
    end
  end
end
|
@@ -0,0 +1,9 @@
|
|
1
|
+
module PWMCompare
  # Similarity metrics of a pair of matrices over all their alignments.
  # The including class must provide +map_each_align+.
  module PairMetrics
    # Computes the Jaccard result of every alignment, merges the alignment
    # description into it and returns the entry with the highest :similarity.
    def jaccard(threshold_first, threshold_second)
      scored_alignments = map_each_align do |aligned_pair, alignment_info|
        aligned_pair.jaccard(threshold_first, threshold_second).merge(alignment_info)
      end
      scored_alignments.max_by { |scored| scored[:similarity] }
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module PWMCompare
  # Enumerates every alignment of two matrices. The including class must
  # provide +first+ and +second+.
  module PairTransformations
    include Enumerable

    # Yields, for each shift from -second.length to first.length and for both
    # orientations of the second matrix, a PWMCompareAligned built from the
    # left-padded matrices together with a hash describing the alignment
    # (:text, :shift, :orientation, :overlap, :alignment_length).
    def each
      reverse_complemented = second.reverse_complement
      shifts = (-second.length..first.length).to_a
      shifts.product([:direct, :revcomp]) do |shift, orientation|
        # A negative shift pads the first matrix, a positive one pads the second.
        first_padding = [-shift, 0].max
        second_padding = [shift, 0].max
        arrow = orientation == :direct ? '>' : '<'

        first_text = '.' * first_padding + '>' * first.length
        second_text = '.' * second_padding + arrow * second.length
        overlap = [first.length + first_padding, second.length + second_padding].min - shift.abs
        alignment_length = [first_text.length, second_text.length].max
        # Pad the shorter picture with dots on the right.
        first_text = first_text.ljust(alignment_length, '.')
        second_text = second_text.ljust(alignment_length, '.')

        oriented_second = orientation == :direct ? second : reverse_complemented
        yield(PWMCompareAligned.new(first.left_augment(first_padding),
                                    oriented_second.left_augment(second_padding)),
              text: "#{first_text}\n#{second_text}",
              shift: shift,
              orientation: orientation,
              overlap: overlap,
              alignment_length: alignment_length
             )
      end
    end

    alias :each_align :each
    alias :map_each_align :map
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module PWMCompare
  # A pair of matrices brought to a common length: the shorter one is padded
  # on the right with zero columns.
  class PWMCompareAligned
    attr_reader :first, :second, :length

    def initialize(first, second)
      common_length = [first.length, second.length].max
      @length = common_length
      @first = first.right_augment(common_length - first.length)
      @second = second.right_augment(common_length - second.length)
    end

    include AlignedPairTransformations, AlignedPairMetrics, AlignedPairIntersection

  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module PWM
  # A position weight matrix: an array of columns, 4 scores per column.
  class SingleMatrix
    attr_reader :matrix
    attr_accessor :name

    def initialize(matrix)
      @matrix = matrix
    end
    include MatrixTransformations, MatrixInformation

    # Builds a matrix from the text lines of a pat-file.
    #
    # Blank lines are ignored. If the first remaining line is not purely
    # numeric it is taken as the header: the matrix name is read from it (an
    # optional leading "> " is dropped) and overrides +name+. The remaining
    # lines are rows of numbers; when the first data row does not have
    # exactly 4 fields the table is transposed.
    #
    # Raises RuntimeError when no 4-letter matrix can be built (no data rows,
    # ragged rows, wrong number of letters).
    def self.build_matrix(lines, name = nil)
      unparsable = "PWM::SingleMatrix.build_matrix can't create matrix using this input"

      # Blank (typically trailing) lines would otherwise become empty rows
      # and make a valid file unparsable.
      rows = lines.reject { |line| line.strip.empty? }
      raise unparsable if rows.empty?

      header_is_numeric = begin
        rows.first.split.each { |token| Float(token) }
        true
      rescue ArgumentError
        false
      end

      pwm_name = name
      pwm_name = rows.shift.chomp.match(/(?:>\s)?(.*)$/)[1] unless header_is_numeric
      # A header without any data row is not a matrix.
      raise unparsable if rows.empty?

      cells = rows.map { |row| row.split.map(&:to_f) }
      # Ragged input can be neither used directly nor transposed.
      raise unparsable unless cells.all? { |row| row.length == cells.first.length }
      cells = cells.transpose unless cells.first.length == 4
      raise unparsable unless cells.all? { |row| row.length == 4 }

      pwm = SingleMatrix.new(cells)
      pwm.name = pwm_name
      pwm
    end

    # Reads the whole +input_stream+ and builds a matrix from its lines.
    def self.load_from_stdin(input_stream, name = nil)
      build_matrix(input_stream.readlines, name)
    end

    # Builds a matrix from an array of text lines.
    def self.load_from_line_array(lines, name = nil)
      build_matrix(lines, name)
    end

    # Loads a matrix from a pat-file; the file name without directory and
    # extension is the default name.
    def self.load_pat(filename)
      build_matrix(File.readlines(filename), File.basename_wo_ext(filename))
    end

    # Returns a deep copy of this matrix converted to a MatrixOnBackground
    # with the given background.
    def with_background(background)
      type_cast(MatrixOnBackground){@probabilities = background}.depth_dup
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Kernel
  # Creates an instance of +new_class+ without calling its initializer and
  # copies every instance variable of the receiver into it (the values are
  # shared, not duplicated). An optional block is evaluated in the context of
  # the new object, e.g. to set additional instance variables.
  # Returns the new object.
  def type_cast(new_class, &block)
    new_obj = new_class.allocate
    instance_variables.each do |varname|
      new_obj.instance_variable_set(varname, instance_variable_get(varname))
    end
    new_obj.instance_eval(&block) if block_given?
    new_obj
  end

  # Deep copy: duplicates the receiver, the elements of Arrays, the values of
  # Hashes and, recursively, every instance variable. Objects that cannot be
  # duplicated are returned as they are.
  #
  # Elements of collections are duplicated as well, so nested arrays (such as
  # the columns of a matrix) are no longer shared between the copy and the
  # original. Cyclic structures are not supported.
  def depth_dup
    new_obj = begin
      dup
    rescue StandardError
      # Immediate values raise on #dup in old rubies; they are safe to share.
      return self
    end

    case new_obj
    when Array
      new_obj.map!(&:depth_dup)
    when Hash
      new_obj.keys.each { |key| new_obj[key] = new_obj[key].depth_dup }
    end

    new_obj.instance_variables.each do |varname|
      new_obj.instance_variable_set(varname, new_obj.instance_variable_get(varname).depth_dup)
    end
    new_obj
  end

end
|
27
|
+
|
28
|
+
# Returns +filename+ (directory part included) without its extension.
def File.filename_wo_ext(filename)
  extension = File.extname(filename)
  filename[0, filename.length - extension.length]
end
|
31
|
+
|
32
|
+
# Returns the last path component of +filename+ without its extension.
def File.basename_wo_ext(filename)
  base = File.basename(filename)
  extension = File.extname(filename)
  base[0, base.length - extension.length]
end
|