macroape 3.3.3 → 3.3.4
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/Rakefile.rb +7 -22
- data/TODO.txt +7 -6
- data/bin/align_motifs +4 -0
- data/bin/eval_alignment +2 -1
- data/bin/eval_similarity +2 -1
- data/bin/find_pvalue +2 -1
- data/bin/find_threshold +2 -1
- data/bin/preprocess_collection +2 -1
- data/bin/scan_collection +2 -1
- data/lib/macroape/aligned_pair_intersection.rb +2 -3
- data/lib/macroape/cli/align_motifs.rb +49 -0
- data/lib/macroape/cli/eval_alignment.rb +124 -0
- data/lib/macroape/cli/eval_similarity.rb +107 -0
- data/lib/macroape/cli/find_pvalue.rb +89 -0
- data/lib/macroape/cli/find_threshold.rb +84 -0
- data/lib/macroape/cli/preprocess_collection.rb +123 -0
- data/lib/macroape/cli/scan_collection.rb +141 -0
- data/lib/macroape/cli.rb +5 -0
- data/lib/macroape/counting.rb +15 -1
- data/lib/macroape/pwm_compare.rb +21 -1
- data/lib/macroape/pwm_compare_aligned.rb +21 -0
- data/lib/macroape/version.rb +1 -1
- data/macroape.gemspec +1 -1
- data/test/align_motifs_test.rb +12 -0
- data/test/data/KLF3_f1.pat +16 -0
- data/test/data/KLF3_f1.pcm +16 -0
- data/test/data/KLF4_f2.pcm +11 -0
- data/test/data/SP1_f1.pat +11 -11
- data/test/data/SP1_f1.pcm +12 -0
- data/test/data/SP1_f1_revcomp.pat +11 -11
- data/test/data/SP1_f1_revcomp.pcm +12 -0
- data/test/data/test_collection/SP1_f1.pat +11 -11
- data/test/data/test_collection.yaml +49 -109
- data/test/data/test_collection_pcm/GABPA_f1.pcm +14 -0
- data/test/data/test_collection_pcm/KLF4_f2.pcm +11 -0
- data/test/data/test_collection_pcm/SP1_f1.pcm +12 -0
- data/test/data/test_collection_single_file.txt +38 -0
- data/test/data/test_collection_single_file_pcm.txt +38 -0
- data/test/eval_alignment_test.rb +31 -0
- data/test/eval_similarity_test.rb +28 -13
- data/test/find_pvalue_test.rb +10 -13
- data/test/find_threshold_test.rb +10 -5
- data/test/preprocess_collection_test.rb +36 -2
- data/test/scan_collection_test.rb +9 -4
- data/test/test_helper.rb +61 -2
- metadata +38 -12
- data/lib/macroape/exec/eval_alignment.rb +0 -125
- data/lib/macroape/exec/eval_similarity.rb +0 -108
- data/lib/macroape/exec/find_pvalue.rb +0 -81
- data/lib/macroape/exec/find_threshold.rb +0 -77
- data/lib/macroape/exec/preprocess_collection.rb +0 -101
- data/lib/macroape/exec/scan_collection.rb +0 -124
- data/test/eval_alignment_similarity_test.rb +0 -20
data/.gitignore
CHANGED
data/Rakefile.rb
CHANGED
@@ -1,34 +1,19 @@
|
|
1
1
|
#!/usr/bin/env rake
|
2
2
|
require "bundler/gem_tasks"
|
3
3
|
require 'rspec/core/rake_task'
|
4
|
+
require 'rake/testtask'
|
4
5
|
|
5
6
|
namespace :spec do
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
end
|
12
|
-
task :eval_similarity do
|
13
|
-
system("ruby -I ./test test/eval_similarity_test.rb")
|
14
|
-
end
|
15
|
-
task :eval_alignment_similarity do
|
16
|
-
system("ruby -I ./test test/eval_alignment_similarity_test.rb")
|
17
|
-
end
|
18
|
-
task :preprocess_collection do
|
19
|
-
system("ruby -I ./test test/preprocess_collection_test.rb")
|
20
|
-
end
|
21
|
-
task :scan_collection do
|
22
|
-
system("ruby -I ./test test/scan_collection_test.rb")
|
23
|
-
end
|
24
|
-
task :tests => [:find_threshold, :find_pvalue, :eval_similarity,
|
25
|
-
:eval_alignment_similarity, :scan_collection, :preprocess_collection]
|
26
|
-
|
7
|
+
Rake::TestTask.new do |t|
|
8
|
+
t.libs << "test"
|
9
|
+
t.test_files = FileList['test/*_test.rb']
|
10
|
+
t.verbose = true
|
11
|
+
end
|
27
12
|
RSpec::Core::RakeTask.new
|
28
13
|
end
|
29
14
|
|
30
15
|
desc 'Test all functionality of gem executables'
|
31
|
-
task :spec => ['spec:
|
16
|
+
task :spec => ['spec:test', 'spec:spec']
|
32
17
|
|
33
18
|
namespace :benchmark do
|
34
19
|
task :run do
|
data/TODO.txt
CHANGED
@@ -1,16 +1,13 @@
|
|
1
1
|
Absolutely necessary:
|
2
|
-
|
3
|
-
Make it available to load collections in preprocess_collection from single file (and from stdin of certainly)
|
4
|
-
Make it available to load PCM files with (it should be first preprocessed to PWMs in a standardized way) -- may be it's better to use pipeline
|
2
|
+
(already works in preprocess_collection and align_motifs) Make it possible to load PCM files (they should first be converted to PWMs in a standardized way) -- maybe it's better to use a pipeline
|
5
3
|
|
6
4
|
Specs and tests:
|
7
5
|
create spec on use of MaxHashSize, MaxHashSizeDouble
|
8
6
|
create spec for testing case when {real_pvalue == 0, threshold == best_score + 1}
|
9
|
-
create test for getting PWMs from stdin
|
10
7
|
create test for nonuniform word-wise background([1,1,1,1]) and for different backgrounds
|
11
8
|
|
12
|
-
Ideas to inctrease perfomance:
|
13
|
-
- Add shifting matrix elements to zero after discreeting - in such case worst suffix is zero at all positions
|
9
|
+
Ideas to increase performance:
|
10
|
+
- Shift matrix elements to zero after discretization - in that case the worst suffix is zero at all positions (??! this can significantly obscure the code because thresholds will change too, and I can't tell which is better: a slight performance optimization or conciseness of code)
|
14
11
|
- (?) Rearrange rows by decreasing DIC in the aligned pair of matrices before counting
|
15
12
|
- Create a Java extension for alignment_intersection methods in order to increase performance
|
16
13
|
- Possibly the algorithm shouldn't use a hash but should have two iterations: first it determines the possible hash scores for every length (if the worst suffix is always zero, it's a flat space of scores at all PWM prefix lengths) of each PWM separately. After that we can work with arrays which use such scores as indices via an additional substructure
|
@@ -18,3 +15,7 @@ Ideas to inctrease perfomance:
|
|
18
15
|
Usability issues:
|
19
16
|
review Collection class. Now it's completely useless. Maybe it should even be in another gem (with blackjack and clustering)
|
20
17
|
|
18
|
+
remove .stdin placeholder. Use tty? method instead
|
19
|
+
|
20
|
+
use OptionParser (??? can OptionParser get stub ARGV ???)
|
21
|
+
make options more uniform so that some of them are reusable (and the question: can I apply two option parsers consecutively?)
|
data/bin/align_motifs
ADDED
data/bin/eval_alignment
CHANGED
data/bin/eval_similarity
CHANGED
data/bin/find_pvalue
CHANGED
data/bin/find_threshold
CHANGED
data/bin/preprocess_collection
CHANGED
data/bin/scan_collection
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
module Macroape
|
2
2
|
class PWMCompareAligned
|
3
|
-
|
4
3
|
# unoptimized version of this and related methods
|
5
4
|
def counts_for_two_matrices(threshold_first, threshold_second)
|
6
5
|
# just not to call method each time
|
@@ -27,11 +26,11 @@ module Macroape
|
|
27
26
|
scores = { 0 => {0 => 1} }
|
28
27
|
length.times do |column|
|
29
28
|
new_scores = recalc_score_hash(scores,
|
30
|
-
|
29
|
+
first.matrix[column], second.matrix[column],
|
31
30
|
threshold_first - first.best_suffix(column + 1),
|
32
31
|
threshold_second - second.best_suffix(column + 1), &count_contribution_block)
|
33
32
|
scores.replace(new_scores)
|
34
|
-
if
|
33
|
+
if max_hash_size && scores.inject(0){|sum,hsh|sum + hsh.size} > max_hash_size
|
35
34
|
raise 'Hash overflow in Macroape::AlignedPairIntersection#counts_for_two_matrices_with_different_probabilities'
|
36
35
|
end
|
37
36
|
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'macroape'
|
2
|
+
|
3
|
+
module Macroape
|
4
|
+
module CLI
|
5
|
+
module AlignMotifs
|
6
|
+
|
7
|
+
def self.main(argv)
|
8
|
+
help_string = %q{
|
9
|
+
Usage:
|
10
|
+
ruby align_motifs pwm1_file pwm2_file pwm3_file
|
11
|
+
ruby align_motifs pcm1_file pcm2_file pcm3_file --pcm
|
12
|
+
Output:
|
13
|
+
pwm_1_file shift_1 orientation_1
|
14
|
+
pwm_2_file shift_2 orientation_2
|
15
|
+
pwm_3_file shift_3 orientation_3
|
16
|
+
}
|
17
|
+
|
18
|
+
if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
|
19
|
+
STDERR.puts help_string
|
20
|
+
exit
|
21
|
+
end
|
22
|
+
|
23
|
+
|
24
|
+
data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
|
25
|
+
leader = argv.shift
|
26
|
+
background = [1,1,1,1]
|
27
|
+
discretization = 10
|
28
|
+
pvalue = 0.0005
|
29
|
+
|
30
|
+
shifts = {leader => [0,:direct]}
|
31
|
+
pwm_first = data_model.new(File.read(leader)).to_pwm.background!(background).discrete!(discretization)
|
32
|
+
argv.each do |motif_name|
|
33
|
+
pwm_second = data_model.new(File.read(motif_name)).to_pwm.background!(background).discrete!(discretization)
|
34
|
+
cmp = Macroape::PWMCompare.new(pwm_first, pwm_second)
|
35
|
+
info = cmp.jaccard_by_pvalue(pvalue)
|
36
|
+
shifts[motif_name] = [info[:shift], info[:orientation]]
|
37
|
+
end
|
38
|
+
|
39
|
+
shifts.each do |motif_name, (shift,orientation)|
|
40
|
+
puts "#{motif_name}\t#{shift}\t#{orientation}"
|
41
|
+
end
|
42
|
+
|
43
|
+
rescue => err
|
44
|
+
STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
|
45
|
+
end
|
46
|
+
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
require 'macroape'
|
2
|
+
|
3
|
+
module Macroape
|
4
|
+
module CLI
|
5
|
+
module EvalAlignment
|
6
|
+
|
7
|
+
def self.main(argv)
|
8
|
+
help_string = %q{
|
9
|
+
Command-line format:
|
10
|
+
ruby eval_alignment.rb <1st matrix pat-file> <2nd matrix pat-file> <shift> <orientation(direct/revcomp)> [options]
|
11
|
+
type <1st matrix pat-file> <2nd matrix pat-file> | ruby eval_alignment.rb .stdin .stdin <shift> <orientation(direct/revcomp)> [options]
|
12
|
+
or in linux
|
13
|
+
cat <1st matrix pat-file> <2nd matrix pat-file> | ruby eval_alignment.rb .stdin .stdin <shift> <orientation(direct/revcomp)> [options]
|
14
|
+
|
15
|
+
Options:
|
16
|
+
[-p <P-value>]
|
17
|
+
[-d <discretization level>]
|
18
|
+
[-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]
|
19
|
+
|
20
|
+
Output format:
|
21
|
+
<jaccard similarity coefficient>
|
22
|
+
<number of words recognized by both 1st and 2nd matrices | probability to draw a word recognized by both 1st and 2nd matrices> <length of the given alignment>
|
23
|
+
<aligned 1st matrix>
|
24
|
+
<aligned 2nd matrix>
|
25
|
+
<shift> <orientation>
|
26
|
+
|
27
|
+
Examples:
|
28
|
+
ruby eval_alignment.rb motifs/KLF4_f2.pat motifs/SP1_f1.pat -1 direct -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
|
29
|
+
or on windows
|
30
|
+
type motifs/SP1.pat | ruby eval_alignment.rb motifs/KLF4.pat .stdin 0 revcomp -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
|
31
|
+
or in linux
|
32
|
+
cat motifs/KLF4.pat motifs/SP1.pat | ruby eval_alignment.rb .stdin .stdin 3 direct -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
|
33
|
+
}
|
34
|
+
|
35
|
+
if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
|
36
|
+
STDERR.puts help_string
|
37
|
+
exit
|
38
|
+
end
|
39
|
+
|
40
|
+
pvalue = 0.0005
|
41
|
+
discretization = 10
|
42
|
+
|
43
|
+
first_background = [1,1,1,1]
|
44
|
+
second_background = [1,1,1,1]
|
45
|
+
max_hash_size = 1000000
|
46
|
+
max_pair_hash_size = 1000
|
47
|
+
|
48
|
+
|
49
|
+
first_file = argv.shift
|
50
|
+
second_file = argv.shift
|
51
|
+
|
52
|
+
shift = argv.shift
|
53
|
+
orientation = argv.shift
|
54
|
+
|
55
|
+
raise "You'd specify two input sources (each is filename or .stdin)" unless first_file and second_file
|
56
|
+
raise 'You\'d specify shift' unless shift
|
57
|
+
raise 'You\'d specify orientation' unless orientation
|
58
|
+
|
59
|
+
shift = shift.to_i
|
60
|
+
orientation = orientation.to_sym
|
61
|
+
|
62
|
+
case orientation
|
63
|
+
when :direct
|
64
|
+
reverse = false
|
65
|
+
when :revcomp
|
66
|
+
reverse = true
|
67
|
+
else
|
68
|
+
raise 'Unknown orientation(direct/revcomp)'
|
69
|
+
end
|
70
|
+
|
71
|
+
|
72
|
+
until argv.empty?
|
73
|
+
case argv.shift
|
74
|
+
when '-p'
|
75
|
+
pvalue = argv.shift.to_f
|
76
|
+
when '-d'
|
77
|
+
discretization = argv.shift.to_f
|
78
|
+
when '-m'
|
79
|
+
max_hash_size = argv.shift.to_i
|
80
|
+
when '-md'
|
81
|
+
max_pair_hash_size = argv.shift.to_i
|
82
|
+
when '-b'
|
83
|
+
second_background = first_background = argv.shift(4).map(&:to_f)
|
84
|
+
when '-b1'
|
85
|
+
first_background = argv.shift(4).map(&:to_f)
|
86
|
+
when '-b2'
|
87
|
+
second_background = argv.shift(4).map(&:to_f)
|
88
|
+
end
|
89
|
+
end
|
90
|
+
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background == first_background.reverse
|
91
|
+
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless second_background == second_background.reverse
|
92
|
+
|
93
|
+
parser = Bioinform::StringParser.new($stdin.read) if first_file == '.stdin' || second_file == '.stdin'
|
94
|
+
|
95
|
+
if first_file == '.stdin'
|
96
|
+
pwm_first = Bioinform::PWM.new( parser.parse )
|
97
|
+
else
|
98
|
+
raise "Error! File #{first_file} don't exist" unless File.exist?(first_file)
|
99
|
+
pwm_first = Bioinform::PWM.new(File.read(first_file))
|
100
|
+
end
|
101
|
+
|
102
|
+
if second_file == '.stdin'
|
103
|
+
pwm_second = Bioinform::PWM.new( parser.parse )
|
104
|
+
else
|
105
|
+
raise "Error! File #{second_file} don't exist" unless File.exist?(second_file)
|
106
|
+
pwm_second = Bioinform::PWM.new(File.read(second_file))
|
107
|
+
end
|
108
|
+
|
109
|
+
pwm_first.background!(first_background).max_hash_size!(max_hash_size).discrete!(discretization)
|
110
|
+
pwm_second.background!(second_background).max_hash_size!(max_hash_size).discrete!(discretization)
|
111
|
+
|
112
|
+
cmp = Macroape::PWMCompareAligned.new(pwm_first, pwm_second, shift, orientation).max_hash_size(max_pair_hash_size)
|
113
|
+
|
114
|
+
info = cmp.alignment_infos.merge( cmp.jaccard_by_pvalue(pvalue) )
|
115
|
+
|
116
|
+
puts "#{info[:similarity]}\n#{info[:recognized_by_both]}\t#{info[:alignment_length]}\n#{info[:text]}\n#{info[:shift]}\t#{info[:orientation]}"
|
117
|
+
|
118
|
+
rescue => err
|
119
|
+
STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
|
120
|
+
end
|
121
|
+
|
122
|
+
end
|
123
|
+
end
|
124
|
+
end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
require 'macroape'
|
2
|
+
|
3
|
+
module Macroape
|
4
|
+
module CLI
|
5
|
+
module EvalSimilarity
|
6
|
+
|
7
|
+
def self.main(argv)
|
8
|
+
help_string = %q{
|
9
|
+
Command-line format:
|
10
|
+
ruby eval_similarity.rb <1st matrix pat-file> <2nd matrix pat-file> [options]
|
11
|
+
or on windows
|
12
|
+
type <1st matrix pat-file> <2nd matrix pat-file> | ruby eval_similarity.rb .stdin .stdin [options]
|
13
|
+
or in linux
|
14
|
+
cat <1st matrix pat-file> <2nd matrix pat-file> | ruby eval_similarity.rb .stdin .stdin [options]
|
15
|
+
|
16
|
+
Options:
|
17
|
+
[-p <P-value>]
|
18
|
+
[-d <discretization level>]
|
19
|
+
[-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]
|
20
|
+
|
21
|
+
Output has format:
|
22
|
+
<jaccard similarity coefficient>
|
23
|
+
<number of words recognized by both 1st and 2nd matrices | probability to draw a word recognized by both 1st and 2nd matrices> <length of the optimal alignment>
|
24
|
+
<optimal alignment, the 1st matrix>
|
25
|
+
<optimal alignment, the 2nd matrix>
|
26
|
+
<shift> <orientation>
|
27
|
+
|
28
|
+
Examples:
|
29
|
+
ruby eval_similarity.rb motifs/KLF4.pat motifs/SP1.pat -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
|
30
|
+
or on windows
|
31
|
+
type motifs/SP1.pat | ruby eval_similarity.rb motifs/KLF4.pat .stdin -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
|
32
|
+
or in linux
|
33
|
+
cat motifs/KLF4.pat motifs/SP1.pat | ruby eval_similarity.rb .stdin .stdin -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
|
34
|
+
}
|
35
|
+
|
36
|
+
if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
|
37
|
+
STDERR.puts help_string
|
38
|
+
exit
|
39
|
+
end
|
40
|
+
|
41
|
+
pvalue = 0.0005
|
42
|
+
discretization = 10
|
43
|
+
|
44
|
+
first_background = [1,1,1,1]
|
45
|
+
second_background = [1,1,1,1]
|
46
|
+
|
47
|
+
max_hash_size = 1000000
|
48
|
+
max_pair_hash_size = 1000
|
49
|
+
|
50
|
+
|
51
|
+
first_file = argv.shift
|
52
|
+
second_file = argv.shift
|
53
|
+
raise "You'd specify two input sources (each is filename or .stdin)" unless first_file and second_file
|
54
|
+
|
55
|
+
until argv.empty?
|
56
|
+
case argv.shift
|
57
|
+
when '-p'
|
58
|
+
pvalue = argv.shift.to_f
|
59
|
+
when '-d'
|
60
|
+
discretization = argv.shift.to_f
|
61
|
+
when '-m'
|
62
|
+
max_hash_size = argv.shift.to_i
|
63
|
+
when '-md'
|
64
|
+
max_pair_hash_size = argv.shift.to_i
|
65
|
+
when '-b'
|
66
|
+
second_background = first_background = argv.shift(4).map(&:to_f)
|
67
|
+
when '-b1'
|
68
|
+
first_background = argv.shift(4).map(&:to_f)
|
69
|
+
when '-b2'
|
70
|
+
second_background = argv.shift(4).map(&:to_f)
|
71
|
+
end
|
72
|
+
end
|
73
|
+
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background == first_background.reverse
|
74
|
+
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless second_background == second_background.reverse
|
75
|
+
|
76
|
+
parser = Bioinform::StringParser.new($stdin.read) if first_file == '.stdin' || second_file == '.stdin'
|
77
|
+
|
78
|
+
if first_file == '.stdin'
|
79
|
+
pwm_first = Bioinform::PWM.new( parser.parse )
|
80
|
+
else
|
81
|
+
raise "Error! File #{first_file} don't exist" unless File.exist?(first_file)
|
82
|
+
pwm_first = Bioinform::PWM.new(File.read(first_file))
|
83
|
+
end
|
84
|
+
|
85
|
+
if second_file == '.stdin'
|
86
|
+
pwm_second = Bioinform::PWM.new( parser.parse )
|
87
|
+
else
|
88
|
+
raise "Error! File #{second_file} don't exist" unless File.exist?(second_file)
|
89
|
+
pwm_second = Bioinform::PWM.new(File.read(second_file))
|
90
|
+
end
|
91
|
+
|
92
|
+
pwm_first.background!(first_background).max_hash_size!(max_hash_size).discrete!(discretization)
|
93
|
+
pwm_second.background!(second_background).max_hash_size!(max_hash_size).discrete!(discretization)
|
94
|
+
|
95
|
+
cmp = Macroape::PWMCompare.new(pwm_first, pwm_second).max_hash_size(max_pair_hash_size)
|
96
|
+
|
97
|
+
info = cmp.jaccard_by_pvalue(pvalue)
|
98
|
+
|
99
|
+
puts "#{info[:similarity]}\n#{info[:recognized_by_both]}\t#{info[:alignment_length]}\n#{info[:text]}\n#{info[:shift]}\t#{info[:orientation]}"
|
100
|
+
|
101
|
+
rescue => err
|
102
|
+
STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
|
103
|
+
end
|
104
|
+
|
105
|
+
end
|
106
|
+
end
|
107
|
+
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
require 'macroape'
|
2
|
+
|
3
|
+
module Macroape
|
4
|
+
module CLI
|
5
|
+
module FindPValue
|
6
|
+
|
7
|
+
def self.main(argv)
|
8
|
+
help_string = %q{
|
9
|
+
Command-line format:
|
10
|
+
ruby find_pvalue.rb <pat-file> <threshold list> [options]
|
11
|
+
or in linux
|
12
|
+
cat <pat-file> | ruby find_pvalue.rb .stdin <threshold> [options]
|
13
|
+
or on windows
|
14
|
+
type <pat-file> | ruby find_pvalue.rb .stdin <threshold> [options]
|
15
|
+
|
16
|
+
Options:
|
17
|
+
[-d <discretization level>]
|
18
|
+
[-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]
|
19
|
+
|
20
|
+
Output format:
|
21
|
+
threshold_1 count_1 pvalue_1
|
22
|
+
threshold_2 count_2 pvalue_2
|
23
|
+
threshold_3 count_3 pvalue_3
|
24
|
+
The results are printed out in the same order as in the given threshold list.
|
25
|
+
|
26
|
+
Examples:
|
27
|
+
ruby find_pvalue.rb motifs/KLF4.pat 7.32 -d 1000 -b 0.2 0.3 0.2 0.3
|
28
|
+
or on windows
|
29
|
+
type motifs/KLF4.pat | ruby find_pvalue.rb .stdin 7.32 4.31 5.42
|
30
|
+
or in linux
|
31
|
+
cat motifs/KLF4.pat | ruby find_pvalue.rb .stdin 7.32 4.31 5.42
|
32
|
+
}
|
33
|
+
|
34
|
+
if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
|
35
|
+
STDERR.puts help_string
|
36
|
+
exit
|
37
|
+
end
|
38
|
+
|
39
|
+
discretization = 10000
|
40
|
+
background = [1,1,1,1]
|
41
|
+
thresholds = []
|
42
|
+
max_hash_size = 1000000
|
43
|
+
|
44
|
+
filename = argv.shift
|
45
|
+
|
46
|
+
loop do
|
47
|
+
begin
|
48
|
+
Float(argv.first)
|
49
|
+
thresholds << argv.shift.to_f
|
50
|
+
rescue
|
51
|
+
raise StopIteration
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
raise "No input. You'd specify input source: filename or .stdin" unless filename
|
56
|
+
raise 'You should specify at least one threshold' if thresholds.empty?
|
57
|
+
|
58
|
+
until argv.empty?
|
59
|
+
case argv.shift
|
60
|
+
when '-b'
|
61
|
+
background = argv.shift(4).map(&:to_f)
|
62
|
+
when '-d'
|
63
|
+
discretization = argv.shift.to_f
|
64
|
+
when '-m'
|
65
|
+
max_hash_size = argv.shift.to_i
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
|
70
|
+
if filename == '.stdin'
|
71
|
+
pwm = Bioinform::PWM.new( $stdin.read )
|
72
|
+
else
|
73
|
+
raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
|
74
|
+
pwm = Bioinform::PWM.new( File.read(filename) )
|
75
|
+
end
|
76
|
+
pwm.background!(background).max_hash_size!(max_hash_size).discrete!(discretization)
|
77
|
+
|
78
|
+
counts = pwm.counts_by_thresholds(* thresholds.map{|count| count * discretization})
|
79
|
+
pvalues = counts.map{|count| count.to_f / pwm.vocabulary_volume}
|
80
|
+
pvalues.zip(thresholds,counts).each{|pvalue,threshold,count|
|
81
|
+
puts "#{threshold}\t#{count}\t#{pvalue}"
|
82
|
+
}
|
83
|
+
rescue => err
|
84
|
+
STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
|
85
|
+
end
|
86
|
+
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'macroape'
|
2
|
+
|
3
|
+
module Macroape
|
4
|
+
module CLI
|
5
|
+
module FindThreshold
|
6
|
+
|
7
|
+
def self.main(argv)
|
8
|
+
help_string = %q{
|
9
|
+
Command-line format::
|
10
|
+
ruby find_threshold.rb <pat-file> [options]
|
11
|
+
or in linux
|
12
|
+
cat <pat-file> | ruby find_threshold.rb .stdin [options]
|
13
|
+
or on windows
|
14
|
+
type <pat-file> | ruby find_threshold.rb .stdin [options]
|
15
|
+
|
16
|
+
Options:
|
17
|
+
[-p <list of P-values>]
|
18
|
+
[-d <discretization level>]
|
19
|
+
[-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]
|
20
|
+
|
21
|
+
Output format:
|
22
|
+
requested_pvalue_1 threshold_1 achieved_pvalue_1
|
23
|
+
requested_pvalue_2 threshold_2 achieved_pvalue_2
|
24
|
+
|
25
|
+
|
26
|
+
Example:
|
27
|
+
ruby find_threshold.rb motifs/KLF4.pat -p 0.001 0.0001 0.0005 -d 1000 -b 0.4 0.3 0.2 0.1
|
28
|
+
}
|
29
|
+
|
30
|
+
if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
|
31
|
+
STDERR.puts help_string
|
32
|
+
exit
|
33
|
+
end
|
34
|
+
|
35
|
+
background = [1,1,1,1]
|
36
|
+
default_pvalues = [0.0005]
|
37
|
+
discretization = 10000
|
38
|
+
max_hash_size = 1000000
|
39
|
+
|
40
|
+
filename = argv.shift
|
41
|
+
raise "No input. You'd specify input source: filename or .stdin" unless filename
|
42
|
+
|
43
|
+
pvalues = []
|
44
|
+
until argv.empty?
|
45
|
+
case argv.shift
|
46
|
+
when '-b'
|
47
|
+
background = argv.shift(4).map(&:to_f)
|
48
|
+
when '-m'
|
49
|
+
max_hash_size = argv.shift.to_i
|
50
|
+
when '-p'
|
51
|
+
loop do
|
52
|
+
begin
|
53
|
+
Float(argv.first)
|
54
|
+
pvalues << argv.shift.to_f
|
55
|
+
rescue
|
56
|
+
raise StopIteration
|
57
|
+
end
|
58
|
+
end
|
59
|
+
when '-d'
|
60
|
+
discretization = argv.shift.to_f
|
61
|
+
end
|
62
|
+
end
|
63
|
+
pvalues = default_pvalues if pvalues.empty?
|
64
|
+
|
65
|
+
if filename == '.stdin'
|
66
|
+
pwm = Bioinform::PWM.new( $stdin.read )
|
67
|
+
else
|
68
|
+
raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
|
69
|
+
pwm = Bioinform::PWM.new( File.read(filename) )
|
70
|
+
end
|
71
|
+
|
72
|
+
pwm.background!(background).max_hash_size!(max_hash_size).discrete!(discretization)
|
73
|
+
|
74
|
+
pwm.thresholds(*pvalues) do |pvalue, threshold, real_pvalue|
|
75
|
+
puts "#{pvalue}\t#{threshold / discretization}\t#{real_pvalue}"
|
76
|
+
end
|
77
|
+
|
78
|
+
rescue => err
|
79
|
+
STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
|
80
|
+
end
|
81
|
+
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|