macroape 3.3.7 → 3.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +2 -2
- data/Rakefile.rb +6 -6
- data/TODO.txt +23 -3
- data/benchmark/similarity_benchmark.rb +18 -18
- data/lib/macroape/aligned_pair_intersection.rb +4 -4
- data/lib/macroape/cli/align_motifs.rb +34 -28
- data/lib/macroape/cli/eval_alignment.rb +73 -47
- data/lib/macroape/cli/eval_similarity.rb +65 -40
- data/lib/macroape/cli/find_pvalue.rb +30 -34
- data/lib/macroape/cli/find_threshold.rb +52 -41
- data/lib/macroape/cli/preprocess_collection.rb +68 -58
- data/lib/macroape/cli/scan_collection.rb +89 -73
- data/lib/macroape/cli.rb +184 -1
- data/lib/macroape/counting.rb +31 -5
- data/lib/macroape/pwm_compare.rb +8 -2
- data/lib/macroape/pwm_compare_aligned.rb +15 -10
- data/lib/macroape/version.rb +2 -1
- data/macroape.gemspec +2 -1
- data/spec/count_distribution_spec.rb +11 -11
- data/test/align_motifs_test.rb +16 -4
- data/test/data/{AHR_si.pat → AHR_si.pwm} +0 -0
- data/test/data/{KLF3_f1.pat → KLF3_f1.pwm} +0 -0
- data/test/data/{KLF4_f2.pat → KLF4_f2.pwm} +0 -0
- data/test/data/KLF4_f2_scan_results_all.txt +1 -2
- data/test/data/KLF4_f2_scan_results_default_cutoff.txt +1 -2
- data/test/data/KLF4_f2_scan_results_precise_mode.txt +1 -2
- data/test/data/KLF4_f2_scan_results_weak_threshold.txt +2 -0
- data/test/data/{SP1_f1.pat → SP1_f1.pwm} +0 -0
- data/test/data/{SP1_f1_revcomp.pat → SP1_f1_revcomp.pwm} +0 -0
- data/test/data/collection_pcm_without_thresholds.yaml +186 -183
- data/test/data/collection_without_thresholds.yaml +186 -183
- data/test/data/{medium_motif.pat → medium_motif.pwm} +0 -0
- data/test/data/{short_motif.pat → short_motif.pwm} +0 -0
- data/test/data/test_collection/{GABPA_f1.pat → GABPA_f1.pwm} +0 -0
- data/test/data/test_collection/{KLF4_f2.pat → KLF4_f2.pwm} +0 -0
- data/test/data/test_collection/{SP1_f1.pat → SP1_f1.pwm} +0 -0
- data/test/data/test_collection.yaml +179 -176
- data/test/data/test_collection_weak.yaml +214 -0
- data/test/eval_alignment_test.rb +97 -21
- data/test/eval_similarity_test.rb +104 -26
- data/test/find_pvalue_test.rb +22 -9
- data/test/find_threshold_test.rb +76 -25
- data/test/preprocess_collection_test.rb +16 -21
- data/test/scan_collection_test.rb +26 -14
- data/test/test_helper.rb +96 -12
- metadata +44 -24
@@ -3,43 +3,31 @@ require_relative '../../macroape'
|
|
3
3
|
module Macroape
|
4
4
|
module CLI
|
5
5
|
module FindPValue
|
6
|
-
|
6
|
+
|
7
7
|
def self.main(argv)
|
8
|
-
|
8
|
+
doc = <<-EOS.strip_doc
|
9
9
|
Command-line format:
|
10
|
-
|
11
|
-
or in linux
|
12
|
-
cat <pat-file> | ruby find_pvalue.rb .stdin <threshold> [options]
|
13
|
-
or on windows
|
14
|
-
type <pat-file> | ruby find_pvalue.rb .stdin <threshold> [options]
|
10
|
+
#{run_tool_cmd} <pat-file> <threshold list>... [options]
|
15
11
|
|
16
12
|
Options:
|
17
13
|
[-d <discretization level>]
|
18
|
-
[-
|
19
|
-
|
20
|
-
Output format:
|
21
|
-
threshold_1 count_1 pvalue_1
|
22
|
-
threshold_2 count_2 pvalue_2
|
23
|
-
threshold_3 count_3 pvalue_3
|
24
|
-
The results are printed out in the same order as in the given threshold list.
|
14
|
+
[--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
|
15
|
+
[-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
|
25
16
|
|
26
17
|
Examples:
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
|
35
|
-
STDERR.puts help_string
|
18
|
+
#{run_tool_cmd} motifs/KLF4_f2.pat 7.32
|
19
|
+
#{run_tool_cmd} motifs/KLF4_f2.pat 7.32 4.31 5.42 -d 1000 -b 0.2,0.3,0.3,0.2
|
20
|
+
EOS
|
21
|
+
|
22
|
+
if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
|
23
|
+
STDERR.puts doc
|
36
24
|
exit
|
37
25
|
end
|
38
26
|
|
39
27
|
discretization = 10000
|
40
28
|
background = [1,1,1,1]
|
41
29
|
thresholds = []
|
42
|
-
max_hash_size =
|
30
|
+
max_hash_size = 10000000
|
43
31
|
|
44
32
|
data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
|
45
33
|
filename = argv.shift
|
@@ -53,21 +41,21 @@ module Macroape
|
|
53
41
|
end
|
54
42
|
end
|
55
43
|
|
56
|
-
raise
|
44
|
+
raise 'No input. You should specify input file' unless filename
|
57
45
|
raise 'You should specify at least one threshold' if thresholds.empty?
|
58
46
|
|
59
47
|
until argv.empty?
|
60
48
|
case argv.shift
|
61
49
|
when '-b'
|
62
|
-
background = argv.shift(
|
50
|
+
background = argv.shift.split(',').map(&:to_f)
|
63
51
|
when '-d'
|
64
52
|
discretization = argv.shift.to_f
|
65
|
-
when '-
|
53
|
+
when '--max-hash-size'
|
66
54
|
max_hash_size = argv.shift.to_i
|
67
55
|
end
|
68
56
|
end
|
69
57
|
|
70
|
-
|
58
|
+
|
71
59
|
if filename == '.stdin'
|
72
60
|
input = $stdin.read
|
73
61
|
else
|
@@ -78,14 +66,22 @@ module Macroape
|
|
78
66
|
pwm.set_parameters(background: background, max_hash_size: max_hash_size).discrete!(discretization)
|
79
67
|
|
80
68
|
counts = pwm.counts_by_thresholds(* thresholds.map{|count| count * discretization})
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
69
|
+
infos = []
|
70
|
+
thresholds.each do |threshold|
|
71
|
+
count = counts[threshold * discretization]
|
72
|
+
pvalue = count.to_f / pwm.vocabulary_volume
|
73
|
+
infos << {threshold: threshold,
|
74
|
+
number_of_recognized_words: count,
|
75
|
+
pvalue: pvalue}
|
76
|
+
end
|
77
|
+
|
78
|
+
puts Helper.find_pvalue_info_string( infos,
|
79
|
+
{discretization: discretization,
|
80
|
+
background: background} )
|
85
81
|
rescue => err
|
86
|
-
STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse
|
82
|
+
STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
|
87
83
|
end
|
88
|
-
|
84
|
+
|
89
85
|
end
|
90
86
|
end
|
91
87
|
end
|
@@ -3,65 +3,64 @@ require_relative '../../macroape'
|
|
3
3
|
module Macroape
|
4
4
|
module CLI
|
5
5
|
module FindThreshold
|
6
|
-
|
6
|
+
|
7
7
|
def self.main(argv)
|
8
|
-
|
9
|
-
Command-line format
|
10
|
-
|
11
|
-
or in linux
|
12
|
-
cat <pat-file> | ruby find_threshold.rb .stdin [options]
|
13
|
-
or on windows
|
14
|
-
type <pat-file> | ruby find_threshold.rb .stdin [options]
|
8
|
+
doc = <<-EOS.strip_doc
|
9
|
+
Command-line format:
|
10
|
+
#{run_tool_cmd} <pat-file> [<list of P-values>...] [options]
|
15
11
|
|
16
12
|
Options:
|
17
|
-
[-p <list of P-values>]
|
18
13
|
[-d <discretization level>]
|
19
|
-
[-
|
20
|
-
|
21
|
-
|
22
|
-
requested_pvalue_1 threshold_1 achieved_pvalue_1
|
23
|
-
requested_pvalue_2 threshold_2 achieved_pvalue_2
|
24
|
-
|
14
|
+
[--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
|
15
|
+
[--boundary lower|upper] Lower boundary (default) means that the obtained P-value is less than or equal to the requested P-value
|
16
|
+
[-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
|
25
17
|
|
26
18
|
Example:
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
19
|
+
#{run_tool_cmd} motifs/KLF4_f2.pat
|
20
|
+
#{run_tool_cmd} motifs/KLF4_f2.pat 0.001 0.0001 0.0005 -d 1000 -b 0.4,0.3,0.2,0.1
|
21
|
+
EOS
|
22
|
+
|
23
|
+
if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
|
24
|
+
STDERR.puts doc
|
32
25
|
exit
|
33
26
|
end
|
34
|
-
|
27
|
+
|
35
28
|
background = [1,1,1,1]
|
36
29
|
default_pvalues = [0.0005]
|
37
30
|
discretization = 10000
|
38
|
-
max_hash_size =
|
39
|
-
data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
|
31
|
+
max_hash_size = 10000000
|
32
|
+
data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
|
33
|
+
|
34
|
+
pvalue_boundary = :lower
|
35
|
+
|
40
36
|
|
41
37
|
filename = argv.shift
|
42
|
-
raise
|
38
|
+
raise 'No input. You should specify input file' unless filename
|
43
39
|
|
44
40
|
pvalues = []
|
41
|
+
loop do
|
42
|
+
begin
|
43
|
+
Float(argv.first)
|
44
|
+
pvalues << argv.shift.to_f
|
45
|
+
rescue
|
46
|
+
raise StopIteration
|
47
|
+
end
|
48
|
+
end
|
49
|
+
pvalues = default_pvalues if pvalues.empty?
|
50
|
+
|
45
51
|
until argv.empty?
|
46
52
|
case argv.shift
|
47
53
|
when '-b'
|
48
|
-
background = argv.shift(
|
49
|
-
when '-
|
54
|
+
background = argv.shift.split(',').map(&:to_f)
|
55
|
+
when '--max-hash-size'
|
50
56
|
max_hash_size = argv.shift.to_i
|
51
|
-
when '-p'
|
52
|
-
loop do
|
53
|
-
begin
|
54
|
-
Float(argv.first)
|
55
|
-
pvalues << argv.shift.to_f
|
56
|
-
rescue
|
57
|
-
raise StopIteration
|
58
|
-
end
|
59
|
-
end
|
60
57
|
when '-d'
|
61
58
|
discretization = argv.shift.to_f
|
59
|
+
when '--boundary'
|
60
|
+
pvalue_boundary = argv.shift.to_sym
|
61
|
+
raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
|
62
62
|
end
|
63
63
|
end
|
64
|
-
pvalues = default_pvalues if pvalues.empty?
|
65
64
|
|
66
65
|
if filename == '.stdin'
|
67
66
|
input = $stdin.read
|
@@ -72,14 +71,26 @@ module Macroape
|
|
72
71
|
pwm = data_model.new(input).to_pwm
|
73
72
|
pwm.set_parameters(background: background, max_hash_size: max_hash_size).discrete!(discretization)
|
74
73
|
|
75
|
-
|
76
|
-
|
74
|
+
infos = []
|
75
|
+
collect_infos_proc = ->(pvalue, threshold, real_pvalue) do
|
76
|
+
infos << {expected_pvalue: pvalue,
|
77
|
+
threshold: threshold / discretization,
|
78
|
+
real_pvalue: real_pvalue,
|
79
|
+
recognized_words: pwm.vocabulary_volume * real_pvalue }
|
77
80
|
end
|
78
|
-
|
81
|
+
if pvalue_boundary == :lower
|
82
|
+
pwm.thresholds(*pvalues, &collect_infos_proc)
|
83
|
+
else
|
84
|
+
pwm.weak_thresholds(*pvalues, &collect_infos_proc)
|
85
|
+
end
|
86
|
+
puts Helper.threshold_infos_string(infos,
|
87
|
+
{discretization: discretization,
|
88
|
+
background: background,
|
89
|
+
pvalue_boundary: pvalue_boundary} )
|
79
90
|
rescue => err
|
80
|
-
STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse
|
91
|
+
STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
|
81
92
|
end
|
82
|
-
|
93
|
+
|
83
94
|
end
|
84
95
|
end
|
85
96
|
end
|
@@ -5,87 +5,76 @@ require 'shellwords'
|
|
5
5
|
module Macroape
|
6
6
|
module CLI
|
7
7
|
module PreprocessCollection
|
8
|
-
|
8
|
+
|
9
9
|
def self.main(argv)
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
STDERR.puts help_string
|
10
|
+
doc = <<-EOS.strip_doc
|
11
|
+
Command-line format:
|
12
|
+
#{run_tool_cmd} <file or folder with PWMs or .stdin with filenames> <output file> [options]
|
13
|
+
|
14
|
+
Options:
|
15
|
+
[-p <list of P-values>] - comma separated(no spaces allowed) list of P-values to precalculate thresholds
|
16
|
+
[-d <rough discretization>,<precise discretization>] - set discretization rates, comma delimited (no spaces allowed), order doesn't matter
|
17
|
+
[--silent] - hide current progress information during scan (printed to stderr by default)
|
18
|
+
[--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
|
19
|
+
[--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
|
20
|
+
[-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
|
21
|
+
|
22
|
+
The tool preprocesses and stores Macroape motif collection in the specified YAML-file.
|
23
|
+
|
24
|
+
Example:
|
25
|
+
#{run_tool_cmd} ./motifs collection.yaml -p 0.001,0.0005,0.0001 -d 1,10 -b 0.2,0.3,0.3,0.2
|
26
|
+
EOS
|
27
|
+
|
28
|
+
if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
|
29
|
+
STDERR.puts doc
|
31
30
|
exit
|
32
31
|
end
|
33
32
|
|
34
33
|
data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
|
35
|
-
|
34
|
+
|
36
35
|
default_pvalues = [0.0005]
|
37
36
|
background = [1,1,1,1]
|
38
37
|
rough_discretization = 1
|
39
38
|
precise_discretization = 10
|
40
|
-
|
41
|
-
|
42
|
-
|
39
|
+
max_hash_size = 10000000
|
40
|
+
|
43
41
|
data_source = argv.shift
|
44
|
-
|
45
|
-
|
42
|
+
output_file = argv.shift
|
43
|
+
|
44
|
+
raise 'No input. You should specify file or folder with pwms' unless data_source
|
46
45
|
raise "Error! File or folder #{data_source} doesn't exist" unless Dir.exist?(data_source) || File.exist?(data_source) || data_source == '.stdin'
|
46
|
+
raise 'You should specify output file' unless output_file
|
47
47
|
|
48
48
|
pvalues = []
|
49
49
|
silent = false
|
50
|
-
|
50
|
+
pvalue_boundary = :upper
|
51
|
+
|
51
52
|
until argv.empty?
|
52
53
|
case argv.shift
|
53
54
|
when '-b'
|
54
|
-
background = argv.shift(
|
55
|
+
background = argv.shift.split(',').map(&:to_f)
|
55
56
|
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless background == background.reverse
|
56
57
|
when '-p'
|
57
|
-
|
58
|
-
begin
|
59
|
-
Float(argv.first)
|
60
|
-
pvalues << argv.shift.to_f
|
61
|
-
rescue
|
62
|
-
raise StopIteration
|
63
|
-
end
|
64
|
-
end
|
58
|
+
pvalues = argv.shift.split(',').map(&:to_f)
|
65
59
|
when '-d'
|
66
|
-
rough_discretization, precise_discretization = argv.shift(
|
67
|
-
when '-
|
68
|
-
output_file = argv.shift
|
69
|
-
output_file_specified = true
|
70
|
-
when '-m'
|
60
|
+
rough_discretization, precise_discretization = argv.shift.split(',').map(&:to_f).sort
|
61
|
+
when '--max-hash-size'
|
71
62
|
max_hash_size = argv.shift.to_i
|
72
|
-
when '-n'
|
73
|
-
collection_name = argv.shift
|
74
63
|
when '--silent'
|
75
64
|
silent = true
|
65
|
+
when '--boundary'
|
66
|
+
pvalue_boundary = argv.shift.to_sym
|
67
|
+
raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
|
76
68
|
end
|
77
69
|
end
|
78
70
|
pvalues = default_pvalues if pvalues.empty?
|
79
71
|
|
80
|
-
collection = Bioinform::Collection.new(rough_discretization: rough_discretization,
|
72
|
+
collection = Bioinform::Collection.new(rough_discretization: rough_discretization,
|
81
73
|
precise_discretization: precise_discretization,
|
82
74
|
background: background,
|
83
75
|
pvalues: pvalues)
|
84
|
-
|
85
|
-
|
86
|
-
output_file = "#{collection_name}.yaml" if !output_file_specified
|
87
|
-
end
|
88
|
-
|
76
|
+
|
77
|
+
data_source = data_source.gsub("\\",'/')
|
89
78
|
if File.directory?(data_source)
|
90
79
|
motifs = Dir.glob(File.join(data_source,'*')).sort.map do |filename|
|
91
80
|
pwm = data_model.new(File.read(filename))
|
@@ -106,12 +95,12 @@ module Macroape
|
|
106
95
|
else
|
107
96
|
raise "Specified data source `#{data_source}` is neither directory nor file nor even .stdin"
|
108
97
|
end
|
109
|
-
|
98
|
+
|
110
99
|
pwms = motifs.map(&:to_pwm)
|
111
|
-
|
100
|
+
|
112
101
|
pwms.each_with_index do |pwm,index|
|
113
|
-
STDERR.puts "#{index
|
114
|
-
|
102
|
+
STDERR.puts "Motif #{pwm.name}, length: #{pwm.length} (#{index+1} of #{pwms.size}, #{index*100/pwms.size}% complete)" unless silent
|
103
|
+
|
115
104
|
# When support of onefile collections is introduced - then here should be check if name exists.
|
116
105
|
# Otherwise it should skip motif and tell you about this
|
117
106
|
# Also two command line options to fail on skipping or to skip silently should be included
|
@@ -120,7 +109,8 @@ module Macroape
|
|
120
109
|
pwm.set_parameters(background: background, max_hash_size: max_hash_size)
|
121
110
|
skip_motif = false
|
122
111
|
|
123
|
-
|
112
|
+
|
113
|
+
fill_rough_infos = ->(pvalue, threshold, real_pvalue) do
|
124
114
|
if real_pvalue == 0
|
125
115
|
$stderr.puts "#{pwm.name} at pvalue #{pvalue} has threshold that yields real-pvalue 0 in rough mode. Rough calculation will be skipped"
|
126
116
|
else
|
@@ -128,7 +118,7 @@ module Macroape
|
|
128
118
|
end
|
129
119
|
end
|
130
120
|
|
131
|
-
|
121
|
+
fill_precise_infos = ->(pvalue, threshold, real_pvalue) do
|
132
122
|
if real_pvalue == 0
|
133
123
|
$stderr.puts "#{pwm.name} at pvalue #{pvalue} has threshold that yields real-pvalue 0 in precise mode. Motif will be excluded from collection"
|
134
124
|
skip_motif = true
|
@@ -136,13 +126,33 @@ module Macroape
|
|
136
126
|
info.precise[pvalue] = threshold / precise_discretization
|
137
127
|
end
|
138
128
|
end
|
129
|
+
|
130
|
+
if pvalue_boundary == :lower
|
131
|
+
pwm.discrete(rough_discretization).thresholds(*pvalues, &fill_rough_infos)
|
132
|
+
else
|
133
|
+
pwm.discrete(rough_discretization).weak_thresholds(*pvalues, &fill_rough_infos)
|
134
|
+
end
|
135
|
+
|
136
|
+
if pvalue_boundary == :lower
|
137
|
+
pwm.discrete(precise_discretization).thresholds(*pvalues, &fill_precise_infos)
|
138
|
+
else
|
139
|
+
pwm.discrete(precise_discretization).weak_thresholds(*pvalues,&fill_precise_infos)
|
140
|
+
end
|
139
141
|
collection.add_pm(pwm, info) unless skip_motif
|
140
142
|
end
|
143
|
+
STDERR.puts "100% complete. Saving results" unless silent
|
141
144
|
File.open(output_file, 'w') do |f|
|
142
145
|
f.puts(collection.to_yaml)
|
143
146
|
end
|
147
|
+
puts OutputInformation.new{|infos|
|
148
|
+
infos.add_parameter('P', 'P-value list', pvalues.join(','))
|
149
|
+
infos.add_parameter('VR', 'discretization value, rough', rough_discretization)
|
150
|
+
infos.add_parameter('VP', 'discretization value, precise', precise_discretization)
|
151
|
+
infos.add_parameter('PB', 'P-value boundary', pvalue_boundary)
|
152
|
+
infos.background_parameter('B', 'background', background)
|
153
|
+
}.result
|
144
154
|
rescue => err
|
145
|
-
STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse
|
155
|
+
STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
|
146
156
|
end
|
147
157
|
|
148
158
|
end
|