macroape 3.3.6 → 3.3.7
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +18 -18
- data/Gemfile +4 -4
- data/LICENSE +21 -21
- data/README.md +69 -67
- data/TODO.txt +1 -1
- data/benchmark/benchmark_helper.rb +5 -0
- data/benchmark/similarity_benchmark.rb +1 -4
- data/bin/align_motifs +1 -1
- data/bin/eval_alignment +1 -1
- data/bin/eval_similarity +1 -1
- data/bin/find_pvalue +1 -1
- data/bin/find_threshold +1 -1
- data/bin/preprocess_collection +1 -1
- data/bin/scan_collection +1 -1
- data/lib/macroape/aligned_pair_intersection.rb +1 -1
- data/lib/macroape/cli/align_motifs.rb +4 -4
- data/lib/macroape/cli/eval_alignment.rb +9 -6
- data/lib/macroape/cli/eval_similarity.rb +9 -6
- data/lib/macroape/cli/find_pvalue.rb +3 -3
- data/lib/macroape/cli/find_threshold.rb +3 -3
- data/lib/macroape/cli/preprocess_collection.rb +37 -13
- data/lib/macroape/cli/scan_collection.rb +28 -21
- data/lib/macroape/counting.rb +7 -13
- data/lib/macroape/pwm_compare.rb +8 -15
- data/lib/macroape/pwm_compare_aligned.rb +11 -38
- data/lib/macroape/version.rb +1 -1
- data/lib/macroape.rb +6 -7
- data/macroape.gemspec +1 -1
- data/spec/count_distribution_spec.rb +2 -3
- data/spec/spec_helper.rb +2 -2
- data/test/align_motifs_test.rb +1 -1
- data/test/data/KLF4_f2_scan_results_all.txt +4 -4
- data/test/data/KLF4_f2_scan_results_precise_mode.txt +4 -4
- data/test/data/collection_pcm_without_thresholds.yaml +185 -0
- data/test/data/collection_without_thresholds.yaml +185 -0
- data/test/data/medium_motif.pat +8 -0
- data/test/data/short_motif.pat +7 -0
- data/test/data/test_collection.yaml +27 -18
- data/test/eval_alignment_test.rb +1 -1
- data/test/eval_similarity_test.rb +1 -1
- data/test/find_pvalue_test.rb +1 -1
- data/test/find_threshold_test.rb +1 -1
- data/test/preprocess_collection_test.rb +59 -37
- data/test/scan_collection_test.rb +10 -1
- data/test/test_helper.rb +13 -9
- metadata +15 -6
data/.gitignore
CHANGED
@@ -1,18 +1,18 @@
|
|
1
|
-
*.gem
|
2
|
-
*.rbc
|
3
|
-
.bundle
|
4
|
-
.config
|
5
|
-
.yardoc
|
6
|
-
Gemfile.lock
|
7
|
-
InstalledFiles
|
8
|
-
_yardoc
|
9
|
-
coverage
|
10
|
-
doc/
|
11
|
-
lib/bundler/man
|
12
|
-
pkg
|
13
|
-
rdoc
|
14
|
-
spec/reports
|
15
|
-
test/tmp
|
16
|
-
test/version_tmp
|
17
|
-
tmp
|
18
|
-
benchmark
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
.bundle
|
4
|
+
.config
|
5
|
+
.yardoc
|
6
|
+
Gemfile.lock
|
7
|
+
InstalledFiles
|
8
|
+
_yardoc
|
9
|
+
coverage
|
10
|
+
doc/
|
11
|
+
lib/bundler/man
|
12
|
+
pkg
|
13
|
+
rdoc
|
14
|
+
spec/reports
|
15
|
+
test/tmp
|
16
|
+
test/version_tmp
|
17
|
+
tmp
|
18
|
+
benchmark/*.log
|
data/Gemfile
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
source 'https://rubygems.org'
|
2
|
-
|
3
|
-
# Specify your gem's dependencies in macroape.gemspec
|
4
|
-
gemspec
|
1
|
+
source 'https://rubygems.org'
|
2
|
+
|
3
|
+
# Specify your gem's dependencies in macroape.gemspec
|
4
|
+
gemspec
|
data/LICENSE
CHANGED
@@ -1,22 +1,22 @@
|
|
1
|
-
Copyright (c) 2011-2012 Ilya Vorontsov, Ivan Kulakovskiy, Vsevolod Makeev
|
2
|
-
|
3
|
-
MIT License
|
4
|
-
|
5
|
-
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
-
a copy of this software and associated documentation files (the
|
7
|
-
"Software"), to deal in the Software without restriction, including
|
8
|
-
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
-
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
-
permit persons to whom the Software is furnished to do so, subject to
|
11
|
-
the following conditions:
|
12
|
-
|
13
|
-
The above copyright notice and this permission notice shall be
|
14
|
-
included in all copies or substantial portions of the Software.
|
15
|
-
|
16
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
-
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
-
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
-
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
-
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
-
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
1
|
+
Copyright (c) 2011-2012 Ilya Vorontsov, Ivan Kulakovskiy, Vsevolod Makeev
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
22
|
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
CHANGED
@@ -1,68 +1,70 @@
|
|
1
|
-
# Macroape
|
2
|
-
|
3
|
-
Macroape is abbreviation for MAtrix CompaRisOn by Approximate P-value Estimation. It's a bioinformatic tool for evaluating similarity measure between a pair of Position Weight Matrices. Used approach and application described in manual at https://docs.google.com/document/pub?id=1_jsxhMNzMzy4d2d_byAd3n6Szg5gEcqG_Sf7w9tEqWw
|
4
|
-
|
5
|
-
## Installation
|
6
|
-
|
7
|
-
Add this line to your application's Gemfile:
|
8
|
-
|
9
|
-
gem 'macroape'
|
10
|
-
|
11
|
-
And then execute:
|
12
|
-
|
13
|
-
$ bundle
|
14
|
-
|
15
|
-
Or install it yourself as:
|
16
|
-
|
17
|
-
$ gem install macroape
|
18
|
-
|
19
|
-
## Usage
|
20
|
-
For more information read manual at https://docs.google.com/document/pub?id=1_jsxhMNzMzy4d2d_byAd3n6Szg5gEcqG_Sf7w9tEqWw (not last version but comprehensive description of approach)
|
21
|
-
|
22
|
-
## Basic usage as a command-line tool
|
23
|
-
MacroAPE have 7 command line tools:
|
24
|
-
|
25
|
-
### Tools for calculating thresholds and pvalues:
|
26
|
-
* find_threshold \<PWM file\> [-p \<pvalue\> (by default: 0.0005)]
|
27
|
-
* find_pvalue \<PWM file\> \<threshold\>
|
28
|
-
|
29
|
-
### Tools for evaluating Jaccard similarity measure in the best alignment and in certain alignment:
|
30
|
-
* eval_similarity \<first PWM file\> \<second PWM file\>
|
31
|
-
* eval_alignment \<first PWM file\> \<second PWM file\> \<shift of second matrix\> \<orientation of second matrix(direct|revcomp)\>
|
32
|
-
|
33
|
-
### Tools for looking through collection for the motifs most similar to a query motif
|
34
|
-
* preprocess_collection \<folder with motif files\> [-o \<collection output file\>]
|
35
|
-
* scan_collection \<query PWM file\> \<collection file\>
|
36
|
-
|
37
|
-
### Tool for finding mutual alignment of several motifs relative to first(leader) motif. It's designed to use with sequence_logo to draw logos of clusters
|
38
|
-
* align_motifs \<pwm_leader\> \<pwm_2\> \<pwm_3\> ...
|
39
|
-
|
40
|
-
Also you can use -h option to print help for a tool in console.
|
41
|
-
There are lots of different command line options. Most useful option is -d <discretization=1|10|100|1000>. You can vary precision/speed rate by specifing a discretization. For more information look through a manual.
|
42
|
-
Some of tools also can process PCMs in addition to PWMs.
|
43
|
-
|
44
|
-
## Basic usage in your code
|
45
|
-
require 'macroape'
|
46
|
-
background = [1,1,1,1]
|
47
|
-
discretization = 10
|
48
|
-
pwm_first = Bioinform::PWM.new(File.read('first_pwm.pat')).background(background).discrete(discretization)
|
49
|
-
pwm_second = Bioinform::PWM.new(File.read('first_pwm.pat')).background(background).discrete(discretization)
|
50
|
-
|
51
|
-
cmp = Macroape::PWMCompare.new(pwm_first, pwm_second)
|
52
|
-
first_threshold = pwm_first.threshold(pvalue)
|
53
|
-
second_threshold = pwm_second.threshold(pvalue)
|
54
|
-
|
55
|
-
similarity_info = cmp.jaccard(first_threshold, second_threshold)
|
56
|
-
puts "Jaccard similarity: #{similarity_info[:similarity]}"
|
57
|
-
|
58
|
-
For more details look a source code of utilities in lib/macroape/cli/ folder
|
59
|
-
|
60
|
-
## Contributing
|
61
|
-
|
62
|
-
1. Fork it
|
63
|
-
2. Create your feature branch (`git checkout -b my-new-feature`)
|
64
|
-
3. Commit your changes (`git commit -am 'Added some feature'`)
|
65
|
-
4. Push to the branch (`git push origin my-new-feature`)
|
66
|
-
5. Create new Pull Request
|
67
|
-
|
1
|
+
# Macroape
|
2
|
+
|
3
|
+
Macroape is an abbreviation for MAtrix CompaRisOn by Approximate P-value Estimation. It's a bioinformatics tool for evaluating a similarity measure between a pair of Position Weight Matrices. The approach and its applications are described in the manual at https://docs.google.com/document/pub?id=1_jsxhMNzMzy4d2d_byAd3n6Szg5gEcqG_Sf7w9tEqWw
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'macroape'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install macroape
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
For more information, read the manual at https://docs.google.com/document/pub?id=1_jsxhMNzMzy4d2d_byAd3n6Szg5gEcqG_Sf7w9tEqWw (not the latest version, but a comprehensive description of the approach)
|
21
|
+
|
22
|
+
## Basic usage as a command-line tool
|
23
|
+
MacroAPE has 7 command line tools:
|
24
|
+
|
25
|
+
### Tools for calculating thresholds and pvalues:
|
26
|
+
* find_threshold \<PWM file\> [-p \<pvalue\> (by default: 0.0005)]
|
27
|
+
* find_pvalue \<PWM file\> \<threshold\>
|
28
|
+
|
29
|
+
### Tools for evaluating Jaccard similarity measure in the best alignment and in certain alignment:
|
30
|
+
* eval_similarity \<first PWM file\> \<second PWM file\>
|
31
|
+
* eval_alignment \<first PWM file\> \<second PWM file\> \<shift of second matrix\> \<orientation of second matrix(direct|revcomp)\>
|
32
|
+
|
33
|
+
### Tools for looking through collection for the motifs most similar to a query motif
|
34
|
+
* preprocess_collection \<folder with motif files\> [-o \<collection output file\>]
|
35
|
+
* scan_collection \<query PWM file\> \<collection file\>
|
36
|
+
|
37
|
+
### Tool for finding mutual alignment of several motifs relative to first(leader) motif. It's designed to use with sequence_logo to draw logos of clusters
|
38
|
+
* align_motifs \<pwm_leader\> \<pwm_2\> \<pwm_3\> ...
|
39
|
+
|
40
|
+
Also you can use -h option to print help for a tool in console.
|
41
|
+
There are lots of different command line options. The most useful option is -d <discretization=1|10|100|1000>. You can vary the precision/speed trade-off by specifying a discretization. For more information, look through the manual.
|
42
|
+
Some of tools also can process PCMs in addition to PWMs.
|
43
|
+
|
44
|
+
## Basic usage in your code
|
45
|
+
require 'macroape'
|
46
|
+
background = [1,1,1,1]
|
47
|
+
discretization = 10
|
48
|
+
pwm_first = Bioinform::PWM.new(File.read('first_pwm.pat')).background(background).discrete(discretization)
|
49
|
+
pwm_second = Bioinform::PWM.new(File.read('second_pwm.pat')).background(background).discrete(discretization)
|
50
|
+
|
51
|
+
cmp = Macroape::PWMCompare.new(pwm_first, pwm_second)
|
52
|
+
first_threshold = pwm_first.threshold(pvalue)
|
53
|
+
second_threshold = pwm_second.threshold(pvalue)
|
54
|
+
|
55
|
+
similarity_info = cmp.jaccard(first_threshold, second_threshold)
|
56
|
+
puts "Jaccard similarity: #{similarity_info[:similarity]}"
|
57
|
+
|
58
|
+
For more details, look at the source code of the utilities in the lib/macroape/cli/ folder
|
59
|
+
|
60
|
+
## Contributing
|
61
|
+
|
62
|
+
1. Fork it
|
63
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
64
|
+
3. Commit your changes (`git commit -am 'Added some feature'`)
|
65
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
66
|
+
5. Create new Pull Request
|
67
|
+
|
68
|
+
If you're developing both macroape and bioinform, it may be useful to know that test_helper and spec_helper expand the require path in such a way that, if you have two "cousin" folders, macroape and bioinform, then the macroape specs will require bioinform from the development folder rather than from the installed gem. This can save you a lot of time, because you don't have to rebuild and reinstall the bioinform gem each time it changes.
|
69
|
+
|
68
70
|
Copyright (c) 2011-2012 Ilya Vorontsov, Ivan Kulakovskiy, Vsevolod Makeev
|
data/TODO.txt
CHANGED
@@ -10,7 +10,7 @@ Ideas to increase perfomance:
|
|
10
10
|
- Possibly algorithm shouldn't use hash but had two iterations: at first it determines possible hash scores for every length(if worst suffix is always zero, its flat space of scores at all pwm prefix lengths) of each pwm separately. And after that we can work with arrays which use such scores as indices via additional substructure
|
11
11
|
|
12
12
|
Usability issues:
|
13
|
-
|
13
|
+
Make preprocess_collection able to add information to an existing collection of motifs. Make it possible to give the collection a name from the command line
|
14
14
|
|
15
15
|
remove .stdin placeholder. Use tty? method instead
|
16
16
|
|
data/bin/align_motifs
CHANGED
data/bin/eval_alignment
CHANGED
data/bin/eval_similarity
CHANGED
data/bin/find_pvalue
CHANGED
data/bin/find_threshold
CHANGED
data/bin/preprocess_collection
CHANGED
data/bin/scan_collection
CHANGED
@@ -30,7 +30,7 @@ module Macroape
|
|
30
30
|
threshold_first - first.best_suffix(column + 1),
|
31
31
|
threshold_second - second.best_suffix(column + 1), &count_contribution_block)
|
32
32
|
scores.replace(new_scores)
|
33
|
-
if
|
33
|
+
if max_pair_hash_size && scores.inject(0){|sum,hsh|sum + hsh.size} > max_pair_hash_size
|
34
34
|
raise 'Hash overflow in Macroape::AlignedPairIntersection#counts_for_two_matrices_with_different_probabilities'
|
35
35
|
end
|
36
36
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
require_relative '../../macroape'
|
2
2
|
|
3
3
|
module Macroape
|
4
4
|
module CLI
|
@@ -15,7 +15,7 @@ module Macroape
|
|
15
15
|
pwm_3_file shift_3 orientation_3
|
16
16
|
}
|
17
17
|
|
18
|
-
if
|
18
|
+
if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
|
19
19
|
STDERR.puts help_string
|
20
20
|
exit
|
21
21
|
end
|
@@ -28,9 +28,9 @@ module Macroape
|
|
28
28
|
pvalue = 0.0005
|
29
29
|
|
30
30
|
shifts = {leader => [0,:direct]}
|
31
|
-
pwm_first = data_model.new(File.read(leader)).to_pwm.
|
31
|
+
pwm_first = data_model.new(File.read(leader)).to_pwm.set_parameters(background: background).discrete!(discretization)
|
32
32
|
argv.each do |motif_name|
|
33
|
-
pwm_second = data_model.new(File.read(motif_name)).to_pwm.
|
33
|
+
pwm_second = data_model.new(File.read(motif_name)).to_pwm.set_parameters(background: background).discrete!(discretization)
|
34
34
|
cmp = Macroape::PWMCompare.new(pwm_first, pwm_second)
|
35
35
|
info = cmp.jaccard_by_pvalue(pvalue)
|
36
36
|
shifts[motif_name] = [info[:shift], info[:orientation]]
|
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
require_relative '../../macroape'
|
2
2
|
|
3
3
|
module Macroape
|
4
4
|
module CLI
|
@@ -32,7 +32,7 @@ module Macroape
|
|
32
32
|
cat motifs/KLF4.pat motifs/SP1.pat | ruby eval_alignment.rb .stdin .stdin 3 direct -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
|
33
33
|
}
|
34
34
|
|
35
|
-
if
|
35
|
+
if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
|
36
36
|
STDERR.puts help_string
|
37
37
|
exit
|
38
38
|
end
|
@@ -91,7 +91,10 @@ module Macroape
|
|
91
91
|
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background == first_background.reverse
|
92
92
|
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless second_background == second_background.reverse
|
93
93
|
|
94
|
-
|
94
|
+
if first_file == '.stdin' || second_file == '.stdin'
|
95
|
+
input = $stdin.read
|
96
|
+
parser = data_model.choose_parser(input).new(input)
|
97
|
+
end
|
95
98
|
|
96
99
|
if first_file == '.stdin'
|
97
100
|
input_first = parser.parse
|
@@ -109,10 +112,10 @@ module Macroape
|
|
109
112
|
end
|
110
113
|
pwm_second = data_model.new(input_second).to_pwm
|
111
114
|
|
112
|
-
pwm_first.background
|
113
|
-
pwm_second.background
|
115
|
+
pwm_first.set_parameters(background: first_background, max_hash_size: max_hash_size).discrete!(discretization)
|
116
|
+
pwm_second.set_parameters(background: second_background, max_hash_size: max_hash_size).discrete!(discretization)
|
114
117
|
|
115
|
-
cmp = Macroape::PWMCompareAligned.new(pwm_first, pwm_second, shift, orientation).
|
118
|
+
cmp = Macroape::PWMCompareAligned.new(pwm_first, pwm_second, shift, orientation).set_parameters(max_pair_hash_size: max_pair_hash_size)
|
116
119
|
|
117
120
|
info = cmp.alignment_infos.merge( cmp.jaccard_by_pvalue(pvalue) )
|
118
121
|
|
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
require_relative '../../macroape'
|
2
2
|
|
3
3
|
module Macroape
|
4
4
|
module CLI
|
@@ -33,7 +33,7 @@ module Macroape
|
|
33
33
|
cat motifs/KLF4.pat motifs/SP1.pat | ruby eval_similarity.rb .stdin .stdin -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
|
34
34
|
}
|
35
35
|
|
36
|
-
if
|
36
|
+
if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
|
37
37
|
STDERR.puts help_string
|
38
38
|
exit
|
39
39
|
end
|
@@ -73,7 +73,10 @@ module Macroape
|
|
73
73
|
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background == first_background.reverse
|
74
74
|
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless second_background == second_background.reverse
|
75
75
|
|
76
|
-
|
76
|
+
if first_file == '.stdin' || second_file == '.stdin'
|
77
|
+
input = $stdin.read
|
78
|
+
parser = data_model.choose_parser(input).new(input)
|
79
|
+
end
|
77
80
|
|
78
81
|
if first_file == '.stdin'
|
79
82
|
input_first = parser.parse
|
@@ -91,10 +94,10 @@ module Macroape
|
|
91
94
|
end
|
92
95
|
pwm_second = data_model.new(input_second).to_pwm
|
93
96
|
|
94
|
-
pwm_first.background
|
95
|
-
pwm_second.background
|
97
|
+
pwm_first.set_parameters(background: first_background, max_hash_size: max_hash_size).discrete!(discretization)
|
98
|
+
pwm_second.set_parameters(background: second_background, max_hash_size: max_hash_size).discrete!(discretization)
|
96
99
|
|
97
|
-
cmp = Macroape::PWMCompare.new(pwm_first, pwm_second).
|
100
|
+
cmp = Macroape::PWMCompare.new(pwm_first, pwm_second).set_parameters(max_pair_hash_size: max_pair_hash_size)
|
98
101
|
|
99
102
|
info = cmp.jaccard_by_pvalue(pvalue)
|
100
103
|
|
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
require_relative '../../macroape'
|
2
2
|
|
3
3
|
module Macroape
|
4
4
|
module CLI
|
@@ -31,7 +31,7 @@ module Macroape
|
|
31
31
|
cat motifs/KLF4.pat | ruby find_pvalue.rb .stdin 7.32 4.31 5.42
|
32
32
|
}
|
33
33
|
|
34
|
-
if
|
34
|
+
if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
|
35
35
|
STDERR.puts help_string
|
36
36
|
exit
|
37
37
|
end
|
@@ -75,7 +75,7 @@ module Macroape
|
|
75
75
|
input = File.read(filename)
|
76
76
|
end
|
77
77
|
pwm = data_model.new(input).to_pwm
|
78
|
-
pwm.
|
78
|
+
pwm.set_parameters(background: background, max_hash_size: max_hash_size).discrete!(discretization)
|
79
79
|
|
80
80
|
counts = pwm.counts_by_thresholds(* thresholds.map{|count| count * discretization})
|
81
81
|
pvalues = counts.map{|count| count.to_f / pwm.vocabulary_volume}
|
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
require_relative '../../macroape'
|
2
2
|
|
3
3
|
module Macroape
|
4
4
|
module CLI
|
@@ -27,7 +27,7 @@ module Macroape
|
|
27
27
|
ruby find_threshold.rb motifs/KLF4.pat -p 0.001 0.0001 0.0005 -d 1000 -b 0.4 0.3 0.2 0.1
|
28
28
|
}
|
29
29
|
|
30
|
-
if
|
30
|
+
if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
|
31
31
|
STDERR.puts help_string
|
32
32
|
exit
|
33
33
|
end
|
@@ -70,7 +70,7 @@ module Macroape
|
|
70
70
|
input = File.read(filename)
|
71
71
|
end
|
72
72
|
pwm = data_model.new(input).to_pwm
|
73
|
-
pwm.
|
73
|
+
pwm.set_parameters(background: background, max_hash_size: max_hash_size).discrete!(discretization)
|
74
74
|
|
75
75
|
pwm.thresholds(*pvalues) do |pvalue, threshold, real_pvalue|
|
76
76
|
puts "#{pvalue}\t#{threshold / discretization}\t#{real_pvalue}"
|
@@ -1,5 +1,6 @@
|
|
1
|
-
|
1
|
+
require_relative '../../macroape'
|
2
2
|
require 'yaml'
|
3
|
+
require 'shellwords'
|
3
4
|
|
4
5
|
module Macroape
|
5
6
|
module CLI
|
@@ -8,13 +9,14 @@ module Macroape
|
|
8
9
|
def self.main(argv)
|
9
10
|
help_string = %q{
|
10
11
|
Command-line format:
|
11
|
-
ruby preprocess_collection.rb <file or folder with PWMs or .stdin with
|
12
|
+
ruby preprocess_collection.rb <file or folder with PWMs or .stdin with filenames> [options]
|
12
13
|
|
13
14
|
Options:
|
14
15
|
[-p <list of P-values>]
|
15
16
|
[-d <rough discretization> <precise discretization>]
|
16
17
|
[-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]
|
17
18
|
[-o <output file>]
|
19
|
+
[-n <name>] - specify name for a collection. Default filename is based on this parameter
|
18
20
|
[--silent] - don't show current progress information during scan (by default this information's written into stderr)
|
19
21
|
[--pcm] - treats your input motifs as PCM-s. Motifs are converted to PWMs internally so output is the same as for according PWMs
|
20
22
|
|
@@ -24,7 +26,7 @@ module Macroape
|
|
24
26
|
ruby preprocess_collection.rb ./motifs -p 0.001 0.0005 0.0001 -d 1 10 -b 0.2 0.3 0.2 0.3 -o collection.yaml
|
25
27
|
}
|
26
28
|
|
27
|
-
if
|
29
|
+
if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
|
28
30
|
STDERR.puts help_string
|
29
31
|
exit
|
30
32
|
end
|
@@ -35,7 +37,7 @@ module Macroape
|
|
35
37
|
background = [1,1,1,1]
|
36
38
|
rough_discretization = 1
|
37
39
|
precise_discretization = 10
|
38
|
-
output_file =
|
40
|
+
output_file = 'collection.yaml'
|
39
41
|
max_hash_size = 1000000
|
40
42
|
|
41
43
|
data_source = argv.shift
|
@@ -45,6 +47,7 @@ module Macroape
|
|
45
47
|
|
46
48
|
pvalues = []
|
47
49
|
silent = false
|
50
|
+
output_file_specified = false
|
48
51
|
until argv.empty?
|
49
52
|
case argv.shift
|
50
53
|
when '-b'
|
@@ -63,8 +66,11 @@ module Macroape
|
|
63
66
|
rough_discretization, precise_discretization = argv.shift(2).map(&:to_f).sort
|
64
67
|
when '-o'
|
65
68
|
output_file = argv.shift
|
69
|
+
output_file_specified = true
|
66
70
|
when '-m'
|
67
71
|
max_hash_size = argv.shift.to_i
|
72
|
+
when '-n'
|
73
|
+
collection_name = argv.shift
|
68
74
|
when '--silent'
|
69
75
|
silent = true
|
70
76
|
end
|
@@ -75,6 +81,10 @@ module Macroape
|
|
75
81
|
precise_discretization: precise_discretization,
|
76
82
|
background: background,
|
77
83
|
pvalues: pvalues)
|
84
|
+
if collection_name
|
85
|
+
collection.name = collection_name
|
86
|
+
output_file = "#{collection_name}.yaml" if !output_file_specified
|
87
|
+
end
|
78
88
|
|
79
89
|
if File.directory?(data_source)
|
80
90
|
motifs = Dir.glob(File.join(data_source,'*')).sort.map do |filename|
|
@@ -84,10 +94,15 @@ module Macroape
|
|
84
94
|
end
|
85
95
|
elsif File.file?(data_source)
|
86
96
|
input = File.read(data_source)
|
87
|
-
motifs = data_model.
|
97
|
+
motifs = data_model.split_on_motifs(input)
|
88
98
|
elsif data_source == '.stdin'
|
89
|
-
|
90
|
-
motifs =
|
99
|
+
filelist = $stdin.read.shellsplit
|
100
|
+
motifs = []
|
101
|
+
filelist.each do |filename|
|
102
|
+
motif = data_model.new(File.read(filename))
|
103
|
+
motif.name ||= File.basename(filename, File.extname(filename))
|
104
|
+
motifs << motif
|
105
|
+
end
|
91
106
|
else
|
92
107
|
raise "Specified data source `#{data_source}` is neither directory nor file nor even .stdin"
|
93
108
|
end
|
@@ -102,19 +117,28 @@ module Macroape
|
|
102
117
|
# Also two command line options to fail on skipping or to skip silently should be included
|
103
118
|
|
104
119
|
info = OpenStruct.new(rough: {}, precise: {})
|
105
|
-
pwm.
|
120
|
+
pwm.set_parameters(background: background, max_hash_size: max_hash_size)
|
121
|
+
skip_motif = false
|
106
122
|
|
107
123
|
pwm.discrete(rough_discretization).thresholds(*pvalues) do |pvalue, threshold, real_pvalue|
|
108
|
-
|
124
|
+
if real_pvalue == 0
|
125
|
+
$stderr.puts "#{pwm.name} at pvalue #{pvalue} has threshold that yields real-pvalue 0 in rough mode. Rough calculation will be skipped"
|
126
|
+
else
|
127
|
+
info.rough[pvalue] = threshold / rough_discretization
|
128
|
+
end
|
109
129
|
end
|
110
130
|
|
111
131
|
pwm.discrete(precise_discretization).thresholds(*pvalues) do |pvalue, threshold, real_pvalue|
|
112
|
-
|
132
|
+
if real_pvalue == 0
|
133
|
+
$stderr.puts "#{pwm.name} at pvalue #{pvalue} has threshold that yields real-pvalue 0 in precise mode. Motif will be excluded from collection"
|
134
|
+
skip_motif = true
|
135
|
+
else
|
136
|
+
info.precise[pvalue] = threshold / precise_discretization
|
137
|
+
end
|
113
138
|
end
|
114
|
-
|
115
|
-
collection.add_pm(pwm, info)
|
139
|
+
collection.add_pm(pwm, info) unless skip_motif
|
116
140
|
end
|
117
|
-
File.open(output_file,'w') do |f|
|
141
|
+
File.open(output_file, 'w') do |f|
|
118
142
|
f.puts(collection.to_yaml)
|
119
143
|
end
|
120
144
|
rescue => err
|