macroape 3.3.4 → 3.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +6 -2
- data/TODO.txt +1 -4
- data/lib/macroape/cli/eval_alignment.rb +8 -5
- data/lib/macroape/cli/eval_similarity.rb +7 -5
- data/lib/macroape/cli/find_pvalue.rb +4 -2
- data/lib/macroape/cli/find_threshold.rb +4 -3
- data/lib/macroape/cli/preprocess_collection.rb +1 -1
- data/lib/macroape/cli/scan_collection.rb +4 -2
- data/lib/macroape/version.rb +1 -1
- data/test/eval_alignment_test.rb +4 -0
- data/test/eval_similarity_test.rb +3 -0
- data/test/find_pvalue_test.rb +3 -0
- data/test/find_threshold_test.rb +5 -0
- data/test/scan_collection_test.rb +4 -0
- metadata +2 -2
data/README.md
CHANGED
@@ -20,7 +20,7 @@ Or install it yourself as:
|
|
20
20
|
For more information, read the manual at https://docs.google.com/document/pub?id=1_jsxhMNzMzy4d2d_byAd3n6Szg5gEcqG_Sf7w9tEqWw (not the latest version, but a comprehensive description of the approach)
|
21
21
|
|
22
22
|
## Basic usage as a command-line tool
|
23
|
-
MacroAPE have
|
23
|
+
MacroAPE has 7 command-line tools:
|
24
24
|
|
25
25
|
### Tools for calculating thresholds and pvalues:
|
26
26
|
* find_threshold \<PWM file\> [-p \<pvalue\> (by default: 0.0005)]
|
@@ -34,8 +34,12 @@ Or install it yourself as:
|
|
34
34
|
* preprocess_collection \<folder with motif files\> [-o \<collection output file\>]
|
35
35
|
* scan_collection \<query PWM file\> \<collection file\>
|
36
36
|
|
37
|
+
### Tool for finding the mutual alignment of several motifs relative to the first (leader) motif. It is designed to be used with sequence_logo to draw logos of clusters
|
38
|
+
* align_motifs \<pwm_leader\> \<pwm_2\> \<pwm_3\> ...
|
39
|
+
|
37
40
|
Also you can use -h option to print help for a tool in console.
|
38
41
|
There are lots of different command-line options. The most useful option is -d <discretization=1|10|100|1000>. You can vary the precision/speed trade-off by specifying a discretization. For more information, look through the manual.
|
42
|
+
Some of the tools can also process PCMs in addition to PWMs.
|
39
43
|
|
40
44
|
## Basic usage in your code
|
41
45
|
require 'macroape'
|
@@ -51,7 +55,7 @@ Or install it yourself as:
|
|
51
55
|
similarity_info = cmp.jaccard(first_threshold, second_threshold)
|
52
56
|
puts "Jaccard similarity: #{similarity_info[:similarity]}"
|
53
57
|
|
54
|
-
For more details look a source code of utilities in lib/
|
58
|
+
For more details, look at the source code of the utilities in the lib/macroape/cli/ folder
|
55
59
|
|
56
60
|
## Contributing
|
57
61
|
|
data/TODO.txt
CHANGED
@@ -1,6 +1,3 @@
|
|
1
|
-
Absolutely necessary:
|
2
|
-
(already work in preprocess_collection and align_motifs) Make it available to load PCM files with (it should be first preprocessed to PWMs in a standardized way) -- may be it's better to use pipeline
|
3
|
-
|
4
1
|
Specs and tests:
|
5
2
|
create spec on use of MaxHashSize, MaxHashSizeDouble
|
6
3
|
create spec for testing case when {real_pvalue == 0, threshold == best_score + 1}
|
@@ -17,5 +14,5 @@ Usability issues:
|
|
17
14
|
|
18
15
|
remove .stdin placeholder. Use tty? method instead
|
19
16
|
|
20
|
-
use OptionParser
|
17
|
+
use OptionParser or docopt
|
21
18
|
make options more uniform so that some of them were reusable(and the question: can I apply two option parsers consequently?)
|
@@ -44,7 +44,8 @@ module Macroape
|
|
44
44
|
second_background = [1,1,1,1]
|
45
45
|
max_hash_size = 1000000
|
46
46
|
max_pair_hash_size = 1000
|
47
|
-
|
47
|
+
|
48
|
+
data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
|
48
49
|
|
49
50
|
first_file = argv.shift
|
50
51
|
second_file = argv.shift
|
@@ -93,18 +94,20 @@ module Macroape
|
|
93
94
|
parser = Bioinform::StringParser.new($stdin.read) if first_file == '.stdin' || second_file == '.stdin'
|
94
95
|
|
95
96
|
if first_file == '.stdin'
|
96
|
-
|
97
|
+
input_first = parser.parse
|
97
98
|
else
|
98
99
|
raise "Error! File #{first_file} don't exist" unless File.exist?(first_file)
|
99
|
-
|
100
|
+
input_first = File.read(first_file)
|
100
101
|
end
|
102
|
+
pwm_first = data_model.new(input_first).to_pwm
|
101
103
|
|
102
104
|
if second_file == '.stdin'
|
103
|
-
|
105
|
+
input_second = parser.parse
|
104
106
|
else
|
105
107
|
raise "Error! File #{second_file} don't exist" unless File.exist?(second_file)
|
106
|
-
|
108
|
+
input_second = File.read(second_file)
|
107
109
|
end
|
110
|
+
pwm_second = data_model.new(input_second).to_pwm
|
108
111
|
|
109
112
|
pwm_first.background!(first_background).max_hash_size!(max_hash_size).discrete!(discretization)
|
110
113
|
pwm_second.background!(second_background).max_hash_size!(max_hash_size).discrete!(discretization)
|
@@ -47,7 +47,7 @@ module Macroape
|
|
47
47
|
max_hash_size = 1000000
|
48
48
|
max_pair_hash_size = 1000
|
49
49
|
|
50
|
-
|
50
|
+
data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
|
51
51
|
first_file = argv.shift
|
52
52
|
second_file = argv.shift
|
53
53
|
raise "You'd specify two input sources (each is filename or .stdin)" unless first_file and second_file
|
@@ -76,18 +76,20 @@ module Macroape
|
|
76
76
|
parser = Bioinform::StringParser.new($stdin.read) if first_file == '.stdin' || second_file == '.stdin'
|
77
77
|
|
78
78
|
if first_file == '.stdin'
|
79
|
-
|
79
|
+
input_first = parser.parse
|
80
80
|
else
|
81
81
|
raise "Error! File #{first_file} don't exist" unless File.exist?(first_file)
|
82
|
-
|
82
|
+
input_first = File.read(first_file)
|
83
83
|
end
|
84
|
+
pwm_first = data_model.new(input_first).to_pwm
|
84
85
|
|
85
86
|
if second_file == '.stdin'
|
86
|
-
|
87
|
+
input_second = parser.parse
|
87
88
|
else
|
88
89
|
raise "Error! File #{second_file} don't exist" unless File.exist?(second_file)
|
89
|
-
|
90
|
+
input_second = File.read(second_file)
|
90
91
|
end
|
92
|
+
pwm_second = data_model.new(input_second).to_pwm
|
91
93
|
|
92
94
|
pwm_first.background!(first_background).max_hash_size!(max_hash_size).discrete!(discretization)
|
93
95
|
pwm_second.background!(second_background).max_hash_size!(max_hash_size).discrete!(discretization)
|
@@ -41,6 +41,7 @@ module Macroape
|
|
41
41
|
thresholds = []
|
42
42
|
max_hash_size = 1000000
|
43
43
|
|
44
|
+
data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
|
44
45
|
filename = argv.shift
|
45
46
|
|
46
47
|
loop do
|
@@ -68,11 +69,12 @@ module Macroape
|
|
68
69
|
|
69
70
|
|
70
71
|
if filename == '.stdin'
|
71
|
-
|
72
|
+
input = $stdin.read
|
72
73
|
else
|
73
74
|
raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
|
74
|
-
|
75
|
+
input = File.read(filename)
|
75
76
|
end
|
77
|
+
pwm = data_model.new(input).to_pwm
|
76
78
|
pwm.background!(background).max_hash_size!(max_hash_size).discrete!(discretization)
|
77
79
|
|
78
80
|
counts = pwm.counts_by_thresholds(* thresholds.map{|count| count * discretization})
|
@@ -36,6 +36,7 @@ module Macroape
|
|
36
36
|
default_pvalues = [0.0005]
|
37
37
|
discretization = 10000
|
38
38
|
max_hash_size = 1000000
|
39
|
+
data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
|
39
40
|
|
40
41
|
filename = argv.shift
|
41
42
|
raise "No input. You'd specify input source: filename or .stdin" unless filename
|
@@ -63,12 +64,12 @@ module Macroape
|
|
63
64
|
pvalues = default_pvalues if pvalues.empty?
|
64
65
|
|
65
66
|
if filename == '.stdin'
|
66
|
-
|
67
|
+
input = $stdin.read
|
67
68
|
else
|
68
69
|
raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
|
69
|
-
|
70
|
+
input = File.read(filename)
|
70
71
|
end
|
71
|
-
|
72
|
+
pwm = data_model.new(input).to_pwm
|
72
73
|
pwm.background!(background).max_hash_size!(max_hash_size).discrete!(discretization)
|
73
74
|
|
74
75
|
pwm.thresholds(*pvalues) do |pvalue, threshold, real_pvalue|
|
@@ -74,7 +74,7 @@ module Macroape
|
|
74
74
|
collection = Macroape::Collection.new(rough_discretization, precise_discretization, background, pvalues)
|
75
75
|
|
76
76
|
if File.directory?(data_source)
|
77
|
-
|
77
|
+
motifs = Dir.glob(File.join(data_source,'*')).map do |filename|
|
78
78
|
pwm = data_model.new(File.read(filename))
|
79
79
|
pwm.name ||= File.basename(filename, File.extname(filename))
|
80
80
|
pwm
|
@@ -36,6 +36,7 @@ module Macroape
|
|
36
36
|
exit
|
37
37
|
end
|
38
38
|
|
39
|
+
data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
|
39
40
|
filename = argv.shift
|
40
41
|
collection_file = argv.shift
|
41
42
|
raise "No input. You'd specify input source for pat: filename or .stdin" unless filename
|
@@ -82,12 +83,13 @@ module Macroape
|
|
82
83
|
raise "Thresholds for pvalue #{pvalue} aren't presented in collection (#{collection.pvalues.join(', ')}). Use one of listed pvalues or recalculate the collection with needed pvalue" unless collection.pvalues.include? pvalue
|
83
84
|
|
84
85
|
if filename == '.stdin'
|
85
|
-
|
86
|
+
query_input = $stdin.read
|
86
87
|
else
|
87
88
|
raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
|
88
|
-
|
89
|
+
query_input = File.read(filename)
|
89
90
|
end
|
90
91
|
|
92
|
+
query_pwm = data_model.new(query_input).to_pwm
|
91
93
|
query_pwm.background(background_query).max_hash_size(max_hash_size)
|
92
94
|
|
93
95
|
query_pwm_rough = query_pwm.discrete(collection.rough_discretization)
|
data/lib/macroape/version.rb
CHANGED
data/test/eval_alignment_test.rb
CHANGED
@@ -1,6 +1,10 @@
|
|
1
1
|
require 'test_helper'
|
2
2
|
|
3
3
|
class TestEvalAlignment < Test::Unit::TestCase
|
4
|
+
def test_process_pcm_files
|
5
|
+
assert_equal "0.2420758234928527\n779.0\t11\n.>>>>>>>>>>\n>>>>>>>>>>>\n-1\tdirect\n", Helpers.eval_alignment_output('test/data/KLF4_f2.pcm test/data/SP1_f1.pcm -1 direct --pcm')
|
6
|
+
end
|
7
|
+
|
4
8
|
def test_process_at_optimal_alignment
|
5
9
|
assert_equal "0.2420758234928527\n779.0\t11\n.>>>>>>>>>>\n>>>>>>>>>>>\n-1\tdirect\n", Helpers.eval_alignment_output('test/data/KLF4_f2.pat test/data/SP1_f1.pat -1 direct')
|
6
10
|
end
|
@@ -1,6 +1,9 @@
|
|
1
1
|
require 'test_helper'
|
2
2
|
|
3
3
|
class TestEvalSimilarity < Test::Unit::TestCase
|
4
|
+
def test_process_pair_of_pcms
|
5
|
+
assert_equal "0.2420758234928527\n779.0\t11\n.>>>>>>>>>>\n>>>>>>>>>>>\n-1\tdirect\n", Helpers.eval_similarity_output('test/data/KLF4_f2.pcm test/data/SP1_f1.pcm --pcm')
|
6
|
+
end
|
4
7
|
def test_process_pair_of_pwms
|
5
8
|
assert_equal "0.2420758234928527\n779.0\t11\n.>>>>>>>>>>\n>>>>>>>>>>>\n-1\tdirect\n", Helpers.eval_similarity_output('test/data/KLF4_f2.pat test/data/SP1_f1.pat')
|
6
9
|
end
|
data/test/find_pvalue_test.rb
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
require 'test_helper'
|
2
2
|
|
3
3
|
class FindPvalueTest < Test::Unit::TestCase
|
4
|
+
def test_process_pcm
|
5
|
+
assert_equal "4.1719\t1048.0\t0.00099945068359375\n", Helpers.find_pvalue_output('test/data/KLF4_f2.pcm 4.1719 --pcm')
|
6
|
+
end
|
4
7
|
def test_process_one_threshold
|
5
8
|
assert_equal "4.1719\t1048.0\t0.00099945068359375\n", Helpers.find_pvalue_output('test/data/KLF4_f2.pat 4.1719')
|
6
9
|
end
|
data/test/find_threshold_test.rb
CHANGED
@@ -10,6 +10,11 @@ class FindThresholdTest < Test::Unit::TestCase
|
|
10
10
|
}
|
11
11
|
assert_equal pvalues, ['0.0005', '0.001']
|
12
12
|
end
|
13
|
+
def test_process_pcm
|
14
|
+
pvalue, threshold, real_pvalue = Helpers.find_threshold_output('test/data/KLF4_f2.pcm -p 0.001 --pcm').strip.split("\t")
|
15
|
+
assert_equal '0.001', pvalue
|
16
|
+
assert_equal Helpers.obtain_pvalue_by_threshold("test/data/KLF4_f2.pat #{threshold}"), real_pvalue
|
17
|
+
end
|
13
18
|
def test_process_one_pvalue
|
14
19
|
pvalue, threshold, real_pvalue = Helpers.find_threshold_output('test/data/KLF4_f2.pat -p 0.001').strip.split("\t")
|
15
20
|
assert_equal '0.001', pvalue
|
@@ -1,6 +1,10 @@
|
|
1
1
|
require 'test_helper'
|
2
2
|
|
3
3
|
class TestScanCollection < Test::Unit::TestCase
|
4
|
+
def test_scan_pcm
|
5
|
+
assert_equal File.read('test/data/KLF4_f2_scan_results_default_cutoff.txt').gsub("\r\n", "\n"),
|
6
|
+
Helpers.scan_collection_output('test/data/KLF4_f2.pcm test/data/test_collection.yaml --silent --pcm').gsub("\r\n","\n")
|
7
|
+
end
|
4
8
|
def test_scan_default_cutoff
|
5
9
|
assert_equal File.read('test/data/KLF4_f2_scan_results_default_cutoff.txt').gsub("\r\n", "\n"),
|
6
10
|
Helpers.scan_collection_output('test/data/KLF4_f2.pat test/data/test_collection.yaml --silent').gsub("\r\n","\n")
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: macroape
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.3.4
|
4
|
+
version: 3.3.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-09-
|
12
|
+
date: 2012-09-04 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bioinform
|