macroape 3.3.7 → 3.3.8
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +2 -2
- data/Rakefile.rb +6 -6
- data/TODO.txt +23 -3
- data/benchmark/similarity_benchmark.rb +18 -18
- data/lib/macroape/aligned_pair_intersection.rb +4 -4
- data/lib/macroape/cli/align_motifs.rb +34 -28
- data/lib/macroape/cli/eval_alignment.rb +73 -47
- data/lib/macroape/cli/eval_similarity.rb +65 -40
- data/lib/macroape/cli/find_pvalue.rb +30 -34
- data/lib/macroape/cli/find_threshold.rb +52 -41
- data/lib/macroape/cli/preprocess_collection.rb +68 -58
- data/lib/macroape/cli/scan_collection.rb +89 -73
- data/lib/macroape/cli.rb +184 -1
- data/lib/macroape/counting.rb +31 -5
- data/lib/macroape/pwm_compare.rb +8 -2
- data/lib/macroape/pwm_compare_aligned.rb +15 -10
- data/lib/macroape/version.rb +2 -1
- data/macroape.gemspec +2 -1
- data/spec/count_distribution_spec.rb +11 -11
- data/test/align_motifs_test.rb +16 -4
- data/test/data/{AHR_si.pat → AHR_si.pwm} +0 -0
- data/test/data/{KLF3_f1.pat → KLF3_f1.pwm} +0 -0
- data/test/data/{KLF4_f2.pat → KLF4_f2.pwm} +0 -0
- data/test/data/KLF4_f2_scan_results_all.txt +1 -2
- data/test/data/KLF4_f2_scan_results_default_cutoff.txt +1 -2
- data/test/data/KLF4_f2_scan_results_precise_mode.txt +1 -2
- data/test/data/KLF4_f2_scan_results_weak_threshold.txt +2 -0
- data/test/data/{SP1_f1.pat → SP1_f1.pwm} +0 -0
- data/test/data/{SP1_f1_revcomp.pat → SP1_f1_revcomp.pwm} +0 -0
- data/test/data/collection_pcm_without_thresholds.yaml +186 -183
- data/test/data/collection_without_thresholds.yaml +186 -183
- data/test/data/{medium_motif.pat → medium_motif.pwm} +0 -0
- data/test/data/{short_motif.pat → short_motif.pwm} +0 -0
- data/test/data/test_collection/{GABPA_f1.pat → GABPA_f1.pwm} +0 -0
- data/test/data/test_collection/{KLF4_f2.pat → KLF4_f2.pwm} +0 -0
- data/test/data/test_collection/{SP1_f1.pat → SP1_f1.pwm} +0 -0
- data/test/data/test_collection.yaml +179 -176
- data/test/data/test_collection_weak.yaml +214 -0
- data/test/eval_alignment_test.rb +97 -21
- data/test/eval_similarity_test.rb +104 -26
- data/test/find_pvalue_test.rb +22 -9
- data/test/find_threshold_test.rb +76 -25
- data/test/preprocess_collection_test.rb +16 -21
- data/test/scan_collection_test.rb +26 -14
- data/test/test_helper.rb +96 -12
- metadata +44 -24
data/README.md
CHANGED
@@ -23,8 +23,8 @@ Or install it yourself as:
|
|
23
23
|
MacroAPE has 7 command line tools:
|
24
24
|
|
25
25
|
### Tools for calculating thresholds and pvalues:
|
26
|
-
* find_threshold \<PWM file\> [
|
27
|
-
* find_pvalue \<PWM file\> \<threshold
|
26
|
+
* find_threshold \<PWM file\> [\<pvalue(by default: 0.0005)\>...]
|
27
|
+
* find_pvalue \<PWM file\> \<threshold\>...
|
28
28
|
|
29
29
|
### Tools for evaluating Jaccard similarity measure in the best alignment and in certain alignment:
|
30
30
|
* eval_similarity \<first PWM file\> \<second PWM file\>
|
data/Rakefile.rb
CHANGED
@@ -8,7 +8,7 @@ namespace :spec do
|
|
8
8
|
t.libs << "test"
|
9
9
|
t.test_files = FileList['test/*_test.rb']
|
10
10
|
t.verbose = true
|
11
|
-
end
|
11
|
+
end
|
12
12
|
RSpec::Core::RakeTask.new
|
13
13
|
end
|
14
14
|
|
@@ -19,23 +19,23 @@ namespace :benchmark do
|
|
19
19
|
task :run do
|
20
20
|
require 'open3'
|
21
21
|
time = Time.now.strftime("%d-%m-%Y, %H:%M:%S sec")
|
22
|
-
File.open('benchmark/benchmark.log','a') do |f|
|
23
|
-
f.puts "=========================================================\n#{time}\n"
|
22
|
+
File.open('benchmark/benchmark.log','a') do |f|
|
23
|
+
f.puts "=========================================================\n#{time}\n"
|
24
24
|
Dir.glob('benchmark/*_benchmark.rb') do |benchmark_filename|
|
25
25
|
Open3.popen3("ruby -I ./benchmark #{benchmark_filename}") do |inp, out, err, wait_thr|
|
26
26
|
benchmark_name = File.basename(benchmark_filename)
|
27
27
|
out_str = out.read
|
28
28
|
err_str = err.read
|
29
|
-
|
29
|
+
|
30
30
|
benchmark_infos = "-------------------\n#{benchmark_name}:\n#{out_str}\n"
|
31
31
|
benchmark_infos_to_file = benchmark_infos
|
32
32
|
puts benchmark_infos
|
33
|
-
|
33
|
+
|
34
34
|
if err_str && !err_str.empty?
|
35
35
|
STDERR.puts(err_str)
|
36
36
|
benchmark_infos_to_file = benchmark_infos + "\n!!!\nError:\n#{err_str}\n"
|
37
37
|
end
|
38
|
-
|
38
|
+
|
39
39
|
# add info about git commit (if everything is committed, otherwise to commit one should use special option -c)
|
40
40
|
f.puts benchmark_infos_to_file
|
41
41
|
end
|
data/TODO.txt
CHANGED
@@ -1,3 +1,23 @@
|
|
1
|
+
ToDo:
|
2
|
+
6)
|
3
|
+
# TODO: FIX: this test fails due to floating point precision error: estimated threshold is -19.0418 but '-19.0418'.to_f * 10000 = -190417.99999999997
|
4
|
+
# A workaround exists: we can use fractions, i.e. ('-19.0418'.to_r * 10000).to_f = -190418.0, but it obscures the code and, if used carelessly, can cause a huge slowdown.
|
5
|
+
# I think it'd be used only at input, to work around the discretization issue
|
6
|
+
#
|
7
|
+
# def test_process_large_pvalue_floating_point_error
|
8
|
+
# pvalue, threshold, real_pvalue = nil, nil, nil
|
9
|
+
# assert_nothing_raised {
|
10
|
+
# pvalue, threshold, real_pvalue = Helpers.find_threshold_output('KLF4_f2.pwm -p 0.8').strip.split("\t")
|
11
|
+
# }
|
12
|
+
# assert_equal '0.8', pvalue
|
13
|
+
# assert_equal Helpers.obtain_pvalue_by_threshold("KLF4_f2.pwm #{threshold}"), real_pvalue
|
14
|
+
# end
|
15
|
+
7) thresholds and thresholds_weak should return a collection (Array or Hash) when no block is given
|
16
|
+
merge these two methods into one parametrized method
|
17
|
+
8) (TODO: for theoretical consistency, at the cost of small inconsistencies with old calculations)
|
18
|
+
When we work with a strong threshold, we round the matrix up (in order to overestimate the threshold compared to the real one, thus obtaining an underestimated P-value) and take the upper bound of the discrete-thresholds fork.
|
19
|
+
When we are estimating the lower bound of the threshold (weak threshold), we take the lower bound of the fork of discrete thresholds. But we should ALSO (not done yet) discretize the matrix down! This would allow us to give an exact answer to the question of which range the real threshold should lie in for a given P-value; for now we correctly estimate only the lower bound of the threshold (upper bound of the P-value).
|
20
|
+
|
1
21
|
Specs and tests:
|
2
22
|
create spec on use of MaxHashSize, MaxHashSizeDouble
|
3
23
|
create spec for testing case when {real_pvalue == 0, threshold == best_score + 1}
|
@@ -8,11 +28,11 @@ Ideas to increase perfomance:
|
|
8
28
|
- (?) Rearrange rows by decreasing DIC in the aligned pair of matrices before counting
|
9
29
|
- Create a Java extension for alignment_intersection methods in order to increase performance
|
10
30
|
- Possibly the algorithm shouldn't use a hash but should have two passes: first it determines the possible hash scores for every length (if the worst suffix is always zero, it's a flat space of scores at all PWM prefix lengths) of each PWM separately. After that we can work with arrays which use such scores as indices via an additional substructure
|
11
|
-
|
12
|
-
Usability issues:
|
31
|
+
|
32
|
+
Usability issues:
|
13
33
|
make preprocess_collection able to add information to an existing collection of motifs. Make it possible to give the collection a name from the command line
|
14
34
|
|
15
35
|
remove .stdin placeholder. Use tty? method instead
|
16
36
|
|
17
37
|
use OptionParser or docopt
|
18
|
-
make options more uniform so that some of them were reusable(and the question: can I apply two option parsers consequently?)
|
38
|
+
make options more uniform so that some of them are reusable (and the question: can I apply two option parsers consecutively?)
|
@@ -3,15 +3,15 @@ require_relative 'benchmark_helper'
|
|
3
3
|
class TaskToBenchmark
|
4
4
|
def setup
|
5
5
|
@matrix_first = "KLF4_f2.xml
|
6
|
-
0.30861857265872605 -2.254321000121579 0.13505703522674192 0.3285194224375633
|
7
|
-
-1.227018967707036 -4.814127713368663 1.3059890687390967 -4.908681463544344
|
8
|
-
-2.443469374521196 -4.648238485031404 1.3588686548279805 -4.441801801188402
|
9
|
-
-2.7177827948276123 -3.8073538975356565 1.356272809724262 -3.504104725510225
|
10
|
-
-0.5563232977367343 0.5340697765121405 -3.61417723090579 0.5270259776377405
|
11
|
-
-1.8687622060887386 -4.381483976582316 1.337932245336098 -3.815629658877517
|
12
|
-
-2.045671123823928 -2.384975142213679 0.7198551207724355 0.5449254135616948
|
13
|
-
-1.373157530374372 -3.0063112097748217 1.285188335493552 -2.5026044231773543
|
14
|
-
-2.1030513122772208 -1.8941348100402244 1.249265758393991 -1.4284210948906104
|
6
|
+
0.30861857265872605 -2.254321000121579 0.13505703522674192 0.3285194224375633
|
7
|
+
-1.227018967707036 -4.814127713368663 1.3059890687390967 -4.908681463544344
|
8
|
+
-2.443469374521196 -4.648238485031404 1.3588686548279805 -4.441801801188402
|
9
|
+
-2.7177827948276123 -3.8073538975356565 1.356272809724262 -3.504104725510225
|
10
|
+
-0.5563232977367343 0.5340697765121405 -3.61417723090579 0.5270259776377405
|
11
|
+
-1.8687622060887386 -4.381483976582316 1.337932245336098 -3.815629658877517
|
12
|
+
-2.045671123823928 -2.384975142213679 0.7198551207724355 0.5449254135616948
|
13
|
+
-1.373157530374372 -3.0063112097748217 1.285188335493552 -2.5026044231773543
|
14
|
+
-2.1030513122772208 -1.8941348100402244 1.249265758393991 -1.4284210948906104
|
15
15
|
-1.3277128628152939 0.8982415633049462 -0.8080773665408135 -0.18161647647456935
|
16
16
|
"
|
17
17
|
|
@@ -27,25 +27,25 @@ class TaskToBenchmark
|
|
27
27
|
-0.4450938582835542 -2.2510053061629707 1.126543157436868 -1.7780413702431377
|
28
28
|
-1.1896356092245055 -1.2251832285630033 1.163676006374752 -1.6080243648157357
|
29
29
|
-0.5166047365590577 0.7641033353626651 -0.28626775700282125 -0.6825482097865606"
|
30
|
-
|
30
|
+
|
31
31
|
@pvalue = 0.0005
|
32
|
-
@discretization =
|
32
|
+
@discretization = 1
|
33
33
|
@first_background, @second_background = [1,1,1,1], [1,1,1,1]
|
34
|
-
|
35
|
-
@pwm_first = Bioinform::PWM.new(@matrix_first).background
|
36
|
-
@pwm_second = Bioinform::PWM.new(@matrix_second).background
|
34
|
+
|
35
|
+
@pwm_first = Bioinform::PWM.new(@matrix_first).set_parameters(background: @first_background).discrete(@discretization)
|
36
|
+
@pwm_second = Bioinform::PWM.new(@matrix_second).set_parameters(background: @second_background).discrete(@discretization)
|
37
37
|
@cmp = Macroape::PWMCompare.new(@pwm_first, @pwm_second)
|
38
|
+
@first_threshold = @pwm_first.threshold(@pvalue)
|
39
|
+
@second_threshold = @pwm_second.threshold(@pvalue)
|
38
40
|
self
|
39
41
|
end
|
40
42
|
|
41
43
|
def run
|
42
|
-
|
43
|
-
second_threshold = @pwm_second.threshold(@pvalue)
|
44
|
-
info = @cmp.jaccard(first_threshold, second_threshold)
|
44
|
+
info = @cmp.jaccard(@first_threshold, @second_threshold)
|
45
45
|
end
|
46
46
|
end
|
47
47
|
|
48
|
-
benchmark_result =
|
48
|
+
benchmark_result = 100.times.collect do
|
49
49
|
task_to_benchmark = TaskToBenchmark.new.setup
|
50
50
|
Benchmark.measure{ task_to_benchmark.run }
|
51
51
|
end.inject(&:+)
|
@@ -18,8 +18,8 @@ module Macroape
|
|
18
18
|
[result, result]
|
19
19
|
end
|
20
20
|
end
|
21
|
-
|
22
|
-
|
21
|
+
|
22
|
+
|
23
23
|
# block has form: {|score,letter| contribution to count by `letter` with `score` }
|
24
24
|
def get_counts(threshold_first, threshold_second, &count_contribution_block)
|
25
25
|
# scores_on_first_pwm, scores_on_second_pwm --> count
|
@@ -34,7 +34,7 @@ module Macroape
|
|
34
34
|
raise 'Hash overflow in Macroape::AlignedPairIntersection#counts_for_two_matrices_with_different_probabilities'
|
35
35
|
end
|
36
36
|
end
|
37
|
-
scores.inject(0.0){|sum,(score_first, hsh)| sum + hsh.inject(0.0){|sum,(score_second, count)| sum + count }}
|
37
|
+
scores.inject(0.0){|sum,(score_first, hsh)| sum + hsh.inject(0.0){|sum,(score_second, count)| sum + count }}
|
38
38
|
end
|
39
39
|
|
40
40
|
# wouldn't work without count_contribution_block
|
@@ -52,7 +52,7 @@ module Macroape
|
|
52
52
|
end
|
53
53
|
end
|
54
54
|
end
|
55
|
-
|
55
|
+
|
56
56
|
end
|
57
57
|
end
|
58
58
|
new_scores
|
@@ -1,49 +1,55 @@
|
|
1
|
+
require 'docopt'
|
1
2
|
require_relative '../../macroape'
|
2
3
|
|
3
4
|
module Macroape
|
4
5
|
module CLI
|
5
6
|
module AlignMotifs
|
6
|
-
|
7
|
+
|
7
8
|
def self.main(argv)
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
STDERR.puts help_string
|
20
|
-
exit
|
21
|
-
end
|
9
|
+
doc = <<-DOCOPT.strip_doc
|
10
|
+
Align motifs tool.
|
11
|
+
It takes motifs and builds alignment of each motif to the first (leader) motif.
|
12
|
+
|
13
|
+
Output has format:
|
14
|
+
pwm_file_1 shift_1 orientation_1
|
15
|
+
pwm_file_2 shift_2 orientation_2
|
16
|
+
pwm_file_3 shift_3 orientation_3
|
17
|
+
|
18
|
+
Usage:
|
19
|
+
align_motifs [options] <pm-files>...
|
22
20
|
|
21
|
+
Options:
|
22
|
+
-h --help Show this screen.
|
23
|
+
--pcm Use PCMs instead of PWMs as input
|
24
|
+
DOCOPT
|
23
25
|
|
24
|
-
|
25
|
-
|
26
|
+
options = Docopt::docopt(doc, argv: argv)
|
27
|
+
|
28
|
+
data_model = options['--pcm'] ? Bioinform::PCM : Bioinform::PWM
|
29
|
+
motif_files = options['<pm-files>']
|
30
|
+
leader = motif_files.first
|
26
31
|
background = [1,1,1,1]
|
27
|
-
discretization =
|
32
|
+
discretization = 1
|
28
33
|
pvalue = 0.0005
|
29
|
-
|
34
|
+
|
30
35
|
shifts = {leader => [0,:direct]}
|
31
|
-
pwm_first = data_model.new(File.read(leader)).to_pwm
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
+
pwm_first = data_model.new(File.read(leader)).to_pwm
|
37
|
+
pwm_first.set_parameters(background: background).discrete!(discretization)
|
38
|
+
motif_files[1..-1].each do |motif_name|
|
39
|
+
pwm_second = data_model.new(File.read(motif_name)).to_pwm
|
40
|
+
pwm_second.set_parameters(background: background).discrete!(discretization)
|
41
|
+
info = Macroape::PWMCompare.new(pwm_first, pwm_second).jaccard_by_pvalue(pvalue)
|
36
42
|
shifts[motif_name] = [info[:shift], info[:orientation]]
|
37
43
|
end
|
38
|
-
|
44
|
+
|
39
45
|
shifts.each do |motif_name, (shift,orientation)|
|
40
46
|
puts "#{motif_name}\t#{shift}\t#{orientation}"
|
41
47
|
end
|
42
48
|
|
43
|
-
rescue =>
|
44
|
-
|
49
|
+
rescue Docopt::Exit => e
|
50
|
+
puts e.message
|
45
51
|
end
|
46
|
-
|
52
|
+
|
47
53
|
end
|
48
54
|
end
|
49
55
|
end
|
@@ -3,48 +3,40 @@ require_relative '../../macroape'
|
|
3
3
|
module Macroape
|
4
4
|
module CLI
|
5
5
|
module EvalAlignment
|
6
|
-
|
6
|
+
|
7
7
|
def self.main(argv)
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
ruby eval_alignment.rb motifs/KLF4_f2.pat motifs/SP1_f1.pat -1 direct -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
|
29
|
-
or on windows
|
30
|
-
type motifs/SP1.pat | ruby eval_alignment.rb motifs/KLF4.pat .stdin 0 revcomp -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
|
31
|
-
or in linux
|
32
|
-
cat motifs/KLF4.pat motifs/SP1.pat | ruby eval_alignment.rb .stdin .stdin 3 direct -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
|
33
|
-
}
|
34
|
-
|
35
|
-
if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
|
36
|
-
STDERR.puts help_string
|
8
|
+
doc = <<-EOS.strip_doc
|
9
|
+
Command-line format:
|
10
|
+
#{run_tool_cmd} <1st matrix pat-file> <2nd matrix pat-file> <shift> <orientation(direct/revcomp)> [options]
|
11
|
+
|
12
|
+
Options:
|
13
|
+
[-p <P-value>]
|
14
|
+
[-d <discretization level>]
|
15
|
+
[--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
|
16
|
+
[--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
|
17
|
+
[-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
|
18
|
+
[--first-threshold <threshold for the first matrix>]
|
19
|
+
[--second-threshold <threshold for the second matrix>]
|
20
|
+
|
21
|
+
Examples:
|
22
|
+
#{run_tool_cmd} motifs/KLF4_f2.pat motifs/SP1_f1.pat -1 direct -p 0.0005 -d 100 -b 0.3,0.2,0.2,0.3
|
23
|
+
#{run_tool_cmd} motifs/KLF4_f2.pat motifs/SP1_f1.pat 3 revcomp
|
24
|
+
EOS
|
25
|
+
|
26
|
+
if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
|
27
|
+
STDERR.puts doc
|
37
28
|
exit
|
38
29
|
end
|
39
30
|
|
40
31
|
pvalue = 0.0005
|
41
|
-
discretization = 10
|
32
|
+
discretization = 10.0
|
42
33
|
|
43
34
|
first_background = [1,1,1,1]
|
44
35
|
second_background = [1,1,1,1]
|
45
|
-
max_hash_size =
|
46
|
-
max_pair_hash_size =
|
47
|
-
|
36
|
+
max_hash_size = 10000000
|
37
|
+
max_pair_hash_size = 10000
|
38
|
+
pvalue_boundary = :upper
|
39
|
+
|
48
40
|
data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
|
49
41
|
|
50
42
|
first_file = argv.shift
|
@@ -53,9 +45,9 @@ module Macroape
|
|
53
45
|
shift = argv.shift
|
54
46
|
orientation = argv.shift
|
55
47
|
|
56
|
-
raise
|
57
|
-
raise 'You
|
58
|
-
raise 'You
|
48
|
+
raise 'You should specify two input sources (each is filename or .stdin)' unless first_file and second_file
|
49
|
+
raise 'You should specify shift' unless shift
|
50
|
+
raise 'You should specify orientation' unless orientation
|
59
51
|
|
60
52
|
shift = shift.to_i
|
61
53
|
orientation = orientation.to_sym
|
@@ -76,16 +68,23 @@ module Macroape
|
|
76
68
|
pvalue = argv.shift.to_f
|
77
69
|
when '-d'
|
78
70
|
discretization = argv.shift.to_f
|
79
|
-
when '-
|
71
|
+
when '--max-hash-size'
|
80
72
|
max_hash_size = argv.shift.to_i
|
81
|
-
when '-
|
73
|
+
when '--max-2d-hash-size'
|
82
74
|
max_pair_hash_size = argv.shift.to_i
|
83
75
|
when '-b'
|
84
|
-
second_background = first_background = argv.shift(
|
76
|
+
second_background = first_background = argv.shift.split(',').map(&:to_f)
|
85
77
|
when '-b1'
|
86
|
-
first_background = argv.shift(
|
78
|
+
first_background = argv.shift.split(',').map(&:to_f)
|
87
79
|
when '-b2'
|
88
|
-
second_background = argv.shift(
|
80
|
+
second_background = argv.shift.split(',').map(&:to_f)
|
81
|
+
when '--boundary'
|
82
|
+
pvalue_boundary = argv.shift.to_sym
|
83
|
+
raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
|
84
|
+
when '--first-threshold'
|
85
|
+
predefined_threshold_first = argv.shift.to_f
|
86
|
+
when '--second-threshold'
|
87
|
+
predefined_threshold_second = argv.shift.to_f
|
89
88
|
end
|
90
89
|
end
|
91
90
|
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background == first_background.reverse
|
@@ -95,7 +94,7 @@ module Macroape
|
|
95
94
|
input = $stdin.read
|
96
95
|
parser = data_model.choose_parser(input).new(input)
|
97
96
|
end
|
98
|
-
|
97
|
+
|
99
98
|
if first_file == '.stdin'
|
100
99
|
input_first = parser.parse
|
101
100
|
else
|
@@ -111,18 +110,45 @@ module Macroape
|
|
111
110
|
input_second = File.read(second_file)
|
112
111
|
end
|
113
112
|
pwm_second = data_model.new(input_second).to_pwm
|
114
|
-
|
113
|
+
|
115
114
|
pwm_first.set_parameters(background: first_background, max_hash_size: max_hash_size).discrete!(discretization)
|
116
115
|
pwm_second.set_parameters(background: second_background, max_hash_size: max_hash_size).discrete!(discretization)
|
117
116
|
|
118
117
|
cmp = Macroape::PWMCompareAligned.new(pwm_first, pwm_second, shift, orientation).set_parameters(max_pair_hash_size: max_pair_hash_size)
|
119
118
|
|
120
|
-
|
119
|
+
if predefined_threshold_first
|
120
|
+
threshold_first = predefined_threshold_first * discretization
|
121
|
+
else
|
122
|
+
if pvalue_boundary == :lower
|
123
|
+
threshold_first = pwm_first.threshold(pvalue)
|
124
|
+
else
|
125
|
+
threshold_first = pwm_first.weak_threshold(pvalue)
|
126
|
+
end
|
127
|
+
end
|
121
128
|
|
122
|
-
|
129
|
+
if predefined_threshold_second
|
130
|
+
threshold_second = predefined_threshold_second * discretization
|
131
|
+
else
|
132
|
+
if pvalue_boundary == :lower
|
133
|
+
threshold_second = pwm_second.threshold(pvalue)
|
134
|
+
else
|
135
|
+
threshold_second = pwm_second.weak_threshold(pvalue)
|
136
|
+
end
|
137
|
+
end
|
138
|
+
info = cmp.alignment_infos.merge( cmp.jaccard(threshold_first, threshold_second) )
|
139
|
+
info.merge!(predefined_threshold_first: predefined_threshold_first,
|
140
|
+
predefined_threshold_second: predefined_threshold_second,
|
141
|
+
threshold_first: threshold_first / discretization,
|
142
|
+
threshold_second: threshold_second / discretization,
|
143
|
+
discretization: discretization,
|
144
|
+
first_background: first_background,
|
145
|
+
second_background: second_background,
|
146
|
+
requested_pvalue: pvalue,
|
147
|
+
pvalue_boundary: pvalue_boundary)
|
148
|
+
puts Helper.similarity_info_string(info)
|
123
149
|
|
124
150
|
rescue => err
|
125
|
-
STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse
|
151
|
+
STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
|
126
152
|
end
|
127
153
|
|
128
154
|
end
|
@@ -3,54 +3,44 @@ require_relative '../../macroape'
|
|
3
3
|
module Macroape
|
4
4
|
module CLI
|
5
5
|
module EvalSimilarity
|
6
|
-
|
6
|
+
|
7
7
|
def self.main(argv)
|
8
|
-
|
8
|
+
doc = <<-EOS.strip_doc
|
9
9
|
Command-line format:
|
10
|
-
|
11
|
-
or on windows
|
12
|
-
type <1st matrix pat-file> <2nd matrix pat-file> | ruby eval_similarity.rb .stdin .stdin [options]
|
13
|
-
or in linux
|
14
|
-
cat <1st matrix pat-file> <2nd matrix pat-file> | ruby eval_similarity.rb .stdin .stdin [options]
|
10
|
+
#{run_tool_cmd} <1st matrix pat-file> <2nd matrix pat-file> [options]
|
15
11
|
|
16
12
|
Options:
|
17
13
|
[-p <P-value>]
|
18
14
|
[-d <discretization level>]
|
19
|
-
[-
|
20
|
-
|
21
|
-
|
22
|
-
<
|
23
|
-
|
24
|
-
<optimal alignment, the 1st matrix>
|
25
|
-
<optimal alignment, the 2nd matrix>
|
26
|
-
<shift> <orientation>
|
15
|
+
[--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
|
16
|
+
[--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
|
17
|
+
[-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
|
18
|
+
[--first-threshold <threshold for the first matrix>]
|
19
|
+
[--second-threshold <threshold for the second matrix>]
|
27
20
|
|
28
21
|
Examples:
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
}
|
35
|
-
|
36
|
-
if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
|
37
|
-
STDERR.puts help_string
|
22
|
+
#{run_tool_cmd} motifs/KLF4_f2.pat motifs/SP1_f1.pat -p 0.0005 -d 100 -b 0.3,0.2,0.2,0.3
|
23
|
+
EOS
|
24
|
+
|
25
|
+
if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
|
26
|
+
STDERR.puts doc
|
38
27
|
exit
|
39
28
|
end
|
40
29
|
|
41
30
|
pvalue = 0.0005
|
42
|
-
discretization = 10
|
31
|
+
discretization = 10.0
|
43
32
|
|
44
33
|
first_background = [1,1,1,1]
|
45
34
|
second_background = [1,1,1,1]
|
46
35
|
|
47
|
-
max_hash_size =
|
48
|
-
max_pair_hash_size =
|
36
|
+
max_hash_size = 10000000
|
37
|
+
max_pair_hash_size = 10000
|
38
|
+
pvalue_boundary = :upper
|
49
39
|
|
50
|
-
data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
|
40
|
+
data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
|
51
41
|
first_file = argv.shift
|
52
42
|
second_file = argv.shift
|
53
|
-
raise
|
43
|
+
raise 'You should specify two input files' unless first_file and second_file
|
54
44
|
|
55
45
|
until argv.empty?
|
56
46
|
case argv.shift
|
@@ -58,16 +48,23 @@ module Macroape
|
|
58
48
|
pvalue = argv.shift.to_f
|
59
49
|
when '-d'
|
60
50
|
discretization = argv.shift.to_f
|
61
|
-
when '-
|
51
|
+
when '--max-hash-size'
|
62
52
|
max_hash_size = argv.shift.to_i
|
63
|
-
when '-
|
53
|
+
when '--max-2d-hash-size'
|
64
54
|
max_pair_hash_size = argv.shift.to_i
|
65
55
|
when '-b'
|
66
|
-
second_background = first_background = argv.shift(
|
56
|
+
second_background = first_background = argv.shift.split(',').map(&:to_f)
|
67
57
|
when '-b1'
|
68
|
-
first_background = argv.shift(
|
58
|
+
first_background = argv.shift.split(',').map(&:to_f)
|
69
59
|
when '-b2'
|
70
|
-
second_background = argv.shift(
|
60
|
+
second_background = argv.shift.split(',').map(&:to_f)
|
61
|
+
when '--boundary'
|
62
|
+
pvalue_boundary = argv.shift.to_sym
|
63
|
+
raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
|
64
|
+
when '--first-threshold'
|
65
|
+
predefined_threshold_first = argv.shift.to_f
|
66
|
+
when '--second-threshold'
|
67
|
+
predefined_threshold_second = argv.shift.to_f
|
71
68
|
end
|
72
69
|
end
|
73
70
|
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background == first_background.reverse
|
@@ -77,7 +74,7 @@ module Macroape
|
|
77
74
|
input = $stdin.read
|
78
75
|
parser = data_model.choose_parser(input).new(input)
|
79
76
|
end
|
80
|
-
|
77
|
+
|
81
78
|
if first_file == '.stdin'
|
82
79
|
input_first = parser.parse
|
83
80
|
else
|
@@ -93,20 +90,48 @@ module Macroape
|
|
93
90
|
input_second = File.read(second_file)
|
94
91
|
end
|
95
92
|
pwm_second = data_model.new(input_second).to_pwm
|
96
|
-
|
93
|
+
|
97
94
|
pwm_first.set_parameters(background: first_background, max_hash_size: max_hash_size).discrete!(discretization)
|
98
95
|
pwm_second.set_parameters(background: second_background, max_hash_size: max_hash_size).discrete!(discretization)
|
99
96
|
|
100
97
|
cmp = Macroape::PWMCompare.new(pwm_first, pwm_second).set_parameters(max_pair_hash_size: max_pair_hash_size)
|
101
98
|
|
102
|
-
|
99
|
+
if predefined_threshold_first
|
100
|
+
threshold_first = predefined_threshold_first * discretization
|
101
|
+
else
|
102
|
+
if pvalue_boundary == :lower
|
103
|
+
threshold_first = pwm_first.threshold(pvalue)
|
104
|
+
else
|
105
|
+
threshold_first = pwm_first.weak_threshold(pvalue)
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
109
|
+
if predefined_threshold_second
|
110
|
+
threshold_second = predefined_threshold_second * discretization
|
111
|
+
else
|
112
|
+
if pvalue_boundary == :lower
|
113
|
+
threshold_second = pwm_second.threshold(pvalue)
|
114
|
+
else
|
115
|
+
threshold_second = pwm_second.weak_threshold(pvalue)
|
116
|
+
end
|
117
|
+
end
|
103
118
|
|
104
|
-
|
119
|
+
info = cmp.jaccard(threshold_first, threshold_second)
|
120
|
+
info.merge!(predefined_threshold_first: predefined_threshold_first,
|
121
|
+
predefined_threshold_second: predefined_threshold_second,
|
122
|
+
threshold_first: threshold_first.to_f / discretization,
|
123
|
+
threshold_second: threshold_second.to_f / discretization,
|
124
|
+
discretization: discretization,
|
125
|
+
first_background: first_background,
|
126
|
+
second_background: second_background,
|
127
|
+
requested_pvalue: pvalue,
|
128
|
+
pvalue_boundary: pvalue_boundary)
|
129
|
+
puts Helper.similarity_info_string(info)
|
105
130
|
|
106
131
|
rescue => err
|
107
|
-
STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse
|
132
|
+
STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
|
108
133
|
end
|
109
|
-
|
134
|
+
|
110
135
|
end
|
111
136
|
end
|
112
137
|
end
|