macroape 4.0.2 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +17 -17
- data/Gemfile +4 -4
- data/LICENSE +22 -22
- data/README.md +70 -70
- data/Rakefile.rb +49 -49
- data/TODO.txt +46 -46
- data/benchmark/benchmark_helper.rb +4 -4
- data/benchmark/similarity_benchmark.rb +52 -52
- data/bin/align_motifs +4 -4
- data/bin/eval_alignment +4 -4
- data/bin/eval_similarity +4 -4
- data/bin/find_pvalue +4 -4
- data/bin/find_threshold +4 -4
- data/bin/preprocess_collection +4 -4
- data/bin/scan_collection +4 -4
- data/lib/macroape.rb +14 -11
- data/lib/macroape/aligned_pair_intersection.rb +61 -62
- data/lib/macroape/cli.rb +191 -188
- data/lib/macroape/cli/align_motifs.rb +120 -100
- data/lib/macroape/cli/eval_alignment.rb +157 -156
- data/lib/macroape/cli/eval_similarity.rb +138 -137
- data/lib/macroape/cli/find_pvalue.rb +93 -87
- data/lib/macroape/cli/find_threshold.rb +103 -96
- data/lib/macroape/cli/preprocess_collection.rb +169 -161
- data/lib/macroape/cli/scan_collection.rb +171 -163
- data/lib/macroape/collection.rb +29 -0
- data/lib/macroape/motif_with_thresholds.rb +18 -0
- data/lib/macroape/pwm_compare.rb +39 -44
- data/lib/macroape/pwm_compare_aligned.rb +139 -130
- data/lib/macroape/{counting.rb → pwm_counting.rb} +175 -121
- data/lib/macroape/support/inverf.rb +13 -0
- data/lib/macroape/support/partial_sums.rb +17 -0
- data/lib/macroape/version.rb +4 -4
- data/macroape.gemspec +19 -19
- data/spec/count_distribution_spec.rb +112 -109
- data/spec/inverf_spec.rb +23 -0
- data/spec/partial_sums_spec.rb +28 -0
- data/spec/spec_helper.rb +11 -11
- data/test/align_motifs_test.rb +42 -43
- data/test/data/AHR_si.pwm +10 -10
- data/test/data/KLF3_f1.pcm +16 -16
- data/test/data/KLF3_f1.pwm +16 -16
- data/test/data/KLF4_f2.pcm +11 -11
- data/test/data/KLF4_f2.pwm +11 -11
- data/test/data/KLF4_f2_scan_results_all.txt +2 -2
- data/test/data/KLF4_f2_scan_results_default_cutoff.txt +1 -1
- data/test/data/KLF4_f2_scan_results_precise_mode.txt +2 -2
- data/test/data/SP1_f1.pcm +12 -12
- data/test/data/SP1_f1.pwm +12 -12
- data/test/data/SP1_f1_revcomp.pcm +12 -12
- data/test/data/SP1_f1_revcomp.pwm +12 -12
- data/test/data/medium_motif.pwm +8 -8
- data/test/data/short_motif.pwm +7 -7
- data/test/data/test_collection.yaml +231 -214
- data/test/data/test_collection/GABPA_f1.pwm +14 -14
- data/test/data/test_collection/KLF4_f2.pwm +10 -10
- data/test/data/test_collection/SP1_f1.pwm +12 -12
- data/test/data/test_collection_pcm/GABPA_f1.pcm +14 -14
- data/test/data/test_collection_pcm/KLF4_f2.pcm +11 -11
- data/test/data/test_collection_pcm/SP1_f1.pcm +12 -12
- data/test/data/test_collection_single_file.txt +38 -38
- data/test/data/test_collection_single_file_pcm.txt +37 -37
- data/test/data/test_collection_weak.yaml +231 -214
- data/test/eval_alignment_test.rb +90 -111
- data/test/eval_similarity_test.rb +105 -123
- data/test/find_pvalue_test.rb +34 -39
- data/test/find_threshold_test.rb +87 -91
- data/test/preprocess_collection_test.rb +56 -65
- data/test/scan_collection_test.rb +42 -48
- data/test/test_helper.rb +159 -160
- metadata +14 -10
- data/test/data/collection_pcm_without_thresholds.yaml +0 -188
- data/test/data/collection_without_thresholds.yaml +0 -188
@@ -1,100 +1,120 @@
|
|
1
|
-
require_relative '../../macroape'
|
2
|
-
require 'shellwords'
|
3
|
-
|
4
|
-
module Macroape
|
5
|
-
module CLI
|
6
|
-
module AlignMotifs
|
7
|
-
|
8
|
-
def self.main(argv)
|
9
|
-
doc = <<-EOS.strip_doc
|
10
|
-
Align motifs tool.
|
11
|
-
It takes motifs and builds alignment of each motif to the first (leader) motif.
|
12
|
-
|
13
|
-
Output has format:
|
14
|
-
pwm_file_1 shift_1 orientation_1
|
15
|
-
pwm_file_2 shift_2 orientation_2
|
16
|
-
pwm_file_3 shift_3 orientation_3
|
17
|
-
|
18
|
-
Usage:
|
19
|
-
#{run_tool_cmd} [options] <leader pm> <rest pm files>...
|
20
|
-
or
|
21
|
-
ls rest_pms/*.pm | #{run_tool_cmd} [options]
|
22
|
-
or
|
23
|
-
ls rest_pms/*.pm | #{run_tool_cmd} [options] <leader pm>
|
24
|
-
|
25
|
-
Options:
|
26
|
-
[-p <P-value>]
|
27
|
-
[-d <discretization level>]
|
28
|
-
[--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
|
29
|
-
[--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
|
30
|
-
[-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
|
31
|
-
EOS
|
32
|
-
|
33
|
-
if (argv.empty? && $stdin.tty?) || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
|
34
|
-
$stderr.puts doc
|
35
|
-
exit
|
36
|
-
end
|
37
|
-
|
38
|
-
leader_background =
|
39
|
-
rest_motifs_background =
|
40
|
-
discretization = 1
|
41
|
-
pvalue = 0.0005
|
42
|
-
max_hash_size = 10000000
|
43
|
-
max_pair_hash_size = 10000
|
44
|
-
pvalue_boundary = :upper
|
45
|
-
|
46
|
-
data_model = argv.delete('--pcm') ?
|
47
|
-
|
48
|
-
while argv.first && argv.first.start_with?('-')
|
49
|
-
case argv.shift
|
50
|
-
when '-p'
|
51
|
-
pvalue = argv.shift.to_f
|
52
|
-
when '-d'
|
53
|
-
discretization = argv.shift.to_f
|
54
|
-
when '--max-hash-size'
|
55
|
-
max_hash_size = argv.shift.to_i
|
56
|
-
when '--max-2d-hash-size'
|
57
|
-
max_pair_hash_size = argv.shift.to_i
|
58
|
-
when '-b'
|
59
|
-
rest_motifs_background = leader_background = argv.shift
|
60
|
-
when '-b1'
|
61
|
-
leader_background = argv.shift
|
62
|
-
when '-b2'
|
63
|
-
rest_motifs_background = argv.shift
|
64
|
-
when '--boundary'
|
65
|
-
pvalue_boundary = argv.shift.to_sym
|
66
|
-
raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
|
67
|
-
end
|
68
|
-
end
|
69
|
-
|
70
|
-
pwm_files = argv
|
71
|
-
pwm_files += $stdin.read.shellsplit unless $stdin.tty?
|
72
|
-
leader_pwm_file = pwm_files.first
|
73
|
-
rest_pwm_files = pwm_files[1..-1]
|
74
|
-
rest_pwm_files.reject!{|filename| File.expand_path(filename) == File.expand_path(leader_pwm_file)}
|
75
|
-
|
76
|
-
raise 'Specify leader file' unless leader_pwm_file
|
77
|
-
|
78
|
-
shifts = []
|
79
|
-
shifts << [leader_pwm_file, 0, :direct]
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
end
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
1
|
+
require_relative '../../macroape'
|
2
|
+
require 'shellwords'
|
3
|
+
|
4
|
+
module Macroape
|
5
|
+
module CLI
|
6
|
+
module AlignMotifs
|
7
|
+
|
8
|
+
def self.main(argv)
|
9
|
+
doc = <<-EOS.strip_doc
|
10
|
+
Align motifs tool.
|
11
|
+
It takes motifs and builds alignment of each motif to the first (leader) motif.
|
12
|
+
|
13
|
+
Output has format:
|
14
|
+
pwm_file_1 shift_1 orientation_1
|
15
|
+
pwm_file_2 shift_2 orientation_2
|
16
|
+
pwm_file_3 shift_3 orientation_3
|
17
|
+
|
18
|
+
Usage:
|
19
|
+
#{run_tool_cmd} [options] <leader pm> <rest pm files>...
|
20
|
+
or
|
21
|
+
ls rest_pms/*.pm | #{run_tool_cmd} [options]
|
22
|
+
or
|
23
|
+
ls rest_pms/*.pm | #{run_tool_cmd} [options] <leader pm>
|
24
|
+
|
25
|
+
Options:
|
26
|
+
[-p <P-value>]
|
27
|
+
[-d <discretization level>]
|
28
|
+
[--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
|
29
|
+
[--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
|
30
|
+
[-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
|
31
|
+
EOS
|
32
|
+
|
33
|
+
if (argv.empty? && $stdin.tty?) || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
|
34
|
+
$stderr.puts doc
|
35
|
+
exit
|
36
|
+
end
|
37
|
+
|
38
|
+
leader_background = Bioinform::Background::Wordwise
|
39
|
+
rest_motifs_background = Bioinform::Background::Wordwise
|
40
|
+
discretization = 1
|
41
|
+
pvalue = 0.0005
|
42
|
+
max_hash_size = 10000000
|
43
|
+
max_pair_hash_size = 10000
|
44
|
+
pvalue_boundary = :upper
|
45
|
+
|
46
|
+
data_model = argv.delete('--pcm') ? :pcm : :pwm
|
47
|
+
|
48
|
+
while argv.first && argv.first.start_with?('-')
|
49
|
+
case argv.shift
|
50
|
+
when '-p'
|
51
|
+
pvalue = argv.shift.to_f
|
52
|
+
when '-d'
|
53
|
+
discretization = argv.shift.to_f
|
54
|
+
when '--max-hash-size'
|
55
|
+
max_hash_size = argv.shift.to_i
|
56
|
+
when '--max-2d-hash-size'
|
57
|
+
max_pair_hash_size = argv.shift.to_i
|
58
|
+
when '-b'
|
59
|
+
rest_motifs_background = leader_background = Bioinform::Background.from_string(argv.shift)
|
60
|
+
when '-b1'
|
61
|
+
leader_background = Bioinform::Background.from_string(argv.shift)
|
62
|
+
when '-b2'
|
63
|
+
rest_motifs_background = Bioinform::Background.from_string(argv.shift)
|
64
|
+
when '--boundary'
|
65
|
+
pvalue_boundary = argv.shift.to_sym
|
66
|
+
raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
pwm_files = argv
|
71
|
+
pwm_files += $stdin.read.shellsplit unless $stdin.tty?
|
72
|
+
leader_pwm_file = pwm_files.first
|
73
|
+
rest_pwm_files = pwm_files[1..-1]
|
74
|
+
rest_pwm_files.reject!{|filename| File.expand_path(filename) == File.expand_path(leader_pwm_file)}
|
75
|
+
|
76
|
+
raise 'Specify leader file' unless leader_pwm_file
|
77
|
+
|
78
|
+
shifts = []
|
79
|
+
shifts << [leader_pwm_file, 0, :direct]
|
80
|
+
|
81
|
+
input_first = File.read(leader_pwm_file)
|
82
|
+
input_first = Bioinform::MatrixParser.new.parse!(input_first)
|
83
|
+
case data_model
|
84
|
+
when :pcm
|
85
|
+
pcm_first = Bioinform::MotifModel::PCM.new(input_first[:matrix]).named(input_first[:name])
|
86
|
+
pwm_first = Bioinform::ConversionAlgorithms::PCM2PWMConverter.new(pseudocount: :log, background: leader_background).convert(pcm_first)
|
87
|
+
when :pwm
|
88
|
+
pwm_first = Bioinform::MotifModel::PWM.new(input_first[:matrix]).named(input_first[:name])
|
89
|
+
end
|
90
|
+
|
91
|
+
pwm_first = pwm_first.discreted(discretization)
|
92
|
+
counting_first = PWMCounting.new(pwm_first, background: leader_background, max_hash_size: max_hash_size)
|
93
|
+
|
94
|
+
rest_pwm_files.each do |motif_name|
|
95
|
+
input_second = File.read(motif_name)
|
96
|
+
input_second = Bioinform::MatrixParser.new.parse!(input_second)
|
97
|
+
case data_model
|
98
|
+
when :pcm
|
99
|
+
pcm_second = Bioinform::MotifModel::PCM.new(input_second[:matrix]).named(input_second[:name])
|
100
|
+
pwm_second = Bioinform::ConversionAlgorithms::PCM2PWMConverter.new(pseudocount: :log, background: rest_motifs_background).convert(pcm_second)
|
101
|
+
when :pwm
|
102
|
+
pwm_second = Bioinform::MotifModel::PWM.new(input_second[:matrix]).named(input_second[:name])
|
103
|
+
end
|
104
|
+
pwm_second = pwm_second.discreted(discretization)
|
105
|
+
counting_second = PWMCounting.new(pwm_second, background: rest_motifs_background, max_hash_size: max_hash_size)
|
106
|
+
cmp = Macroape::PWMCompare.new(counting_first, counting_second).tap{|x| x.max_pair_hash_size = max_pair_hash_size }
|
107
|
+
info = cmp.jaccard_by_pvalue(pvalue)
|
108
|
+
shifts << [motif_name, info[:shift], info[:orientation]]
|
109
|
+
end
|
110
|
+
|
111
|
+
shifts.each do |motif_name, shift,orientation|
|
112
|
+
puts "#{motif_name}\t#{shift}\t#{orientation}"
|
113
|
+
end
|
114
|
+
rescue => err
|
115
|
+
$stderr.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
|
116
|
+
end
|
117
|
+
|
118
|
+
end
|
119
|
+
end
|
120
|
+
end
|
@@ -1,156 +1,157 @@
|
|
1
|
-
require_relative '../../macroape'
|
2
|
-
|
3
|
-
module Macroape
|
4
|
-
module CLI
|
5
|
-
module EvalAlignment
|
6
|
-
|
7
|
-
def self.main(argv)
|
8
|
-
doc = <<-EOS.strip_doc
|
9
|
-
Command-line format:
|
10
|
-
#{run_tool_cmd} <1st matrix pat-file> <2nd matrix pat-file> <shift> <orientation(direct/revcomp)> [options]
|
11
|
-
|
12
|
-
Options:
|
13
|
-
[-p <P-value>]
|
14
|
-
[-d <discretization level>]
|
15
|
-
[--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
|
16
|
-
[--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
|
17
|
-
[-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
|
18
|
-
[--first-threshold <threshold for the first matrix>]
|
19
|
-
[--second-threshold <threshold for the second matrix>]
|
20
|
-
|
21
|
-
Examples:
|
22
|
-
#{run_tool_cmd} motifs/KLF4_f2.pat motifs/SP1_f1.pat -1 direct -p 0.0005 -d 100 -b 0.3,0.2,0.2,0.3
|
23
|
-
#{run_tool_cmd} motifs/KLF4_f2.pat motifs/SP1_f1.pat 3 revcomp
|
24
|
-
EOS
|
25
|
-
|
26
|
-
if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
|
27
|
-
$stderr.puts doc
|
28
|
-
exit
|
29
|
-
end
|
30
|
-
|
31
|
-
pvalue = 0.0005
|
32
|
-
discretization = 10.0
|
33
|
-
|
34
|
-
first_background =
|
35
|
-
second_background =
|
36
|
-
max_hash_size = 10000000
|
37
|
-
max_pair_hash_size = 10000
|
38
|
-
pvalue_boundary = :upper
|
39
|
-
|
40
|
-
data_model = argv.delete('--pcm') ?
|
41
|
-
|
42
|
-
first_file = argv.shift
|
43
|
-
second_file = argv.shift
|
44
|
-
|
45
|
-
shift = argv.shift
|
46
|
-
orientation = argv.shift
|
47
|
-
|
48
|
-
raise 'You should specify two input
|
49
|
-
raise 'You should specify shift' unless shift
|
50
|
-
raise 'You should specify orientation' unless orientation
|
51
|
-
|
52
|
-
shift = shift.to_i
|
53
|
-
orientation = orientation.to_sym
|
54
|
-
|
55
|
-
case orientation
|
56
|
-
when :direct
|
57
|
-
reverse = false
|
58
|
-
when :revcomp
|
59
|
-
reverse = true
|
60
|
-
else
|
61
|
-
raise 'Unknown orientation(direct/revcomp)'
|
62
|
-
end
|
63
|
-
|
64
|
-
|
65
|
-
until argv.empty?
|
66
|
-
case argv.shift
|
67
|
-
when '-p'
|
68
|
-
pvalue = argv.shift.to_f
|
69
|
-
when '-d'
|
70
|
-
discretization = argv.shift.to_f
|
71
|
-
when '--max-hash-size'
|
72
|
-
max_hash_size = argv.shift.to_i
|
73
|
-
when '--max-2d-hash-size'
|
74
|
-
max_pair_hash_size = argv.shift.to_i
|
75
|
-
when '-b'
|
76
|
-
second_background = first_background = argv.shift
|
77
|
-
when '-b1'
|
78
|
-
first_background = argv.shift
|
79
|
-
when '-b2'
|
80
|
-
second_background = argv.shift
|
81
|
-
when '--boundary'
|
82
|
-
pvalue_boundary = argv.shift.to_sym
|
83
|
-
raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
|
84
|
-
when '--first-threshold'
|
85
|
-
predefined_threshold_first = argv.shift.to_f
|
86
|
-
when '--second-threshold'
|
87
|
-
predefined_threshold_second = argv.shift.to_f
|
88
|
-
end
|
89
|
-
end
|
90
|
-
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background
|
91
|
-
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless second_background
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
info.merge
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
end
|
1
|
+
require_relative '../../macroape'
|
2
|
+
|
3
|
+
module Macroape
|
4
|
+
module CLI
|
5
|
+
module EvalAlignment
|
6
|
+
|
7
|
+
def self.main(argv)
|
8
|
+
doc = <<-EOS.strip_doc
|
9
|
+
Command-line format:
|
10
|
+
#{run_tool_cmd} <1st matrix pat-file> <2nd matrix pat-file> <shift> <orientation(direct/revcomp)> [options]
|
11
|
+
|
12
|
+
Options:
|
13
|
+
[-p <P-value>]
|
14
|
+
[-d <discretization level>]
|
15
|
+
[--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
|
16
|
+
[--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
|
17
|
+
[-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
|
18
|
+
[--first-threshold <threshold for the first matrix>]
|
19
|
+
[--second-threshold <threshold for the second matrix>]
|
20
|
+
|
21
|
+
Examples:
|
22
|
+
#{run_tool_cmd} motifs/KLF4_f2.pat motifs/SP1_f1.pat -1 direct -p 0.0005 -d 100 -b 0.3,0.2,0.2,0.3
|
23
|
+
#{run_tool_cmd} motifs/KLF4_f2.pat motifs/SP1_f1.pat 3 revcomp
|
24
|
+
EOS
|
25
|
+
|
26
|
+
if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
|
27
|
+
$stderr.puts doc
|
28
|
+
exit
|
29
|
+
end
|
30
|
+
|
31
|
+
pvalue = 0.0005
|
32
|
+
discretization = 10.0
|
33
|
+
|
34
|
+
first_background = Bioinform::Background::Wordwise
|
35
|
+
second_background = Bioinform::Background::Wordwise
|
36
|
+
max_hash_size = 10000000
|
37
|
+
max_pair_hash_size = 10000
|
38
|
+
pvalue_boundary = :upper
|
39
|
+
|
40
|
+
data_model = argv.delete('--pcm') ? :pcm : :pwm
|
41
|
+
|
42
|
+
first_file = argv.shift
|
43
|
+
second_file = argv.shift
|
44
|
+
|
45
|
+
shift = argv.shift
|
46
|
+
orientation = argv.shift
|
47
|
+
|
48
|
+
raise 'You should specify two input files' unless first_file and second_file
|
49
|
+
raise 'You should specify shift' unless shift
|
50
|
+
raise 'You should specify orientation' unless orientation
|
51
|
+
|
52
|
+
shift = shift.to_i
|
53
|
+
orientation = orientation.to_sym
|
54
|
+
|
55
|
+
case orientation
|
56
|
+
when :direct
|
57
|
+
reverse = false
|
58
|
+
when :revcomp
|
59
|
+
reverse = true
|
60
|
+
else
|
61
|
+
raise 'Unknown orientation(direct/revcomp)'
|
62
|
+
end
|
63
|
+
|
64
|
+
|
65
|
+
until argv.empty?
|
66
|
+
case argv.shift
|
67
|
+
when '-p'
|
68
|
+
pvalue = argv.shift.to_f
|
69
|
+
when '-d'
|
70
|
+
discretization = argv.shift.to_f
|
71
|
+
when '--max-hash-size'
|
72
|
+
max_hash_size = argv.shift.to_i
|
73
|
+
when '--max-2d-hash-size'
|
74
|
+
max_pair_hash_size = argv.shift.to_i
|
75
|
+
when '-b'
|
76
|
+
second_background = first_background = Bioinform::Background.from_string(argv.shift)
|
77
|
+
when '-b1'
|
78
|
+
first_background = Bioinform::Background.from_string(argv.shift)
|
79
|
+
when '-b2'
|
80
|
+
second_background = Bioinform::Background.from_string(argv.shift)
|
81
|
+
when '--boundary'
|
82
|
+
pvalue_boundary = argv.shift.to_sym
|
83
|
+
raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
|
84
|
+
when '--first-threshold'
|
85
|
+
predefined_threshold_first = argv.shift.to_f
|
86
|
+
when '--second-threshold'
|
87
|
+
predefined_threshold_second = argv.shift.to_f
|
88
|
+
end
|
89
|
+
end
|
90
|
+
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background.symmetric?
|
91
|
+
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless second_background.symmetric?
|
92
|
+
|
93
|
+
raise "Error! File #{first_file} don't exist" unless File.exist?(first_file)
|
94
|
+
input_first = File.read(first_file)
|
95
|
+
input_first = Bioinform::MatrixParser.new.parse!(input_first)
|
96
|
+
|
97
|
+
raise "Error! File #{second_file} don't exist" unless File.exist?(second_file)
|
98
|
+
input_second = File.read(second_file)
|
99
|
+
input_second = Bioinform::MatrixParser.new.parse!(input_second)
|
100
|
+
|
101
|
+
case data_model
|
102
|
+
when :pcm
|
103
|
+
pcm_first = Bioinform::MotifModel::PCM.new(input_first[:matrix]).named(input_first[:name])
|
104
|
+
pwm_first = Bioinform::ConversionAlgorithms::PCM2PWMConverter.new(pseudocount: :log, background: first_background).convert(pcm_first)
|
105
|
+
pcm_second = Bioinform::MotifModel::PCM.new(input_second[:matrix]).named(input_second[:name])
|
106
|
+
pwm_second = Bioinform::ConversionAlgorithms::PCM2PWMConverter.new(pseudocount: :log, background: second_background).convert(pcm_second)
|
107
|
+
when :pwm
|
108
|
+
pwm_first = Bioinform::MotifModel::PWM.new(input_first[:matrix]).named(input_first[:name])
|
109
|
+
pwm_second = Bioinform::MotifModel::PWM.new(input_second[:matrix]).named(input_second[:name])
|
110
|
+
end
|
111
|
+
|
112
|
+
pwm_first = pwm_first.discreted(discretization)
|
113
|
+
pwm_second = pwm_second.discreted(discretization)
|
114
|
+
|
115
|
+
counting_first = PWMCounting.new(pwm_first, background: first_background, max_hash_size: max_hash_size)
|
116
|
+
counting_second = PWMCounting.new(pwm_second, background: second_background, max_hash_size: max_hash_size)
|
117
|
+
|
118
|
+
cmp = Macroape::PWMCompareAligned.new(counting_first, counting_second, shift, orientation).tap{|x| x.max_pair_hash_size = max_pair_hash_size }
|
119
|
+
|
120
|
+
if predefined_threshold_first
|
121
|
+
threshold_first = predefined_threshold_first * discretization
|
122
|
+
else
|
123
|
+
if pvalue_boundary == :lower
|
124
|
+
threshold_first = counting_first.threshold(pvalue)
|
125
|
+
else
|
126
|
+
threshold_first = counting_first.weak_threshold(pvalue)
|
127
|
+
end
|
128
|
+
end
|
129
|
+
|
130
|
+
if predefined_threshold_second
|
131
|
+
threshold_second = predefined_threshold_second * discretization
|
132
|
+
else
|
133
|
+
if pvalue_boundary == :lower
|
134
|
+
threshold_second = counting_second.threshold(pvalue)
|
135
|
+
else
|
136
|
+
threshold_second = counting_second.weak_threshold(pvalue)
|
137
|
+
end
|
138
|
+
end
|
139
|
+
info = cmp.alignment_infos.merge( cmp.jaccard(threshold_first, threshold_second) )
|
140
|
+
info.merge!(predefined_threshold_first: predefined_threshold_first,
|
141
|
+
predefined_threshold_second: predefined_threshold_second,
|
142
|
+
threshold_first: threshold_first / discretization,
|
143
|
+
threshold_second: threshold_second / discretization,
|
144
|
+
discretization: discretization,
|
145
|
+
first_background: first_background,
|
146
|
+
second_background: second_background,
|
147
|
+
requested_pvalue: pvalue,
|
148
|
+
pvalue_boundary: pvalue_boundary)
|
149
|
+
puts Helper.similarity_info_string(info)
|
150
|
+
|
151
|
+
rescue => err
|
152
|
+
$stderr.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
|
153
|
+
end
|
154
|
+
|
155
|
+
end
|
156
|
+
end
|
157
|
+
end
|