macroape 3.3.7 → 3.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +2 -2
- data/Rakefile.rb +6 -6
- data/TODO.txt +23 -3
- data/benchmark/similarity_benchmark.rb +18 -18
- data/lib/macroape/aligned_pair_intersection.rb +4 -4
- data/lib/macroape/cli/align_motifs.rb +34 -28
- data/lib/macroape/cli/eval_alignment.rb +73 -47
- data/lib/macroape/cli/eval_similarity.rb +65 -40
- data/lib/macroape/cli/find_pvalue.rb +30 -34
- data/lib/macroape/cli/find_threshold.rb +52 -41
- data/lib/macroape/cli/preprocess_collection.rb +68 -58
- data/lib/macroape/cli/scan_collection.rb +89 -73
- data/lib/macroape/cli.rb +184 -1
- data/lib/macroape/counting.rb +31 -5
- data/lib/macroape/pwm_compare.rb +8 -2
- data/lib/macroape/pwm_compare_aligned.rb +15 -10
- data/lib/macroape/version.rb +2 -1
- data/macroape.gemspec +2 -1
- data/spec/count_distribution_spec.rb +11 -11
- data/test/align_motifs_test.rb +16 -4
- data/test/data/{AHR_si.pat → AHR_si.pwm} +0 -0
- data/test/data/{KLF3_f1.pat → KLF3_f1.pwm} +0 -0
- data/test/data/{KLF4_f2.pat → KLF4_f2.pwm} +0 -0
- data/test/data/KLF4_f2_scan_results_all.txt +1 -2
- data/test/data/KLF4_f2_scan_results_default_cutoff.txt +1 -2
- data/test/data/KLF4_f2_scan_results_precise_mode.txt +1 -2
- data/test/data/KLF4_f2_scan_results_weak_threshold.txt +2 -0
- data/test/data/{SP1_f1.pat → SP1_f1.pwm} +0 -0
- data/test/data/{SP1_f1_revcomp.pat → SP1_f1_revcomp.pwm} +0 -0
- data/test/data/collection_pcm_without_thresholds.yaml +186 -183
- data/test/data/collection_without_thresholds.yaml +186 -183
- data/test/data/{medium_motif.pat → medium_motif.pwm} +0 -0
- data/test/data/{short_motif.pat → short_motif.pwm} +0 -0
- data/test/data/test_collection/{GABPA_f1.pat → GABPA_f1.pwm} +0 -0
- data/test/data/test_collection/{KLF4_f2.pat → KLF4_f2.pwm} +0 -0
- data/test/data/test_collection/{SP1_f1.pat → SP1_f1.pwm} +0 -0
- data/test/data/test_collection.yaml +179 -176
- data/test/data/test_collection_weak.yaml +214 -0
- data/test/eval_alignment_test.rb +97 -21
- data/test/eval_similarity_test.rb +104 -26
- data/test/find_pvalue_test.rb +22 -9
- data/test/find_threshold_test.rb +76 -25
- data/test/preprocess_collection_test.rb +16 -21
- data/test/scan_collection_test.rb +26 -14
- data/test/test_helper.rb +96 -12
- metadata +44 -24
@@ -4,64 +4,66 @@ require 'yaml'
|
|
4
4
|
module Macroape
|
5
5
|
module CLI
|
6
6
|
module ScanCollection
|
7
|
-
|
8
7
|
def self.main(argv)
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
}
|
33
|
-
|
34
|
-
if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
|
35
|
-
STDERR.puts help_string
|
8
|
+
doc = <<-EOS.strip_doc
|
9
|
+
Command-line format:
|
10
|
+
#{run_tool_cmd} <pat-file> <collection> [options]
|
11
|
+
|
12
|
+
Options:
|
13
|
+
[-p <P-value>]
|
14
|
+
[-c <similarity cutoff>] minimal similarity to be included in output, '-c 0.05' by default, [--all] to print all results
|
15
|
+
[--precise [<level>]] minimal similarity to check on the second pass in precise mode, off by default, '--precise 0.01' if level is not set
|
16
|
+
[--silent] - hide current progress information during scan (printed to stderr by default)
|
17
|
+
[--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
|
18
|
+
[--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
|
19
|
+
[-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
|
20
|
+
|
21
|
+
Output format:
|
22
|
+
<name> <jaccard index> <shift> <overlap> <orientation> ['*' in case that result was calculated on the second pass (in precise mode), '.' otherwise]
|
23
|
+
Attention! Name can contain whitespace characters.
|
24
|
+
Attention! The shift and orientation are reported for the collection matrix relative to the query matrix.
|
25
|
+
|
26
|
+
Example:
|
27
|
+
#{run_tool_cmd} motifs/KLF4_f2.pat hocomoco_ad_uniform.yaml
|
28
|
+
#{run_tool_cmd} motifs/KLF4_f2.pat hocomoco_ad_uniform.yaml -p 0.0005 --precise 0.03
|
29
|
+
EOS
|
30
|
+
|
31
|
+
if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
|
32
|
+
STDERR.puts doc
|
36
33
|
exit
|
37
34
|
end
|
38
35
|
|
39
36
|
data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
|
40
37
|
filename = argv.shift
|
41
38
|
collection_file = argv.shift
|
42
|
-
raise
|
43
|
-
raise
|
39
|
+
raise 'No input. You should specify input file with matrix' unless filename
|
40
|
+
raise 'No input. You should specify input file with collection' unless collection_file
|
44
41
|
raise "Collection file #{collection_file} doesn't exist" unless File.exist?(collection_file)
|
45
42
|
|
46
43
|
pvalue = 0.0005
|
47
44
|
cutoff = 0.05 # minimal similarity to output
|
48
45
|
collection = YAML.load_file(collection_file)
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
46
|
+
collection_background = collection.parameters.background
|
47
|
+
query_background = collection_background
|
48
|
+
|
49
|
+
rough_discretization = collection.parameters.rough_discretization
|
50
|
+
precise_discretization = collection.parameters.precise_discretization
|
51
|
+
max_hash_size = 10000000
|
52
|
+
max_pair_hash_size = 10000
|
53
|
+
pvalue_boundary = :upper
|
54
|
+
|
53
55
|
silent = false
|
54
56
|
precision_mode = :rough
|
55
57
|
until argv.empty?
|
56
58
|
case argv.shift
|
57
|
-
when '-
|
58
|
-
|
59
|
-
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless
|
59
|
+
when '-b'
|
60
|
+
query_background = argv.shift.split(',').map(&:to_f)
|
61
|
+
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless query_background == query_background.reverse
|
60
62
|
when '-p'
|
61
63
|
pvalue = argv.shift.to_f
|
62
|
-
when '-
|
63
|
-
max_hash_size = argv.shift.to_i
|
64
|
-
when '-
|
64
|
+
when '--max-hash-size'
|
65
|
+
max_hash_size = argv.shift.to_i
|
66
|
+
when '--max-2d-hash-size'
|
65
67
|
max_pair_hash_size = argv.shift.to_i
|
66
68
|
when '-c'
|
67
69
|
cutoff = argv.shift.to_f
|
@@ -69,6 +71,9 @@ module Macroape
|
|
69
71
|
cutoff = 0.0
|
70
72
|
when '--silent'
|
71
73
|
silent = true
|
74
|
+
when '--boundary'
|
75
|
+
pvalue_boundary = argv.shift.to_sym
|
76
|
+
raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
|
72
77
|
when '--precise'
|
73
78
|
precision_mode = :precise
|
74
79
|
begin
|
@@ -81,7 +86,7 @@ module Macroape
|
|
81
86
|
end
|
82
87
|
|
83
88
|
raise "Thresholds for pvalue #{pvalue} aren't presented in collection (#{collection.parameters.pvalues.join(', ')}). Use one of listed pvalues or recalculate the collection with needed pvalue" unless collection.parameters.pvalues.include? pvalue
|
84
|
-
|
89
|
+
|
85
90
|
if filename == '.stdin'
|
86
91
|
query_input = $stdin.read
|
87
92
|
else
|
@@ -90,58 +95,69 @@ module Macroape
|
|
90
95
|
end
|
91
96
|
|
92
97
|
query_pwm = data_model.new(query_input).to_pwm
|
93
|
-
query_pwm.set_parameters(background:
|
94
|
-
|
95
|
-
query_pwm_rough = query_pwm.discrete(
|
96
|
-
query_pwm_precise = query_pwm.discrete(
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
98
|
+
query_pwm.set_parameters(background: query_background, max_hash_size: max_hash_size)
|
99
|
+
|
100
|
+
query_pwm_rough = query_pwm.discrete(rough_discretization)
|
101
|
+
query_pwm_precise = query_pwm.discrete(precise_discretization)
|
102
|
+
|
103
|
+
if pvalue_boundary == :lower
|
104
|
+
query_threshold_rough, query_rough_real_pvalue = query_pwm_rough.threshold_and_real_pvalue(pvalue)
|
105
|
+
query_threshold_precise, query_precise_real_pvalue = query_pwm_precise.threshold_and_real_pvalue(pvalue)
|
106
|
+
else
|
107
|
+
query_threshold_rough, query_rough_real_pvalue = query_pwm_rough.weak_threshold_and_real_pvalue(pvalue)
|
108
|
+
query_threshold_precise, query_precise_real_pvalue = query_pwm_precise.weak_threshold_and_real_pvalue(pvalue)
|
109
|
+
end
|
110
|
+
|
101
111
|
if query_precise_real_pvalue == 0
|
102
|
-
$stderr.puts "Query motif #{query_pwm.name} gives 0 recognized words for a given P-value of #{pvalue} with the precise discretization level of #{
|
112
|
+
$stderr.puts "Query motif #{query_pwm.name} gives 0 recognized words for a given P-value of #{pvalue} with the precise discretization level of #{precise_discretization}. It's impossible to scan collection for this motif"
|
103
113
|
return
|
104
114
|
end
|
105
|
-
|
115
|
+
|
106
116
|
if query_rough_real_pvalue == 0
|
107
117
|
query_pwm_rough, query_threshold_rough = query_pwm_precise, query_threshold_precise
|
108
|
-
$stderr.puts "Query motif #{query_pwm.name} gives 0 recognized words for a given P-value of #{pvalue} with the rough discretization level of #{
|
118
|
+
$stderr.puts "Query motif #{query_pwm.name} gives 0 recognized words for a given P-value of #{pvalue} with the rough discretization level of #{rough_discretization}. Forcing precise discretization level of #{precise_discretization}"
|
109
119
|
end
|
110
120
|
|
111
121
|
similarities = {}
|
112
122
|
precision_file_mode = {}
|
113
123
|
|
114
|
-
collection.
|
115
|
-
name =
|
116
|
-
STDERR.puts name unless silent
|
117
|
-
|
118
|
-
if
|
119
|
-
collection_pwm_rough =
|
120
|
-
collection_threshold_rough =
|
124
|
+
collection.each_with_index do |motif, index|
|
125
|
+
name = motif.name
|
126
|
+
STDERR.puts "Testing motif #{name} (#{index+1} of #{collection.size}, #{index*100/collection.size}% complete)" unless silent
|
127
|
+
motif.set_parameters(background: collection_background, max_hash_size: max_hash_size)
|
128
|
+
if motif.rough[pvalue]
|
129
|
+
collection_pwm_rough = motif.pwm.discrete(rough_discretization)
|
130
|
+
collection_threshold_rough = motif.rough[pvalue] * rough_discretization
|
121
131
|
info = Macroape::PWMCompare.new(query_pwm_rough, collection_pwm_rough).set_parameters(max_pair_hash_size: max_pair_hash_size).jaccard(query_threshold_rough, collection_threshold_rough)
|
122
|
-
|
132
|
+
info[:precision_mode] = :rough
|
123
133
|
end
|
124
|
-
if !
|
125
|
-
collection_pwm_precise =
|
126
|
-
collection_threshold_precise =
|
134
|
+
if !motif.rough[pvalue] || (precision_mode == :precise) && (info[:similarity] >= minimal_similarity)
|
135
|
+
collection_pwm_precise = motif.pwm.discrete(precise_discretization)
|
136
|
+
collection_threshold_precise = motif.precise[pvalue] * precise_discretization
|
127
137
|
info = Macroape::PWMCompare.new(query_pwm_precise, collection_pwm_precise).set_parameters(max_pair_hash_size: max_pair_hash_size).jaccard(query_threshold_precise, collection_threshold_precise)
|
128
|
-
|
138
|
+
info[:precision_mode] = :precise
|
129
139
|
end
|
140
|
+
info[:name] = name
|
130
141
|
similarities[name] = info
|
131
142
|
end
|
132
143
|
|
133
|
-
puts "
|
134
|
-
similarities.sort_by do |name, info|
|
135
|
-
info[:similarity]
|
136
|
-
end.reverse.each do |name, info|
|
137
|
-
precision_text = (precision_file_mode[name] == :precise) ? "\t*" : ""
|
138
|
-
puts "#{name}\t#{info[:similarity]}\t#{info[:shift]}\t#{info[:overlap]}\t#{info[:orientation]}#{precision_text}" if info[:similarity] >= cutoff
|
139
|
-
end
|
144
|
+
STDERR.puts "100% complete" unless silent
|
140
145
|
|
146
|
+
similarities_to_output = similarities.sort_by{|name, info| info[:similarity] }.reverse.select{|name,info| info[:similarity] >= cutoff }.map{|name,info|info}
|
147
|
+
puts Helper.scan_collection_infos_string( similarities_to_output,
|
148
|
+
{cutoff: cutoff,
|
149
|
+
precision_mode: precision_mode,
|
150
|
+
rough_discretization: rough_discretization,
|
151
|
+
precise_discretization: precise_discretization,
|
152
|
+
minimal_similarity: minimal_similarity,
|
153
|
+
pvalue: pvalue,
|
154
|
+
pvalue_boundary: pvalue_boundary,
|
155
|
+
collection_background: collection_background,
|
156
|
+
query_background: query_background} )
|
141
157
|
rescue => err
|
142
|
-
STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse
|
158
|
+
STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
|
143
159
|
end
|
144
|
-
|
160
|
+
|
145
161
|
end
|
146
162
|
end
|
147
163
|
end
|
data/lib/macroape/cli.rb
CHANGED
@@ -1,5 +1,188 @@
|
|
1
|
+
require 'bioinform/support/strip_doc'
|
2
|
+
|
3
|
+
class String
|
4
|
+
def snake_case
|
5
|
+
gsub(/[A-Z]+/){|big| "_#{big.downcase}" }.sub(/^_/,'')
|
6
|
+
end
|
7
|
+
end
|
8
|
+
|
9
|
+
class Module
|
10
|
+
def run_tool_cmd
|
11
|
+
if Macroape::STANDALONE
|
12
|
+
"ruby #{tool_name}.rb"
|
13
|
+
else
|
14
|
+
tool_name
|
15
|
+
end
|
16
|
+
end
|
17
|
+
def tool_name
|
18
|
+
self.name.split('::').last.snake_case
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
1
22
|
module Macroape
|
2
23
|
module CLI
|
3
|
-
|
24
|
+
class OutputInformation
|
25
|
+
def initialize(data = nil)
|
26
|
+
@table_parameter_descriptions = []
|
27
|
+
|
28
|
+
@parameter_descriptions = []
|
29
|
+
@parameter_value_infos = []
|
30
|
+
|
31
|
+
@resulting_value_descriptions = []
|
32
|
+
@resulting_value_infos = []
|
33
|
+
|
34
|
+
@table_headers = []
|
35
|
+
@table_rows = []
|
36
|
+
@table_rows_callbacks = []
|
37
|
+
@data = data
|
38
|
+
yield self if block_given?
|
39
|
+
end
|
40
|
+
|
41
|
+
def parameters_info
|
42
|
+
[*@parameter_descriptions, *@parameter_value_infos]
|
43
|
+
end
|
44
|
+
def resulting_values_info
|
45
|
+
[*@resulting_value_descriptions, *@resulting_value_infos]
|
46
|
+
end
|
47
|
+
def result
|
48
|
+
[parameters_info, resulting_values_info, resulting_table].reject(&:empty?).map{|b|b.join("\n")}.join("\n#\n")
|
49
|
+
#[*parameters_info, '#', *resulting_values_info, '#', *resulting_table].join("\n")
|
50
|
+
end
|
51
|
+
|
52
|
+
def add_parameter(param_name, description, value, &block)
|
53
|
+
@parameter_descriptions << parameter_description_string(param_name, description)
|
54
|
+
@parameter_value_infos << "# #{param_name} = #{value}"
|
55
|
+
end
|
56
|
+
|
57
|
+
def add_resulting_value(param_name, description, value, &block)
|
58
|
+
@resulting_value_descriptions << parameter_description_string(param_name, description)
|
59
|
+
@resulting_value_infos << "#{param_name}\t#{value}"
|
60
|
+
end
|
61
|
+
|
62
|
+
def add_table_parameter(param_name, description, key_in_hash, &block)
|
63
|
+
@table_parameter_descriptions << parameter_description_string(param_name, description)
|
64
|
+
add_table_parameter_without_description(param_name, key_in_hash, &block)
|
65
|
+
end
|
66
|
+
|
67
|
+
def add_table_parameter_without_description(param_name, key_in_hash, &block)
|
68
|
+
@table_headers << param_name
|
69
|
+
@table_rows << key_in_hash
|
70
|
+
@table_rows_callbacks << block
|
71
|
+
end
|
72
|
+
|
73
|
+
def parameter_description_string(param_name, description)
|
74
|
+
"# #{param_name}: #{description}"
|
75
|
+
end
|
76
|
+
|
77
|
+
def table_content
|
78
|
+
@data.map{|info|
|
79
|
+
@table_rows.zip(@table_rows_callbacks).map{|row,callback| callback ? callback.call(info[row]) : info[row] }.join("\t")
|
80
|
+
}
|
81
|
+
end
|
82
|
+
|
83
|
+
def header_content
|
84
|
+
'# ' + @table_headers.join("\t")
|
85
|
+
end
|
86
|
+
|
87
|
+
def resulting_table
|
88
|
+
@data ? [*@table_parameter_descriptions, header_content, *table_content] : []
|
89
|
+
end
|
90
|
+
|
91
|
+
# the background parameter is printed only if it is not the wordwise background [1,1,1,1]
|
92
|
+
def background_parameter(param_name, description, value, &block)
|
93
|
+
add_parameter(param_name, description, value.join(','), &block) unless value == [1,1,1,1]
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
module Helper
|
98
|
+
|
99
|
+
def self.similarity_info_string(info)
|
100
|
+
OutputInformation.new { |infos|
|
101
|
+
infos.add_parameter('V', 'discretization', info[:discretization] )
|
102
|
+
infos.add_parameter('P', 'requested P-value', info[:requested_pvalue]) unless info[:predefined_threshold_first] && info[:predefined_threshold_second]
|
103
|
+
infos.add_parameter('T1', 'threshold for the 1st matrix', info[:predefined_threshold_first] ) if info[:predefined_threshold_first]
|
104
|
+
infos.add_parameter('T2', 'threshold for the 2nd matrix', info[:predefined_threshold_second] ) if info[:predefined_threshold_second]
|
105
|
+
infos.add_parameter('PB', 'P-value boundary', info[:pvalue_boundary])
|
106
|
+
if info[:first_background] == info[:second_background]
|
107
|
+
infos.background_parameter('B', 'background', info[:first_background])
|
108
|
+
else
|
109
|
+
infos.background_parameter('B1', 'background for the 1st model', info[:first_background])
|
110
|
+
infos.background_parameter('B2', 'background for the 2nd model', info[:second_background])
|
111
|
+
end
|
112
|
+
|
113
|
+
infos.add_resulting_value('S', 'similarity', info[:similarity])
|
114
|
+
infos.add_resulting_value('D', 'distance (1-similarity)', info[:tanimoto])
|
115
|
+
infos.add_resulting_value('L', 'length of the alignment', info[:alignment_length])
|
116
|
+
infos.add_resulting_value('SH', 'shift of the 2nd PWM relative to the 1st', info[:shift])
|
117
|
+
infos.add_resulting_value('OR', 'orientation of the 2nd PWM relative to the 1st', info[:orientation])
|
118
|
+
infos.add_resulting_value('A1', 'aligned 1st matrix', info[:text].lines.to_a.first.strip )
|
119
|
+
infos.add_resulting_value('A2', 'aligned 2nd matrix', info[:text].lines.to_a.last.strip )
|
120
|
+
infos.add_resulting_value('W', 'number of words recognized by both models (model = PWM + threshold)', info[:recognized_by_both] )
|
121
|
+
infos.add_resulting_value('W1', 'number of words and recognized by the first model', info[:recognized_by_first] )
|
122
|
+
infos.add_resulting_value('P1', 'P-value for the 1st matrix', info[:real_pvalue_first] )
|
123
|
+
infos.add_resulting_value('T1', 'threshold for the 1st matrix', info[:threshold_first] ) unless info[:predefined_threshold_first]
|
124
|
+
infos.add_resulting_value('W2', 'number of words recognized by the 2nd model', info[:recognized_by_second] )
|
125
|
+
infos.add_resulting_value('P2', 'P-value for the 2nd matrix', info[:real_pvalue_second] )
|
126
|
+
infos.add_resulting_value('T2', 'threshold for the 2nd matrix', info[:threshold_second] ) unless info[:predefined_threshold_second]
|
127
|
+
}.result
|
128
|
+
end
|
129
|
+
|
130
|
+
############################################
|
131
|
+
|
132
|
+
def self.threshold_infos_string(data, parameters)
|
133
|
+
OutputInformation.new(data) { |infos|
|
134
|
+
infos.add_parameter('V', 'discretization value', parameters[:discretization])
|
135
|
+
infos.add_parameter('PB', 'P-value boundary', parameters[:pvalue_boundary])
|
136
|
+
infos.background_parameter('B', 'background', parameters[:background])
|
137
|
+
|
138
|
+
infos.add_table_parameter('P', 'requested P-value', :expected_pvalue)
|
139
|
+
infos.add_table_parameter('AP', 'actual P-value', :real_pvalue)
|
140
|
+
infos.add_table_parameter('W', 'number of recognized words', :recognized_words) if parameters[:background] == [1, 1, 1, 1]
|
141
|
+
infos.add_table_parameter('T', 'threshold', :threshold)
|
142
|
+
}.result
|
143
|
+
end
|
144
|
+
|
145
|
+
############################################
|
146
|
+
|
147
|
+
def self.scan_collection_infos_string(data, parameters)
|
148
|
+
OutputInformation.new(data) { |infos|
|
149
|
+
infos.add_parameter('MS', 'minimal similarity to output', parameters[:cutoff])
|
150
|
+
infos.add_parameter('P', 'P-value', parameters[:pvalue])
|
151
|
+
infos.add_parameter('PB', 'P-value boundary', parameters[:pvalue_boundary])
|
152
|
+
if parameters[:precision_mode] == :precise
|
153
|
+
infos.add_parameter('VR', 'discretization value, rough', parameters[:rough_discretization])
|
154
|
+
infos.add_parameter('VP', 'discretization value, precise', parameters[:precise_discretization])
|
155
|
+
infos.add_parameter('MP', 'minimal similarity for the 2nd pass in \'precise\' mode', parameters[:minimal_similarity])
|
156
|
+
else
|
157
|
+
infos.add_parameter('V', 'discretization value', parameters[:rough_discretization])
|
158
|
+
end
|
159
|
+
infos.background_parameter('BQ', 'background for query matrix', parameters[:query_background])
|
160
|
+
infos.background_parameter('BC', 'background for collection', parameters[:collection_background])
|
161
|
+
|
162
|
+
infos.add_table_parameter_without_description('motif', :name)
|
163
|
+
infos.add_table_parameter_without_description('similarity', :similarity)
|
164
|
+
infos.add_table_parameter_without_description('shift', :shift)
|
165
|
+
infos.add_table_parameter_without_description('overlap', :overlap)
|
166
|
+
infos.add_table_parameter_without_description('orientation', :orientation)
|
167
|
+
if parameters[:precision_mode] == :precise
|
168
|
+
infos.add_table_parameter_without_description('precise mode', :precision_mode){|precision| precision == :precise ? '*' : '.' }
|
169
|
+
end
|
170
|
+
}.result
|
171
|
+
end
|
172
|
+
|
173
|
+
############################################
|
174
|
+
|
175
|
+
def self.find_pvalue_info_string(data, parameters)
|
176
|
+
OutputInformation.new(data) {|infos|
|
177
|
+
infos.add_parameter('V', 'discretization value', parameters[:discretization])
|
178
|
+
infos.background_parameter('B', 'background', parameters[:background])
|
179
|
+
|
180
|
+
infos.add_table_parameter('T', 'threshold', :threshold)
|
181
|
+
infos.add_table_parameter('W', 'number of recognized words', :number_of_recognized_words) if parameters[:background] == [1,1,1,1]
|
182
|
+
infos.add_table_parameter('P', 'P-value', :pvalue)
|
183
|
+
}.result
|
184
|
+
end
|
185
|
+
|
186
|
+
end
|
4
187
|
end
|
5
188
|
end
|
data/lib/macroape/counting.rb
CHANGED
@@ -4,13 +4,19 @@ module Bioinform
|
|
4
4
|
class PWM
|
5
5
|
# Sets or gets the size limit of the calculation hash. It is a defence against overuse of CPU resources by inappropriate data
|
6
6
|
make_parameters :max_hash_size
|
7
|
-
|
7
|
+
|
8
8
|
def threshold(pvalue)
|
9
9
|
thresholds(pvalue){|_, thresh, _| return thresh }
|
10
10
|
end
|
11
11
|
def threshold_and_real_pvalue(pvalue)
|
12
12
|
thresholds(pvalue){|_, thresh, real_pv| return thresh, real_pv }
|
13
13
|
end
|
14
|
+
def weak_threshold(pvalue)
|
15
|
+
weak_thresholds(pvalue){|_, thresh, _| return thresh }
|
16
|
+
end
|
17
|
+
def weak_threshold_and_real_pvalue(pvalue)
|
18
|
+
weak_thresholds(pvalue){|_, thresh, real_pv| return thresh, real_pv }
|
19
|
+
end
|
14
20
|
|
15
21
|
def thresholds(*pvalues)
|
16
22
|
thresholds_by_pvalues(*pvalues).each do |pvalue,(thresholds, counts)|
|
@@ -20,11 +26,26 @@ module Bioinform
|
|
20
26
|
end
|
21
27
|
end
|
22
28
|
|
29
|
+
# "weak" means that the threshold's real P-value is not less than the given P-value, whereas the usual threshold's real P-value is not greater than it
|
30
|
+
def weak_thresholds(*pvalues)
|
31
|
+
thresholds_by_pvalues(*pvalues).each do |pvalue,(thresholds, counts)|
|
32
|
+
threshold = thresholds.begin.to_f
|
33
|
+
real_pvalue = counts.begin.to_f / vocabulary_volume
|
34
|
+
yield pvalue, threshold, real_pvalue
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
|
23
39
|
def count_distribution_under_pvalue(max_pvalue)
|
24
40
|
cnt_distribution = {}
|
25
41
|
look_for_count = max_pvalue * vocabulary_volume
|
26
42
|
until cnt_distribution.inject(0.0){|sum,(score,count)| sum + count} >= look_for_count
|
27
|
-
|
43
|
+
begin
|
44
|
+
approximate_threshold = threshold_gauss_estimation(max_pvalue)
|
45
|
+
rescue
|
46
|
+
approximate_threshold = worst_score
|
47
|
+
end
|
48
|
+
cnt_distribution = count_distribution_after_threshold(approximate_threshold)
|
28
49
|
max_pvalue *=2 # if estimation counted too small amount of words - try to lower threshold estimation by doubling pvalue
|
29
50
|
end
|
30
51
|
|
@@ -83,13 +104,18 @@ module Bioinform
|
|
83
104
|
|
84
105
|
def counts_by_thresholds(*thresholds)
|
85
106
|
scores = count_distribution_after_threshold(thresholds.min)
|
86
|
-
thresholds.
|
87
|
-
scores.inject(0.0){|sum,(score,count)| (score >= threshold) ? sum + count : sum}
|
107
|
+
thresholds.inject({}){ |hsh, threshold|
|
108
|
+
hsh[threshold] = scores.inject(0.0){|sum,(score,count)| (score >= threshold) ? sum + count : sum}
|
109
|
+
hsh
|
88
110
|
}
|
89
111
|
end
|
90
112
|
|
113
|
+
def count_by_threshold(threshold)
|
114
|
+
counts_by_thresholds(threshold)[threshold]
|
115
|
+
end
|
116
|
+
|
91
117
|
def pvalue_by_threshold(threshold)
|
92
|
-
|
118
|
+
count_by_threshold(threshold) / vocabulary_volume
|
93
119
|
end
|
94
120
|
end
|
95
121
|
end
|
data/lib/macroape/pwm_compare.rb
CHANGED
@@ -2,7 +2,7 @@ require 'bioinform/support/parameters'
|
|
2
2
|
|
3
3
|
module Macroape
|
4
4
|
class PWMCompare
|
5
|
-
include Parameters
|
5
|
+
include Bioinform::Parameters
|
6
6
|
# Sets or gets the limit on the summary size of the calculation hash. It is a defence against overuse of CPU resources by inappropriate data
|
7
7
|
make_parameters :max_pair_hash_size
|
8
8
|
|
@@ -18,13 +18,19 @@ module Macroape
|
|
18
18
|
alignment.alignment_infos.merge( alignment.jaccard(threshold_first, threshold_second) )
|
19
19
|
end.max_by {|alignment_infos| alignment_infos[:similarity] }
|
20
20
|
end
|
21
|
-
|
21
|
+
|
22
22
|
def jaccard_by_pvalue(pvalue)
|
23
23
|
threshold_first = first.threshold(pvalue)
|
24
24
|
threshold_second = second.threshold(pvalue)
|
25
25
|
jaccard(threshold_first, threshold_second)
|
26
26
|
end
|
27
27
|
|
28
|
+
def jaccard_by_weak_pvalue(pvalue)
|
29
|
+
threshold_first = first.weak_threshold(pvalue)
|
30
|
+
threshold_second = second.weak_threshold(pvalue)
|
31
|
+
jaccard(threshold_first, threshold_second)
|
32
|
+
end
|
33
|
+
|
28
34
|
def each_alignment
|
29
35
|
(-second.length..first.length).to_a.product([:direct,:revcomp]) do |shift, orientation|
|
30
36
|
yield PWMCompareAligned.new(first, second, shift, orientation).set_parameters(max_pair_hash_size: max_pair_hash_size)
|
@@ -1,14 +1,14 @@
|
|
1
1
|
require 'bioinform/support/parameters'
|
2
|
-
require_relative '
|
2
|
+
require_relative 'aligned_pair_intersection'
|
3
3
|
|
4
4
|
module Macroape
|
5
5
|
class PWMCompareAligned
|
6
|
-
include Parameters
|
6
|
+
include Bioinform::Parameters
|
7
7
|
# Sets or gets the limit on the summary size of the calculation hash. It is a defence against overuse of CPU resources by inappropriate data
|
8
8
|
make_parameters :max_pair_hash_size
|
9
9
|
|
10
10
|
attr_reader :first, :second, :length, :shift, :orientation, :first_length, :second_length, :parameters
|
11
|
-
|
11
|
+
|
12
12
|
def initialize(first_unaligned, second_unaligned, shift, orientation)
|
13
13
|
@parameters = OpenStruct.new
|
14
14
|
@shift, @orientation = shift, orientation
|
@@ -18,7 +18,7 @@ module Macroape
|
|
18
18
|
|
19
19
|
first, second = first_unaligned, second_unaligned
|
20
20
|
second = second.reverse_complement if revcomp?
|
21
|
-
|
21
|
+
|
22
22
|
if shift > 0
|
23
23
|
second = second.left_augment(shift)
|
24
24
|
else
|
@@ -28,8 +28,6 @@ module Macroape
|
|
28
28
|
@first = first.right_augment(@length - first.length)
|
29
29
|
@second = second.right_augment(@length - second.length)
|
30
30
|
end
|
31
|
-
|
32
|
-
|
33
31
|
|
34
32
|
def direct?
|
35
33
|
orientation == :direct
|
@@ -90,8 +88,8 @@ module Macroape
|
|
90
88
|
end
|
91
89
|
|
92
90
|
def jaccard(first_threshold, second_threshold)
|
93
|
-
f = first.
|
94
|
-
s = second.
|
91
|
+
f = first.count_by_threshold(first_threshold)
|
92
|
+
s = second.count_by_threshold(second_threshold)
|
95
93
|
if f == 0 || s == 0
|
96
94
|
return {similarity: -1, tanimoto: -1, recognized_by_both: 0,
|
97
95
|
recognized_by_first: f,
|
@@ -104,15 +102,22 @@ module Macroape
|
|
104
102
|
union = f + s - intersect
|
105
103
|
similarity = intersect.to_f / union
|
106
104
|
{ similarity: similarity, tanimoto: 1.0 - similarity, recognized_by_both: intersect,
|
107
|
-
recognized_by_first: f, recognized_by_second: s
|
105
|
+
recognized_by_first: f, recognized_by_second: s,
|
106
|
+
real_pvalue_first: f / first.vocabulary_volume, real_pvalue_second: s / second.vocabulary_volume }
|
108
107
|
end
|
109
|
-
|
108
|
+
|
110
109
|
def jaccard_by_pvalue(pvalue)
|
111
110
|
threshold_first = first.threshold(pvalue)
|
112
111
|
threshold_second = second.threshold(pvalue)
|
113
112
|
jaccard(threshold_first, threshold_second)
|
114
113
|
end
|
115
114
|
|
115
|
+
def jaccard_by_weak_pvalue(pvalue)
|
116
|
+
threshold_first = first.weak_threshold(pvalue)
|
117
|
+
threshold_second = second.weak_threshold(pvalue)
|
118
|
+
jaccard(threshold_first, threshold_second)
|
119
|
+
end
|
120
|
+
|
116
121
|
def self.calculate_alignment_length(first_len, second_len, shift)
|
117
122
|
if shift > 0
|
118
123
|
[first_len, second_len + shift].max
|
data/lib/macroape/version.rb
CHANGED
data/macroape.gemspec
CHANGED