macroape 4.0.2 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +17 -17
- data/Gemfile +4 -4
- data/LICENSE +22 -22
- data/README.md +70 -70
- data/Rakefile.rb +49 -49
- data/TODO.txt +46 -46
- data/benchmark/benchmark_helper.rb +4 -4
- data/benchmark/similarity_benchmark.rb +52 -52
- data/bin/align_motifs +4 -4
- data/bin/eval_alignment +4 -4
- data/bin/eval_similarity +4 -4
- data/bin/find_pvalue +4 -4
- data/bin/find_threshold +4 -4
- data/bin/preprocess_collection +4 -4
- data/bin/scan_collection +4 -4
- data/lib/macroape.rb +14 -11
- data/lib/macroape/aligned_pair_intersection.rb +61 -62
- data/lib/macroape/cli.rb +191 -188
- data/lib/macroape/cli/align_motifs.rb +120 -100
- data/lib/macroape/cli/eval_alignment.rb +157 -156
- data/lib/macroape/cli/eval_similarity.rb +138 -137
- data/lib/macroape/cli/find_pvalue.rb +93 -87
- data/lib/macroape/cli/find_threshold.rb +103 -96
- data/lib/macroape/cli/preprocess_collection.rb +169 -161
- data/lib/macroape/cli/scan_collection.rb +171 -163
- data/lib/macroape/collection.rb +29 -0
- data/lib/macroape/motif_with_thresholds.rb +18 -0
- data/lib/macroape/pwm_compare.rb +39 -44
- data/lib/macroape/pwm_compare_aligned.rb +139 -130
- data/lib/macroape/{counting.rb → pwm_counting.rb} +175 -121
- data/lib/macroape/support/inverf.rb +13 -0
- data/lib/macroape/support/partial_sums.rb +17 -0
- data/lib/macroape/version.rb +4 -4
- data/macroape.gemspec +19 -19
- data/spec/count_distribution_spec.rb +112 -109
- data/spec/inverf_spec.rb +23 -0
- data/spec/partial_sums_spec.rb +28 -0
- data/spec/spec_helper.rb +11 -11
- data/test/align_motifs_test.rb +42 -43
- data/test/data/AHR_si.pwm +10 -10
- data/test/data/KLF3_f1.pcm +16 -16
- data/test/data/KLF3_f1.pwm +16 -16
- data/test/data/KLF4_f2.pcm +11 -11
- data/test/data/KLF4_f2.pwm +11 -11
- data/test/data/KLF4_f2_scan_results_all.txt +2 -2
- data/test/data/KLF4_f2_scan_results_default_cutoff.txt +1 -1
- data/test/data/KLF4_f2_scan_results_precise_mode.txt +2 -2
- data/test/data/SP1_f1.pcm +12 -12
- data/test/data/SP1_f1.pwm +12 -12
- data/test/data/SP1_f1_revcomp.pcm +12 -12
- data/test/data/SP1_f1_revcomp.pwm +12 -12
- data/test/data/medium_motif.pwm +8 -8
- data/test/data/short_motif.pwm +7 -7
- data/test/data/test_collection.yaml +231 -214
- data/test/data/test_collection/GABPA_f1.pwm +14 -14
- data/test/data/test_collection/KLF4_f2.pwm +10 -10
- data/test/data/test_collection/SP1_f1.pwm +12 -12
- data/test/data/test_collection_pcm/GABPA_f1.pcm +14 -14
- data/test/data/test_collection_pcm/KLF4_f2.pcm +11 -11
- data/test/data/test_collection_pcm/SP1_f1.pcm +12 -12
- data/test/data/test_collection_single_file.txt +38 -38
- data/test/data/test_collection_single_file_pcm.txt +37 -37
- data/test/data/test_collection_weak.yaml +231 -214
- data/test/eval_alignment_test.rb +90 -111
- data/test/eval_similarity_test.rb +105 -123
- data/test/find_pvalue_test.rb +34 -39
- data/test/find_threshold_test.rb +87 -91
- data/test/preprocess_collection_test.rb +56 -65
- data/test/scan_collection_test.rb +42 -48
- data/test/test_helper.rb +159 -160
- metadata +14 -10
- data/test/data/collection_pcm_without_thresholds.yaml +0 -188
- data/test/data/collection_without_thresholds.yaml +0 -188
@@ -1,163 +1,171 @@
|
|
1
|
-
require_relative '../../macroape'
|
2
|
-
require 'yaml'
|
3
|
-
|
4
|
-
module Macroape
|
5
|
-
module CLI
|
6
|
-
module ScanCollection
|
7
|
-
def self.main(argv)
|
8
|
-
doc = <<-EOS.strip_doc
|
9
|
-
Command-line format:
|
10
|
-
#{run_tool_cmd} <pat-file> <collection> [options]
|
11
|
-
|
12
|
-
Options:
|
13
|
-
[-p <P-value>]
|
14
|
-
[-c <similarity cutoff>] minimal similarity to be included in output, '-c 0.05' by default, [--all] to print all results
|
15
|
-
[--precise [<level>]] minimal similarity to check on the second pass in precise mode, off by default, '--precise 0.01' if level is not set
|
16
|
-
[--silent] - hide current progress information during scan (printed to stderr by default)
|
17
|
-
[--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
|
18
|
-
[--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
|
19
|
-
[-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
|
20
|
-
|
21
|
-
Output format:
|
22
|
-
<name> <jaccard index> <shift> <overlap> <orientation> ['*' in case that result was calculated on the second pass (in precise mode), '.' otherwise]
|
23
|
-
Attention! Name can contain whitespace characters.
|
24
|
-
Attention! The shift and orientation are reported for the collection matrix relative to the query matrix.
|
25
|
-
|
26
|
-
Example:
|
27
|
-
#{run_tool_cmd} motifs/KLF4_f2.pat hocomoco_ad_uniform.yaml
|
28
|
-
#{run_tool_cmd} motifs/KLF4_f2.pat hocomoco_ad_uniform.yaml -p 0.0005 --precise 0.03
|
29
|
-
EOS
|
30
|
-
|
31
|
-
if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
|
32
|
-
$stderr.puts doc
|
33
|
-
exit
|
34
|
-
end
|
35
|
-
|
36
|
-
data_model = argv.delete('--pcm') ?
|
37
|
-
filename = argv.shift
|
38
|
-
collection_file = argv.shift
|
39
|
-
raise 'No input. You should specify input file with matrix' unless filename
|
40
|
-
raise 'No input. You should specify input file with collection' unless collection_file
|
41
|
-
raise "Collection file #{collection_file} doesn't exist" unless File.exist?(collection_file)
|
42
|
-
|
43
|
-
pvalue = 0.0005
|
44
|
-
cutoff = 0.05 # minimal similarity to output
|
45
|
-
collection = YAML.load_file(collection_file)
|
46
|
-
collection_background = collection.
|
47
|
-
query_background = collection_background
|
48
|
-
|
49
|
-
rough_discretization = collection.
|
50
|
-
precise_discretization = collection.
|
51
|
-
max_hash_size = 10000000
|
52
|
-
max_pair_hash_size = 10000
|
53
|
-
pvalue_boundary = :upper
|
54
|
-
|
55
|
-
silent = false
|
56
|
-
precision_mode = :rough
|
57
|
-
until argv.empty?
|
58
|
-
case argv.shift
|
59
|
-
when '-b'
|
60
|
-
query_background = argv.shift
|
61
|
-
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless query_background
|
62
|
-
when '-p'
|
63
|
-
pvalue = argv.shift.to_f
|
64
|
-
when '--max-hash-size'
|
65
|
-
max_hash_size = argv.shift.to_i
|
66
|
-
when '--max-2d-hash-size'
|
67
|
-
max_pair_hash_size = argv.shift.to_i
|
68
|
-
when '-c'
|
69
|
-
cutoff = argv.shift.to_f
|
70
|
-
when '--all'
|
71
|
-
cutoff = 0.0
|
72
|
-
when '--silent'
|
73
|
-
silent = true
|
74
|
-
when '--boundary'
|
75
|
-
pvalue_boundary = argv.shift.to_sym
|
76
|
-
raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
|
77
|
-
when '--precise'
|
78
|
-
precision_mode = :precise
|
79
|
-
begin
|
80
|
-
Float(argv.first)
|
81
|
-
minimal_similarity = argv.shift.to_f
|
82
|
-
rescue
|
83
|
-
minimal_similarity = 0.05
|
84
|
-
end
|
85
|
-
end
|
86
|
-
end
|
87
|
-
|
88
|
-
raise "Thresholds for pvalue #{pvalue} aren't presented in collection (#{collection.
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
info = Macroape::PWMCompare.new(
|
138
|
-
info[:precision_mode] = :
|
139
|
-
end
|
140
|
-
info[:
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
1
|
+
require_relative '../../macroape'
|
2
|
+
require 'yaml'
|
3
|
+
|
4
|
+
module Macroape
  module CLI
    # Command-line tool that compares a single query motif with every motif of a
    # preprocessed collection and prints the motifs that are similar enough.
    module ScanCollection
      # Entry point of the tool.
      #
      # Parses the arguments, loads the query matrix and the collection, computes
      # the Jaccard similarity of the query against each collection motif (on the
      # rough discretization first, optionally refined on the precise one) and
      # prints the resulting report to stdout.
      #
      # Any StandardError raised on the way is not propagated: its message, the
      # first five backtrace lines and the usage text are written to stderr.
      # When called without arguments or with a help option, the usage text is
      # printed to stderr and the process exits.
      #
      # @param argv [Array<String>] command-line arguments in the form
      #   `<pat-file> <collection> [options]`; the array is consumed while parsing
      # @return [nil]
      def self.main(argv)
        doc = <<-EOS.strip_doc
          Command-line format:
          #{run_tool_cmd} <pat-file> <collection> [options]

          Options:
            [-p <P-value>]
            [-c <similarity cutoff>] minimal similarity to be included in output, '-c 0.05' by default, [--all] to print all results
            [--precise [<level>]] minimal similarity to check on the second pass in precise mode, off by default, '--precise 0.05' if level is not set
            [--silent] - hide current progress information during scan (printed to stderr by default)
            [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
            [--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
            [-b <background probabilities>] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25

          Output format:
            <name> <jaccard index> <shift> <overlap> <orientation> ['*' in case that result was calculated on the second pass (in precise mode), '.' otherwise]
              Attention! Name can contain whitespace characters.
              Attention! The shift and orientation are reported for the collection matrix relative to the query matrix.

          Example:
            #{run_tool_cmd} motifs/KLF4_f2.pat hocomoco_ad_uniform.yaml
            #{run_tool_cmd} motifs/KLF4_f2.pat hocomoco_ad_uniform.yaml -p 0.0005 --precise 0.03
        EOS

        if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
          $stderr.puts doc
          exit
        end

        # '--pcm' may appear anywhere, so it is removed before the positional arguments are read
        data_model = argv.delete('--pcm') ? :pcm : :pwm
        filename = argv.shift
        collection_file = argv.shift
        raise 'No input. You should specify input file with matrix' unless filename
        raise 'No input. You should specify input file with collection' unless collection_file
        raise "Collection file #{collection_file} doesn't exist" unless File.exist?(collection_file)

        pvalue = 0.0005
        cutoff = 0.05 # minimal similarity to output
        # NOTE(review): depending on the Psych version, YAML.load_file can instantiate
        # arbitrary Ruby objects; only collection files from trusted sources should be scanned.
        collection = YAML.load_file(collection_file)
        collection_background = collection.background #(collection.background == [1,1,1,1]) ? Bioinform::Background::Wordwise : Bioinform::Frequencies.new(collection.background)
        # The query uses the collection background unless '-b' overrides it
        query_background = collection_background

        rough_discretization = collection.rough_discretization
        precise_discretization = collection.precise_discretization
        max_hash_size = 10000000
        max_pair_hash_size = 10000
        pvalue_boundary = :upper

        silent = false
        precision_mode = :rough
        minimal_similarity = nil # assigned only by '--precise'
        until argv.empty?
          case argv.shift
          when '-b'
            query_background = Bioinform::Background.from_string(argv.shift)
            raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless query_background.symmetric?
          when '-p'
            pvalue = argv.shift.to_f
          when '--max-hash-size'
            max_hash_size = argv.shift.to_i
          when '--max-2d-hash-size'
            max_pair_hash_size = argv.shift.to_i
          when '-c'
            cutoff = argv.shift.to_f
          when '--all'
            cutoff = 0.0
          when '--silent'
            silent = true
          when '--boundary'
            pvalue_boundary = argv.shift.to_sym
            raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
          when '--precise'
            precision_mode = :precise
            # The level is optional: the next token is consumed only when it is a
            # valid number, and the very value that passed validation is the one stored.
            begin
              minimal_similarity = Float(argv.first)
              argv.shift
            rescue ArgumentError, TypeError
              minimal_similarity = 0.05
            end
          end
        end

        raise "Thresholds for pvalue #{pvalue} aren't presented in collection (#{collection.pvalues.join(', ')}). Use one of listed pvalues or recalculate the collection with needed pvalue" unless collection.pvalues.include? pvalue

        raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
        query_input = File.read(filename)

        query_input = Bioinform::MatrixParser.new.parse!(query_input)
        case data_model
        when :pcm
          query_pcm = Bioinform::MotifModel::PCM.new(query_input[:matrix]).named(query_input[:name])
          query_pwm = Bioinform::ConversionAlgorithms::PCM2PWMConverter.new(pseudocount: :log, background: query_background).convert(query_pcm)
        when :pwm
          query_pwm = Bioinform::MotifModel::PWM.new(query_input[:matrix]).named(query_input[:name])
        end

        query_pwm_rough = query_pwm.discreted(rough_discretization)
        query_pwm_rough_counting = Macroape::PWMCounting.new(query_pwm_rough, background: query_background, max_hash_size: max_hash_size)
        query_pwm_precise = query_pwm.discreted(precise_discretization)
        query_pwm_precise_counting = Macroape::PWMCounting.new(query_pwm_precise, background: query_background, max_hash_size: max_hash_size)

        if pvalue_boundary == :lower
          query_threshold_rough, query_rough_real_pvalue = query_pwm_rough_counting.threshold_and_real_pvalue(pvalue)
          query_threshold_precise, query_precise_real_pvalue = query_pwm_precise_counting.threshold_and_real_pvalue(pvalue)
        else
          query_threshold_rough, query_rough_real_pvalue = query_pwm_rough_counting.weak_threshold_and_real_pvalue(pvalue)
          query_threshold_precise, query_precise_real_pvalue = query_pwm_precise_counting.weak_threshold_and_real_pvalue(pvalue)
        end

        if query_precise_real_pvalue == 0
          $stderr.puts "Query motif #{query_pwm.name} gives 0 recognized words for a given P-value of #{pvalue} with the precise discretization level of #{precise_discretization}. It's impossible to scan collection for this motif"
          return
        end

        if query_rough_real_pvalue == 0
          # The rough query model recognizes nothing, so the precise one is used in its place
          query_pwm_rough_counting, query_threshold_rough = query_pwm_precise_counting, query_threshold_precise
          $stderr.puts "Query motif #{query_pwm.name} gives 0 recognized words for a given P-value of #{pvalue} with the rough discretization level of #{rough_discretization}. Forcing precise discretization level of #{precise_discretization}"
        end

        similarities = {}
        precision_file_mode = {}

        collection.motifs.each_with_index do |motif_info, index|
          motif = motif_info.model
          $stderr.puts "Testing motif #{motif.name} (#{index+1} of #{collection.size}, #{index*100/collection.size}% complete)" unless silent

          # First pass: rough discretization, when a rough threshold is stored for this P-value
          if motif_info.rough[pvalue]
            collection_pwm_rough = motif.discreted(rough_discretization)
            collection_pwm_rough_counting = Macroape::PWMCounting.new(collection_pwm_rough, background: collection_background, max_hash_size: max_hash_size)

            collection_threshold_rough = motif_info.rough[pvalue] * rough_discretization
            info = Macroape::PWMCompare.new(query_pwm_rough_counting, collection_pwm_rough_counting).tap{|x| x.max_pair_hash_size = max_pair_hash_size }.jaccard(query_threshold_rough, collection_threshold_rough)
            info[:precision_mode] = :rough
          end
          # Second pass: precise discretization, either because there was no rough result
          # or because precise mode is on and the rough similarity reached the requested level
          if !motif_info.rough[pvalue] || (precision_mode == :precise) && (info[:similarity] >= minimal_similarity)
            collection_pwm_precise = motif.discreted(precise_discretization)
            collection_pwm_precise_counting = Macroape::PWMCounting.new(collection_pwm_precise, background: collection_background, max_hash_size: max_hash_size)

            collection_threshold_precise = motif_info.precise[pvalue] * precise_discretization
            info = Macroape::PWMCompare.new(query_pwm_precise_counting, collection_pwm_precise_counting).tap{|x| x.max_pair_hash_size = max_pair_hash_size }.jaccard(query_threshold_precise, collection_threshold_precise)
            info[:precision_mode] = :precise
          end
          info[:name] = motif.name
          similarities[motif.name] = info
        end

        $stderr.puts "100% complete" unless silent

        # Most similar motifs first; motifs below the cutoff are dropped
        similarities_to_output = similarities.sort_by{|name, info| info[:similarity] }.reverse.select{|name,info| info[:similarity] >= cutoff }.map{|name,info|info}
        puts Helper.scan_collection_infos_string( similarities_to_output,
                                                  {cutoff: cutoff,
                                                  precision_mode: precision_mode,
                                                  rough_discretization: rough_discretization,
                                                  precise_discretization: precise_discretization,
                                                  minimal_similarity: minimal_similarity,
                                                  pvalue: pvalue,
                                                  pvalue_boundary: pvalue_boundary,
                                                  collection_background: collection_background,
                                                  query_background: query_background} )
      rescue => err
        $stderr.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
      end

    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Macroape
  # A set of motifs bundled with the settings shared by all of them:
  # discretization levels, background and the list of P-values.
  class Collection
    attr_accessor :motifs, :rough_discretization, :precise_discretization, :background, :pvalues

    # @param options [Hash] initial attribute values, keyed by
    #   :motifs, :rough_discretization, :precise_discretization, :background, :pvalues.
    #   A missing :motifs defaults to an empty array; other missing keys stay nil.
    def initialize(options = {})
      @motifs = options[:motifs] || []
      @rough_discretization = options[:rough_discretization]
      @precise_discretization = options[:precise_discretization]
      @background = options[:background]
      @pvalues = options[:pvalues]
    end

    # Two collections are equal when all five attributes are equal.
    # Comparison with anything that is not a Collection (including nil)
    # returns false instead of raising NoMethodError.
    #
    # @param other [Object]
    # @return [Boolean]
    def ==(other)
      other.is_a?(Collection) &&
        (motifs == other.motifs) &&
        (rough_discretization == other.rough_discretization) &&
        (precise_discretization == other.precise_discretization) &&
        (background == other.background) &&
        (pvalues == other.pvalues)
    end

    # Appends a motif to the collection.
    #
    # @param motif_with_thresholds [Object] the element to store in #motifs
    # @return [Array] the underlying motifs array (so chained `<<` keeps appending to it)
    def <<(motif_with_thresholds)
      @motifs << motif_with_thresholds
    end

    # @return [Integer] number of motifs in the collection
    def size
      motifs.size
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Macroape
  # A motif model paired with its precomputed thresholds for the rough and
  # the precise discretization.
  class MotifWithThresholds
    attr_accessor :model
    attr_accessor :rough, :precise

    # @param model [Object] the motif model
    # @param options [Hash] thresholds under the keys :rough and :precise;
    #   a missing key leaves the corresponding attribute nil
    def initialize(model, options = {})
      @model = model
      @rough = options[:rough]
      @precise = options[:precise]
    end

    # Equal when model, rough and precise are all equal.
    # Comparison with anything that is not a MotifWithThresholds (including nil)
    # returns false instead of raising NoMethodError.
    #
    # @param other [Object]
    # @return [Boolean]
    def ==(other)
      other.is_a?(MotifWithThresholds) &&
        (model == other.model) &&
        (rough == other.rough) &&
        (precise == other.precise)
    end
  end
end
|
data/lib/macroape/pwm_compare.rb
CHANGED
@@ -1,44 +1,39 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
include Enumerable
|
41
|
-
alias_method :each, :each_alignment
|
42
|
-
alias_method :map_each_alignment, :map
|
43
|
-
end
|
44
|
-
end
|
1
|
+
module Macroape
  # Compares two motifs over all of their relative alignments (every shift in
  # both orientations) and reports the best-scoring alignment.
  #
  # Both motifs must respond to `length`; `jaccard_by_pvalue` additionally needs
  # `threshold(pvalue)` and `jaccard_by_weak_pvalue` needs `weak_threshold(pvalue)`.
  class PWMCompare
    include Enumerable

    # sets or gets limit of summary size of calculation hash. It's a defence against overuse CPU resources by non-appropriate data
    attr_accessor :max_pair_hash_size
    # NOTE: `first` is the reader for the first motif; being defined in the class
    # itself it takes precedence over Enumerable#first.
    attr_reader :first, :second

    # @param first [Object] the first motif
    # @param second [Object] the second motif
    def initialize(first, second)
      @first = first
      @second = second
    end

    # Evaluates every alignment with the given thresholds and returns the
    # information of the alignment with the highest :similarity.
    #
    # @param threshold_first [Numeric] threshold for the first motif
    # @param threshold_second [Numeric] threshold for the second motif
    # @return [Hash] alignment infos merged with the result of the aligned comparison
    def jaccard(threshold_first, threshold_second)
      self.map_each_alignment do |alignment|
        alignment.alignment_infos.merge( alignment.jaccard(threshold_first, threshold_second) )
      end.max_by {|alignment_infos| alignment_infos[:similarity] }
    end

    # Same as #jaccard, with thresholds obtained from each motif's `threshold(pvalue)`.
    def jaccard_by_pvalue(pvalue)
      threshold_first = first.threshold(pvalue)
      threshold_second = second.threshold(pvalue)
      jaccard(threshold_first, threshold_second)
    end

    # Same as #jaccard, with thresholds obtained from each motif's `weak_threshold(pvalue)`.
    def jaccard_by_weak_pvalue(pvalue)
      threshold_first = first.weak_threshold(pvalue)
      threshold_second = second.weak_threshold(pvalue)
      jaccard(threshold_first, threshold_second)
    end

    # Yields a PWMCompareAligned for every shift from -second.length to
    # first.length (inclusive) in both the :direct and the :revcomp orientation.
    # Each yielded object inherits this comparison's max_pair_hash_size.
    #
    # @yieldparam alignment [PWMCompareAligned]
    # @return [Enumerator] when no block is given
    def each_alignment
      # Without a block, hand back an Enumerator instead of failing on `yield`
      return enum_for(:each_alignment) unless block_given?

      (-second.length..first.length).to_a.product([:direct,:revcomp]) do |shift, orientation|
        yield PWMCompareAligned.new(first, second, shift, orientation).tap{|x| x.max_pair_hash_size = max_pair_hash_size }
      end
    end

    alias_method :each, :each_alignment
    alias_method :map_each_alignment, :map
  end
end
|