macroape 4.0.2 → 4.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (74)
  1. checksums.yaml +4 -4
  2. data/.gitignore +17 -17
  3. data/Gemfile +4 -4
  4. data/LICENSE +22 -22
  5. data/README.md +70 -70
  6. data/Rakefile.rb +49 -49
  7. data/TODO.txt +46 -46
  8. data/benchmark/benchmark_helper.rb +4 -4
  9. data/benchmark/similarity_benchmark.rb +52 -52
  10. data/bin/align_motifs +4 -4
  11. data/bin/eval_alignment +4 -4
  12. data/bin/eval_similarity +4 -4
  13. data/bin/find_pvalue +4 -4
  14. data/bin/find_threshold +4 -4
  15. data/bin/preprocess_collection +4 -4
  16. data/bin/scan_collection +4 -4
  17. data/lib/macroape.rb +14 -11
  18. data/lib/macroape/aligned_pair_intersection.rb +61 -62
  19. data/lib/macroape/cli.rb +191 -188
  20. data/lib/macroape/cli/align_motifs.rb +120 -100
  21. data/lib/macroape/cli/eval_alignment.rb +157 -156
  22. data/lib/macroape/cli/eval_similarity.rb +138 -137
  23. data/lib/macroape/cli/find_pvalue.rb +93 -87
  24. data/lib/macroape/cli/find_threshold.rb +103 -96
  25. data/lib/macroape/cli/preprocess_collection.rb +169 -161
  26. data/lib/macroape/cli/scan_collection.rb +171 -163
  27. data/lib/macroape/collection.rb +29 -0
  28. data/lib/macroape/motif_with_thresholds.rb +18 -0
  29. data/lib/macroape/pwm_compare.rb +39 -44
  30. data/lib/macroape/pwm_compare_aligned.rb +139 -130
  31. data/lib/macroape/{counting.rb → pwm_counting.rb} +175 -121
  32. data/lib/macroape/support/inverf.rb +13 -0
  33. data/lib/macroape/support/partial_sums.rb +17 -0
  34. data/lib/macroape/version.rb +4 -4
  35. data/macroape.gemspec +19 -19
  36. data/spec/count_distribution_spec.rb +112 -109
  37. data/spec/inverf_spec.rb +23 -0
  38. data/spec/partial_sums_spec.rb +28 -0
  39. data/spec/spec_helper.rb +11 -11
  40. data/test/align_motifs_test.rb +42 -43
  41. data/test/data/AHR_si.pwm +10 -10
  42. data/test/data/KLF3_f1.pcm +16 -16
  43. data/test/data/KLF3_f1.pwm +16 -16
  44. data/test/data/KLF4_f2.pcm +11 -11
  45. data/test/data/KLF4_f2.pwm +11 -11
  46. data/test/data/KLF4_f2_scan_results_all.txt +2 -2
  47. data/test/data/KLF4_f2_scan_results_default_cutoff.txt +1 -1
  48. data/test/data/KLF4_f2_scan_results_precise_mode.txt +2 -2
  49. data/test/data/SP1_f1.pcm +12 -12
  50. data/test/data/SP1_f1.pwm +12 -12
  51. data/test/data/SP1_f1_revcomp.pcm +12 -12
  52. data/test/data/SP1_f1_revcomp.pwm +12 -12
  53. data/test/data/medium_motif.pwm +8 -8
  54. data/test/data/short_motif.pwm +7 -7
  55. data/test/data/test_collection.yaml +231 -214
  56. data/test/data/test_collection/GABPA_f1.pwm +14 -14
  57. data/test/data/test_collection/KLF4_f2.pwm +10 -10
  58. data/test/data/test_collection/SP1_f1.pwm +12 -12
  59. data/test/data/test_collection_pcm/GABPA_f1.pcm +14 -14
  60. data/test/data/test_collection_pcm/KLF4_f2.pcm +11 -11
  61. data/test/data/test_collection_pcm/SP1_f1.pcm +12 -12
  62. data/test/data/test_collection_single_file.txt +38 -38
  63. data/test/data/test_collection_single_file_pcm.txt +37 -37
  64. data/test/data/test_collection_weak.yaml +231 -214
  65. data/test/eval_alignment_test.rb +90 -111
  66. data/test/eval_similarity_test.rb +105 -123
  67. data/test/find_pvalue_test.rb +34 -39
  68. data/test/find_threshold_test.rb +87 -91
  69. data/test/preprocess_collection_test.rb +56 -65
  70. data/test/scan_collection_test.rb +42 -48
  71. data/test/test_helper.rb +159 -160
  72. metadata +14 -10
  73. data/test/data/collection_pcm_without_thresholds.yaml +0 -188
  74. data/test/data/collection_without_thresholds.yaml +0 -188
@@ -1,163 +1,171 @@
1
- require_relative '../../macroape'
2
- require 'yaml'
3
-
4
- module Macroape
5
- module CLI
6
- module ScanCollection
7
- def self.main(argv)
8
- doc = <<-EOS.strip_doc
9
- Command-line format:
10
- #{run_tool_cmd} <pat-file> <collection> [options]
11
-
12
- Options:
13
- [-p <P-value>]
14
- [-c <similarity cutoff>] minimal similarity to be included in output, '-c 0.05' by default, [--all] to print all results
15
- [--precise [<level>]] minimal similarity to check on the second pass in precise mode, off by default, '--precise 0.01' if level is not set
16
- [--silent] - hide current progress information during scan (printed to stderr by default)
17
- [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
18
- [--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
19
- [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
20
-
21
- Output format:
22
- <name> <jaccard index> <shift> <overlap> <orientation> ['*' in case that result was calculated on the second pass (in precise mode), '.' otherwise]
23
- Attention! Name can contain whitespace characters.
24
- Attention! The shift and orientation are reported for the collection matrix relative to the query matrix.
25
-
26
- Example:
27
- #{run_tool_cmd} motifs/KLF4_f2.pat hocomoco_ad_uniform.yaml
28
- #{run_tool_cmd} motifs/KLF4_f2.pat hocomoco_ad_uniform.yaml -p 0.0005 --precise 0.03
29
- EOS
30
-
31
- if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
32
- $stderr.puts doc
33
- exit
34
- end
35
-
36
- data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
37
- filename = argv.shift
38
- collection_file = argv.shift
39
- raise 'No input. You should specify input file with matrix' unless filename
40
- raise 'No input. You should specify input file with collection' unless collection_file
41
- raise "Collection file #{collection_file} doesn't exist" unless File.exist?(collection_file)
42
-
43
- pvalue = 0.0005
44
- cutoff = 0.05 # minimal similarity to output
45
- collection = YAML.load_file(collection_file)
46
- collection_background = collection.parameters.background
47
- query_background = collection_background
48
-
49
- rough_discretization = collection.parameters.rough_discretization
50
- precise_discretization = collection.parameters.precise_discretization
51
- max_hash_size = 10000000
52
- max_pair_hash_size = 10000
53
- pvalue_boundary = :upper
54
-
55
- silent = false
56
- precision_mode = :rough
57
- until argv.empty?
58
- case argv.shift
59
- when '-b'
60
- query_background = argv.shift.split(',').map(&:to_f)
61
- raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless query_background == query_background.reverse
62
- when '-p'
63
- pvalue = argv.shift.to_f
64
- when '--max-hash-size'
65
- max_hash_size = argv.shift.to_i
66
- when '--max-2d-hash-size'
67
- max_pair_hash_size = argv.shift.to_i
68
- when '-c'
69
- cutoff = argv.shift.to_f
70
- when '--all'
71
- cutoff = 0.0
72
- when '--silent'
73
- silent = true
74
- when '--boundary'
75
- pvalue_boundary = argv.shift.to_sym
76
- raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
77
- when '--precise'
78
- precision_mode = :precise
79
- begin
80
- Float(argv.first)
81
- minimal_similarity = argv.shift.to_f
82
- rescue
83
- minimal_similarity = 0.05
84
- end
85
- end
86
- end
87
-
88
- raise "Thresholds for pvalue #{pvalue} aren't presented in collection (#{collection.parameters.pvalues.join(', ')}). Use one of listed pvalues or recalculate the collection with needed pvalue" unless collection.parameters.pvalues.include? pvalue
89
-
90
- if filename == '.stdin'
91
- query_input = $stdin.read
92
- else
93
- raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
94
- query_input = File.read(filename)
95
- end
96
-
97
- query_pwm = data_model.new(query_input).set_parameters(background: query_background).to_pwm
98
- query_pwm.set_parameters(background: query_background, max_hash_size: max_hash_size)
99
-
100
- query_pwm_rough = query_pwm.discrete(rough_discretization)
101
- query_pwm_precise = query_pwm.discrete(precise_discretization)
102
-
103
- if pvalue_boundary == :lower
104
- query_threshold_rough, query_rough_real_pvalue = query_pwm_rough.threshold_and_real_pvalue(pvalue)
105
- query_threshold_precise, query_precise_real_pvalue = query_pwm_precise.threshold_and_real_pvalue(pvalue)
106
- else
107
- query_threshold_rough, query_rough_real_pvalue = query_pwm_rough.weak_threshold_and_real_pvalue(pvalue)
108
- query_threshold_precise, query_precise_real_pvalue = query_pwm_precise.weak_threshold_and_real_pvalue(pvalue)
109
- end
110
-
111
- if query_precise_real_pvalue == 0
112
- $stderr.puts "Query motif #{query_pwm.name} gives 0 recognized words for a given P-value of #{pvalue} with the precise discretization level of #{precise_discretization}. It's impossible to scan collection for this motif"
113
- return
114
- end
115
-
116
- if query_rough_real_pvalue == 0
117
- query_pwm_rough, query_threshold_rough = query_pwm_precise, query_threshold_precise
118
- $stderr.puts "Query motif #{query_pwm.name} gives 0 recognized words for a given P-value of #{pvalue} with the rough discretization level of #{rough_discretization}. Forcing precise discretization level of #{precise_discretization}"
119
- end
120
-
121
- similarities = {}
122
- precision_file_mode = {}
123
-
124
- collection.each_with_index do |motif, index|
125
- name = motif.name
126
- $stderr.puts "Testing motif #{name} (#{index+1} of #{collection.size}, #{index*100/collection.size}% complete)" unless silent
127
- motif.set_parameters(background: collection_background, max_hash_size: max_hash_size)
128
- if motif.rough[pvalue]
129
- collection_pwm_rough = motif.pwm.discrete(rough_discretization)
130
- collection_threshold_rough = motif.rough[pvalue] * rough_discretization
131
- info = Macroape::PWMCompare.new(query_pwm_rough, collection_pwm_rough).set_parameters(max_pair_hash_size: max_pair_hash_size).jaccard(query_threshold_rough, collection_threshold_rough)
132
- info[:precision_mode] = :rough
133
- end
134
- if !motif.rough[pvalue] || (precision_mode == :precise) && (info[:similarity] >= minimal_similarity)
135
- collection_pwm_precise = motif.pwm.discrete(precise_discretization)
136
- collection_threshold_precise = motif.precise[pvalue] * precise_discretization
137
- info = Macroape::PWMCompare.new(query_pwm_precise, collection_pwm_precise).set_parameters(max_pair_hash_size: max_pair_hash_size).jaccard(query_threshold_precise, collection_threshold_precise)
138
- info[:precision_mode] = :precise
139
- end
140
- info[:name] = name
141
- similarities[name] = info
142
- end
143
-
144
- $stderr.puts "100% complete" unless silent
145
-
146
- similarities_to_output = similarities.sort_by{|name, info| info[:similarity] }.reverse.select{|name,info| info[:similarity] >= cutoff }.map{|name,info|info}
147
- puts Helper.scan_collection_infos_string( similarities_to_output,
148
- {cutoff: cutoff,
149
- precision_mode: precision_mode,
150
- rough_discretization: rough_discretization,
151
- precise_discretization: precise_discretization,
152
- minimal_similarity: minimal_similarity,
153
- pvalue: pvalue,
154
- pvalue_boundary: pvalue_boundary,
155
- collection_background: collection_background,
156
- query_background: query_background} )
157
- rescue => err
158
- $stderr.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
159
- end
160
-
161
- end
162
- end
163
- end
1
+ require_relative '../../macroape'
2
+ require 'yaml'
3
+
4
+ module Macroape
5
+ module CLI
6
+ module ScanCollection
7
+ def self.main(argv)
8
+ doc = <<-EOS.strip_doc
9
+ Command-line format:
10
+ #{run_tool_cmd} <pat-file> <collection> [options]
11
+
12
+ Options:
13
+ [-p <P-value>]
14
+ [-c <similarity cutoff>] minimal similarity to be included in output, '-c 0.05' by default, [--all] to print all results
15
+ [--precise [<level>]] minimal similarity to check on the second pass in precise mode, off by default, '--precise 0.01' if level is not set
16
+ [--silent] - hide current progress information during scan (printed to stderr by default)
17
+ [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
18
+ [--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
19
+ [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
20
+
21
+ Output format:
22
+ <name> <jaccard index> <shift> <overlap> <orientation> ['*' in case that result was calculated on the second pass (in precise mode), '.' otherwise]
23
+ Attention! Name can contain whitespace characters.
24
+ Attention! The shift and orientation are reported for the collection matrix relative to the query matrix.
25
+
26
+ Example:
27
+ #{run_tool_cmd} motifs/KLF4_f2.pat hocomoco_ad_uniform.yaml
28
+ #{run_tool_cmd} motifs/KLF4_f2.pat hocomoco_ad_uniform.yaml -p 0.0005 --precise 0.03
29
+ EOS
30
+
31
+ if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
32
+ $stderr.puts doc
33
+ exit
34
+ end
35
+
36
+ data_model = argv.delete('--pcm') ? :pcm : :pwm
37
+ filename = argv.shift
38
+ collection_file = argv.shift
39
+ raise 'No input. You should specify input file with matrix' unless filename
40
+ raise 'No input. You should specify input file with collection' unless collection_file
41
+ raise "Collection file #{collection_file} doesn't exist" unless File.exist?(collection_file)
42
+
43
+ pvalue = 0.0005
44
+ cutoff = 0.05 # minimal similarity to output
45
+ collection = YAML.load_file(collection_file)
46
+ collection_background = collection.background #(collection.background == [1,1,1,1]) ? Bioinform::Background::Wordwise : Bioinform::Frequencies.new(collection.background)
47
+ query_background = collection_background
48
+
49
+ rough_discretization = collection.rough_discretization
50
+ precise_discretization = collection.precise_discretization
51
+ max_hash_size = 10000000
52
+ max_pair_hash_size = 10000
53
+ pvalue_boundary = :upper
54
+
55
+ silent = false
56
+ precision_mode = :rough
57
+ until argv.empty?
58
+ case argv.shift
59
+ when '-b'
60
+ query_background = Bioinform::Background.from_string(argv.shift)
61
+ raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless query_background.symmetric?
62
+ when '-p'
63
+ pvalue = argv.shift.to_f
64
+ when '--max-hash-size'
65
+ max_hash_size = argv.shift.to_i
66
+ when '--max-2d-hash-size'
67
+ max_pair_hash_size = argv.shift.to_i
68
+ when '-c'
69
+ cutoff = argv.shift.to_f
70
+ when '--all'
71
+ cutoff = 0.0
72
+ when '--silent'
73
+ silent = true
74
+ when '--boundary'
75
+ pvalue_boundary = argv.shift.to_sym
76
+ raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
77
+ when '--precise'
78
+ precision_mode = :precise
79
+ begin
80
+ Float(argv.first)
81
+ minimal_similarity = argv.shift.to_f
82
+ rescue
83
+ minimal_similarity = 0.05
84
+ end
85
+ end
86
+ end
87
+
88
+ raise "Thresholds for pvalue #{pvalue} aren't presented in collection (#{collection.pvalues.join(', ')}). Use one of listed pvalues or recalculate the collection with needed pvalue" unless collection.pvalues.include? pvalue
89
+
90
+ raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
91
+ query_input = File.read(filename)
92
+
93
+ query_input = Bioinform::MatrixParser.new.parse!(query_input)
94
+ case data_model
95
+ when :pcm
96
+ query_pcm = Bioinform::MotifModel::PCM.new(query_input[:matrix]).named(query_input[:name])
97
+ query_pwm = Bioinform::ConversionAlgorithms::PCM2PWMConverter.new(pseudocount: :log, background: query_background).convert(query_pcm)
98
+ when :pwm
99
+ query_pwm = Bioinform::MotifModel::PWM.new(query_input[:matrix]).named(query_input[:name])
100
+ end
101
+
102
+ query_pwm_rough = query_pwm.discreted(rough_discretization)
103
+ query_pwm_rough_counting = PWMCounting.new(query_pwm_rough, background: query_background, max_hash_size: max_hash_size)
104
+ query_pwm_precise = query_pwm.discreted(precise_discretization)
105
+ query_pwm_precise_counting = PWMCounting.new(query_pwm_precise, background: query_background, max_hash_size: max_hash_size)
106
+
107
+ if pvalue_boundary == :lower
108
+ query_threshold_rough, query_rough_real_pvalue = query_pwm_rough_counting.threshold_and_real_pvalue(pvalue)
109
+ query_threshold_precise, query_precise_real_pvalue = query_pwm_precise_counting.threshold_and_real_pvalue(pvalue)
110
+ else
111
+ query_threshold_rough, query_rough_real_pvalue = query_pwm_rough_counting.weak_threshold_and_real_pvalue(pvalue)
112
+ query_threshold_precise, query_precise_real_pvalue = query_pwm_precise_counting.weak_threshold_and_real_pvalue(pvalue)
113
+ end
114
+
115
+ if query_precise_real_pvalue == 0
116
+ $stderr.puts "Query motif #{query_pwm.name} gives 0 recognized words for a given P-value of #{pvalue} with the precise discretization level of #{precise_discretization}. It's impossible to scan collection for this motif"
117
+ return
118
+ end
119
+
120
+ if query_rough_real_pvalue == 0
121
+ query_pwm_rough_counting, query_threshold_rough = query_pwm_precise_counting, query_threshold_precise
122
+ $stderr.puts "Query motif #{query_pwm.name} gives 0 recognized words for a given P-value of #{pvalue} with the rough discretization level of #{rough_discretization}. Forcing precise discretization level of #{precise_discretization}"
123
+ end
124
+
125
+ similarities = {}
126
+ precision_file_mode = {}
127
+
128
+ collection.motifs.each_with_index do |motif_info, index|
129
+ motif = motif_info.model
130
+ $stderr.puts "Testing motif #{motif.name} (#{index+1} of #{collection.size}, #{index*100/collection.size}% complete)" unless silent
131
+
132
+ if motif_info.rough[pvalue]
133
+ collection_pwm_rough = motif.discreted(rough_discretization)
134
+ collection_pwm_rough_counting = Macroape::PWMCounting.new(collection_pwm_rough, background: collection_background, max_hash_size: max_hash_size)
135
+
136
+ collection_threshold_rough = motif_info.rough[pvalue] * rough_discretization
137
+ info = Macroape::PWMCompare.new(query_pwm_rough_counting, collection_pwm_rough_counting).tap{|x| x.max_pair_hash_size = max_pair_hash_size }.jaccard(query_threshold_rough, collection_threshold_rough)
138
+ info[:precision_mode] = :rough
139
+ end
140
+ if !motif_info.rough[pvalue] || (precision_mode == :precise) && (info[:similarity] >= minimal_similarity)
141
+ collection_pwm_precise = motif.discreted(precise_discretization)
142
+ collection_pwm_precise_counting = Macroape::PWMCounting.new(collection_pwm_precise, background: collection_background, max_hash_size: max_hash_size)
143
+
144
+ collection_threshold_precise = motif_info.precise[pvalue] * precise_discretization
145
+ info = Macroape::PWMCompare.new(query_pwm_precise_counting, collection_pwm_precise_counting).tap{|x| x.max_pair_hash_size = max_pair_hash_size }.jaccard(query_threshold_precise, collection_threshold_precise)
146
+ info[:precision_mode] = :precise
147
+ end
148
+ info[:name] = motif.name
149
+ similarities[motif.name] = info
150
+ end
151
+
152
+ $stderr.puts "100% complete" unless silent
153
+
154
+ similarities_to_output = similarities.sort_by{|name, info| info[:similarity] }.reverse.select{|name,info| info[:similarity] >= cutoff }.map{|name,info|info}
155
+ puts Helper.scan_collection_infos_string( similarities_to_output,
156
+ {cutoff: cutoff,
157
+ precision_mode: precision_mode,
158
+ rough_discretization: rough_discretization,
159
+ precise_discretization: precise_discretization,
160
+ minimal_similarity: minimal_similarity,
161
+ pvalue: pvalue,
162
+ pvalue_boundary: pvalue_boundary,
163
+ collection_background: collection_background,
164
+ query_background: query_background} )
165
+ rescue => err
166
+ $stderr.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
167
+ end
168
+
169
+ end
170
+ end
171
+ end
@@ -0,0 +1,29 @@
1
+ module Macroape
2
+ class Collection
3
+ attr_accessor :motifs, :rough_discretization, :precise_discretization, :background, :pvalues
4
+
5
+ def initialize(options = {})
6
+ @motifs = options[:motifs] || []
7
+ @rough_discretization = options[:rough_discretization]
8
+ @precise_discretization = options[:precise_discretization]
9
+ @background = options[:background]
10
+ @pvalues = options[:pvalues]
11
+ end
12
+
13
+ def ==(other)
14
+ (motifs == other.motifs) &&
15
+ (rough_discretization == other.rough_discretization) &&
16
+ (precise_discretization == other.precise_discretization) &&
17
+ (background == other.background) &&
18
+ (pvalues == other.pvalues)
19
+ end
20
+
21
+ def <<(motif_with_thresholds)
22
+ @motifs << motif_with_thresholds
23
+ end
24
+
25
+ def size
26
+ motifs.size
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,18 @@
1
+ module Macroape
2
+ class MotifWithThresholds
3
+ attr_accessor :model
4
+ attr_accessor :rough, :precise
5
+
6
+ def initialize(model, options = {})
7
+ @model = model
8
+ @rough = options[:rough]
9
+ @precise = options[:precise]
10
+ end
11
+
12
+ def ==(other)
13
+ (model == other.model) &&
14
+ (rough == other.rough) &&
15
+ (precise == other.precise)
16
+ end
17
+ end
18
+ end
@@ -1,44 +1,39 @@
1
- require 'bioinform/support/parameters'
2
-
3
- module Macroape
4
- class PWMCompare
5
- include Bioinform::Parameters
6
- # sets or gets limit of summary size of calculation hash. It's a defence against overuse CPU resources by non-appropriate data
7
- make_parameters :max_pair_hash_size
8
-
9
- attr_reader :first, :second, :parameters
10
- def initialize(first, second)
11
- @parameters = OpenStruct.new
12
- @first = first
13
- @second = second
14
- end
15
-
16
- def jaccard(threshold_first, threshold_second)
17
- self.map_each_alignment do |alignment|
18
- alignment.alignment_infos.merge( alignment.jaccard(threshold_first, threshold_second) )
19
- end.max_by {|alignment_infos| alignment_infos[:similarity] }
20
- end
21
-
22
- def jaccard_by_pvalue(pvalue)
23
- threshold_first = first.threshold(pvalue)
24
- threshold_second = second.threshold(pvalue)
25
- jaccard(threshold_first, threshold_second)
26
- end
27
-
28
- def jaccard_by_weak_pvalue(pvalue)
29
- threshold_first = first.weak_threshold(pvalue)
30
- threshold_second = second.weak_threshold(pvalue)
31
- jaccard(threshold_first, threshold_second)
32
- end
33
-
34
- def each_alignment
35
- (-second.length..first.length).to_a.product([:direct,:revcomp]) do |shift, orientation|
36
- yield PWMCompareAligned.new(first, second, shift, orientation).set_parameters(max_pair_hash_size: max_pair_hash_size)
37
- end
38
- end
39
-
40
- include Enumerable
41
- alias_method :each, :each_alignment
42
- alias_method :map_each_alignment, :map
43
- end
44
- end
1
+ module Macroape
2
+ class PWMCompare
3
+ # sets or gets limit of summary size of calculation hash. It's a defence against overuse CPU resources by non-appropriate data
4
+ attr_accessor :max_pair_hash_size
5
+ attr_reader :first, :second
6
+ def initialize(first, second)
7
+ @first = first
8
+ @second = second
9
+ end
10
+
11
+ def jaccard(threshold_first, threshold_second)
12
+ self.map_each_alignment do |alignment|
13
+ alignment.alignment_infos.merge( alignment.jaccard(threshold_first, threshold_second) )
14
+ end.max_by {|alignment_infos| alignment_infos[:similarity] }
15
+ end
16
+
17
+ def jaccard_by_pvalue(pvalue)
18
+ threshold_first = first.threshold(pvalue)
19
+ threshold_second = second.threshold(pvalue)
20
+ jaccard(threshold_first, threshold_second)
21
+ end
22
+
23
+ def jaccard_by_weak_pvalue(pvalue)
24
+ threshold_first = first.weak_threshold(pvalue)
25
+ threshold_second = second.weak_threshold(pvalue)
26
+ jaccard(threshold_first, threshold_second)
27
+ end
28
+
29
+ def each_alignment
30
+ (-second.length..first.length).to_a.product([:direct,:revcomp]) do |shift, orientation|
31
+ yield PWMCompareAligned.new(first, second, shift, orientation).tap{|x| x.max_pair_hash_size = max_pair_hash_size }
32
+ end
33
+ end
34
+
35
+ include Enumerable
36
+ alias_method :each, :each_alignment
37
+ alias_method :map_each_alignment, :map
38
+ end
39
+ end