macroape 4.0.2 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +17 -17
  3. data/Gemfile +4 -4
  4. data/LICENSE +22 -22
  5. data/README.md +70 -70
  6. data/Rakefile.rb +49 -49
  7. data/TODO.txt +46 -46
  8. data/benchmark/benchmark_helper.rb +4 -4
  9. data/benchmark/similarity_benchmark.rb +52 -52
  10. data/bin/align_motifs +4 -4
  11. data/bin/eval_alignment +4 -4
  12. data/bin/eval_similarity +4 -4
  13. data/bin/find_pvalue +4 -4
  14. data/bin/find_threshold +4 -4
  15. data/bin/preprocess_collection +4 -4
  16. data/bin/scan_collection +4 -4
  17. data/lib/macroape.rb +14 -11
  18. data/lib/macroape/aligned_pair_intersection.rb +61 -62
  19. data/lib/macroape/cli.rb +191 -188
  20. data/lib/macroape/cli/align_motifs.rb +120 -100
  21. data/lib/macroape/cli/eval_alignment.rb +157 -156
  22. data/lib/macroape/cli/eval_similarity.rb +138 -137
  23. data/lib/macroape/cli/find_pvalue.rb +93 -87
  24. data/lib/macroape/cli/find_threshold.rb +103 -96
  25. data/lib/macroape/cli/preprocess_collection.rb +169 -161
  26. data/lib/macroape/cli/scan_collection.rb +171 -163
  27. data/lib/macroape/collection.rb +29 -0
  28. data/lib/macroape/motif_with_thresholds.rb +18 -0
  29. data/lib/macroape/pwm_compare.rb +39 -44
  30. data/lib/macroape/pwm_compare_aligned.rb +139 -130
  31. data/lib/macroape/{counting.rb → pwm_counting.rb} +175 -121
  32. data/lib/macroape/support/inverf.rb +13 -0
  33. data/lib/macroape/support/partial_sums.rb +17 -0
  34. data/lib/macroape/version.rb +4 -4
  35. data/macroape.gemspec +19 -19
  36. data/spec/count_distribution_spec.rb +112 -109
  37. data/spec/inverf_spec.rb +23 -0
  38. data/spec/partial_sums_spec.rb +28 -0
  39. data/spec/spec_helper.rb +11 -11
  40. data/test/align_motifs_test.rb +42 -43
  41. data/test/data/AHR_si.pwm +10 -10
  42. data/test/data/KLF3_f1.pcm +16 -16
  43. data/test/data/KLF3_f1.pwm +16 -16
  44. data/test/data/KLF4_f2.pcm +11 -11
  45. data/test/data/KLF4_f2.pwm +11 -11
  46. data/test/data/KLF4_f2_scan_results_all.txt +2 -2
  47. data/test/data/KLF4_f2_scan_results_default_cutoff.txt +1 -1
  48. data/test/data/KLF4_f2_scan_results_precise_mode.txt +2 -2
  49. data/test/data/SP1_f1.pcm +12 -12
  50. data/test/data/SP1_f1.pwm +12 -12
  51. data/test/data/SP1_f1_revcomp.pcm +12 -12
  52. data/test/data/SP1_f1_revcomp.pwm +12 -12
  53. data/test/data/medium_motif.pwm +8 -8
  54. data/test/data/short_motif.pwm +7 -7
  55. data/test/data/test_collection.yaml +231 -214
  56. data/test/data/test_collection/GABPA_f1.pwm +14 -14
  57. data/test/data/test_collection/KLF4_f2.pwm +10 -10
  58. data/test/data/test_collection/SP1_f1.pwm +12 -12
  59. data/test/data/test_collection_pcm/GABPA_f1.pcm +14 -14
  60. data/test/data/test_collection_pcm/KLF4_f2.pcm +11 -11
  61. data/test/data/test_collection_pcm/SP1_f1.pcm +12 -12
  62. data/test/data/test_collection_single_file.txt +38 -38
  63. data/test/data/test_collection_single_file_pcm.txt +37 -37
  64. data/test/data/test_collection_weak.yaml +231 -214
  65. data/test/eval_alignment_test.rb +90 -111
  66. data/test/eval_similarity_test.rb +105 -123
  67. data/test/find_pvalue_test.rb +34 -39
  68. data/test/find_threshold_test.rb +87 -91
  69. data/test/preprocess_collection_test.rb +56 -65
  70. data/test/scan_collection_test.rb +42 -48
  71. data/test/test_helper.rb +159 -160
  72. metadata +14 -10
  73. data/test/data/collection_pcm_without_thresholds.yaml +0 -188
  74. data/test/data/collection_without_thresholds.yaml +0 -188
@@ -1,163 +1,171 @@
1
- require_relative '../../macroape'
2
- require 'yaml'
3
-
4
- module Macroape
5
- module CLI
6
- module ScanCollection
7
- def self.main(argv)
8
- doc = <<-EOS.strip_doc
9
- Command-line format:
10
- #{run_tool_cmd} <pat-file> <collection> [options]
11
-
12
- Options:
13
- [-p <P-value>]
14
- [-c <similarity cutoff>] minimal similarity to be included in output, '-c 0.05' by default, [--all] to print all results
15
- [--precise [<level>]] minimal similarity to check on the second pass in precise mode, off by default, '--precise 0.01' if level is not set
16
- [--silent] - hide current progress information during scan (printed to stderr by default)
17
- [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
18
- [--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
19
- [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
20
-
21
- Output format:
22
- <name> <jaccard index> <shift> <overlap> <orientation> ['*' in case that result was calculated on the second pass (in precise mode), '.' otherwise]
23
- Attention! Name can contain whitespace characters.
24
- Attention! The shift and orientation are reported for the collection matrix relative to the query matrix.
25
-
26
- Example:
27
- #{run_tool_cmd} motifs/KLF4_f2.pat hocomoco_ad_uniform.yaml
28
- #{run_tool_cmd} motifs/KLF4_f2.pat hocomoco_ad_uniform.yaml -p 0.0005 --precise 0.03
29
- EOS
30
-
31
- if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
32
- $stderr.puts doc
33
- exit
34
- end
35
-
36
- data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
37
- filename = argv.shift
38
- collection_file = argv.shift
39
- raise 'No input. You should specify input file with matrix' unless filename
40
- raise 'No input. You should specify input file with collection' unless collection_file
41
- raise "Collection file #{collection_file} doesn't exist" unless File.exist?(collection_file)
42
-
43
- pvalue = 0.0005
44
- cutoff = 0.05 # minimal similarity to output
45
- collection = YAML.load_file(collection_file)
46
- collection_background = collection.parameters.background
47
- query_background = collection_background
48
-
49
- rough_discretization = collection.parameters.rough_discretization
50
- precise_discretization = collection.parameters.precise_discretization
51
- max_hash_size = 10000000
52
- max_pair_hash_size = 10000
53
- pvalue_boundary = :upper
54
-
55
- silent = false
56
- precision_mode = :rough
57
- until argv.empty?
58
- case argv.shift
59
- when '-b'
60
- query_background = argv.shift.split(',').map(&:to_f)
61
- raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless query_background == query_background.reverse
62
- when '-p'
63
- pvalue = argv.shift.to_f
64
- when '--max-hash-size'
65
- max_hash_size = argv.shift.to_i
66
- when '--max-2d-hash-size'
67
- max_pair_hash_size = argv.shift.to_i
68
- when '-c'
69
- cutoff = argv.shift.to_f
70
- when '--all'
71
- cutoff = 0.0
72
- when '--silent'
73
- silent = true
74
- when '--boundary'
75
- pvalue_boundary = argv.shift.to_sym
76
- raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
77
- when '--precise'
78
- precision_mode = :precise
79
- begin
80
- Float(argv.first)
81
- minimal_similarity = argv.shift.to_f
82
- rescue
83
- minimal_similarity = 0.05
84
- end
85
- end
86
- end
87
-
88
- raise "Thresholds for pvalue #{pvalue} aren't presented in collection (#{collection.parameters.pvalues.join(', ')}). Use one of listed pvalues or recalculate the collection with needed pvalue" unless collection.parameters.pvalues.include? pvalue
89
-
90
- if filename == '.stdin'
91
- query_input = $stdin.read
92
- else
93
- raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
94
- query_input = File.read(filename)
95
- end
96
-
97
- query_pwm = data_model.new(query_input).set_parameters(background: query_background).to_pwm
98
- query_pwm.set_parameters(background: query_background, max_hash_size: max_hash_size)
99
-
100
- query_pwm_rough = query_pwm.discrete(rough_discretization)
101
- query_pwm_precise = query_pwm.discrete(precise_discretization)
102
-
103
- if pvalue_boundary == :lower
104
- query_threshold_rough, query_rough_real_pvalue = query_pwm_rough.threshold_and_real_pvalue(pvalue)
105
- query_threshold_precise, query_precise_real_pvalue = query_pwm_precise.threshold_and_real_pvalue(pvalue)
106
- else
107
- query_threshold_rough, query_rough_real_pvalue = query_pwm_rough.weak_threshold_and_real_pvalue(pvalue)
108
- query_threshold_precise, query_precise_real_pvalue = query_pwm_precise.weak_threshold_and_real_pvalue(pvalue)
109
- end
110
-
111
- if query_precise_real_pvalue == 0
112
- $stderr.puts "Query motif #{query_pwm.name} gives 0 recognized words for a given P-value of #{pvalue} with the precise discretization level of #{precise_discretization}. It's impossible to scan collection for this motif"
113
- return
114
- end
115
-
116
- if query_rough_real_pvalue == 0
117
- query_pwm_rough, query_threshold_rough = query_pwm_precise, query_threshold_precise
118
- $stderr.puts "Query motif #{query_pwm.name} gives 0 recognized words for a given P-value of #{pvalue} with the rough discretization level of #{rough_discretization}. Forcing precise discretization level of #{precise_discretization}"
119
- end
120
-
121
- similarities = {}
122
- precision_file_mode = {}
123
-
124
- collection.each_with_index do |motif, index|
125
- name = motif.name
126
- $stderr.puts "Testing motif #{name} (#{index+1} of #{collection.size}, #{index*100/collection.size}% complete)" unless silent
127
- motif.set_parameters(background: collection_background, max_hash_size: max_hash_size)
128
- if motif.rough[pvalue]
129
- collection_pwm_rough = motif.pwm.discrete(rough_discretization)
130
- collection_threshold_rough = motif.rough[pvalue] * rough_discretization
131
- info = Macroape::PWMCompare.new(query_pwm_rough, collection_pwm_rough).set_parameters(max_pair_hash_size: max_pair_hash_size).jaccard(query_threshold_rough, collection_threshold_rough)
132
- info[:precision_mode] = :rough
133
- end
134
- if !motif.rough[pvalue] || (precision_mode == :precise) && (info[:similarity] >= minimal_similarity)
135
- collection_pwm_precise = motif.pwm.discrete(precise_discretization)
136
- collection_threshold_precise = motif.precise[pvalue] * precise_discretization
137
- info = Macroape::PWMCompare.new(query_pwm_precise, collection_pwm_precise).set_parameters(max_pair_hash_size: max_pair_hash_size).jaccard(query_threshold_precise, collection_threshold_precise)
138
- info[:precision_mode] = :precise
139
- end
140
- info[:name] = name
141
- similarities[name] = info
142
- end
143
-
144
- $stderr.puts "100% complete" unless silent
145
-
146
- similarities_to_output = similarities.sort_by{|name, info| info[:similarity] }.reverse.select{|name,info| info[:similarity] >= cutoff }.map{|name,info|info}
147
- puts Helper.scan_collection_infos_string( similarities_to_output,
148
- {cutoff: cutoff,
149
- precision_mode: precision_mode,
150
- rough_discretization: rough_discretization,
151
- precise_discretization: precise_discretization,
152
- minimal_similarity: minimal_similarity,
153
- pvalue: pvalue,
154
- pvalue_boundary: pvalue_boundary,
155
- collection_background: collection_background,
156
- query_background: query_background} )
157
- rescue => err
158
- $stderr.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
159
- end
160
-
161
- end
162
- end
163
- end
1
+ require_relative '../../macroape'
2
+ require 'yaml'
3
+
4
module Macroape
  module CLI
    # Command-line tool that compares one query motif against every motif of a
    # precomputed collection (loaded from a YAML file) and prints the collection
    # motifs whose Jaccard similarity to the query reaches the cutoff.
    module ScanCollection
      # Parses the command line, scans the collection and prints the report.
      #
      # @param argv [Array<String>] command-line arguments. The array is consumed
      #   destructively (`delete`/`shift`). The first two positional arguments are
      #   the query matrix file and the collection file; see the help text below
      #   for the options.
      # @return [nil]
      #
      # With an empty argv or a help flag the help text is printed to stderr and
      # the process exits. Any StandardError raised while parsing or scanning is
      # not propagated: the method-level rescue prints the message, the first
      # five backtrace lines and the help text to stderr.
      def self.main(argv)
        # NOTE(review): whitespace inside this heredoc is post-processed by
        # `strip_doc` (defined elsewhere) - confirm the rendered layout.
        doc = <<-EOS.strip_doc
          Command-line format:
          #{run_tool_cmd} <pat-file> <collection> [options]

          Options:
          [-p <P-value>]
          [-c <similarity cutoff>] minimal similarity to be included in output, '-c 0.05' by default, [--all] to print all results
          [--precise [<level>]] minimal similarity to check on the second pass in precise mode, off by default, '--precise 0.05' if level is not set
          [--silent] - hide current progress information during scan (printed to stderr by default)
          [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
          [--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
          [-b <background probabilities>] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25

          Output format:
          <name> <jaccard index> <shift> <overlap> <orientation> ['*' in case that result was calculated on the second pass (in precise mode), '.' otherwise]
          Attention! Name can contain whitespace characters.
          Attention! The shift and orientation are reported for the collection matrix relative to the query matrix.

          Example:
          #{run_tool_cmd} motifs/KLF4_f2.pat hocomoco_ad_uniform.yaml
          #{run_tool_cmd} motifs/KLF4_f2.pat hocomoco_ad_uniform.yaml -p 0.0005 --precise 0.03
        EOS

        if argv.empty? || ['-h', '--h', '-help', '--help'].any? { |help_option| argv.include?(help_option) }
          $stderr.puts doc
          exit
        end

        # '--pcm' may appear anywhere, so it is removed before the positional
        # arguments are shifted off.
        data_model = argv.delete('--pcm') ? :pcm : :pwm
        filename = argv.shift
        collection_file = argv.shift
        raise 'No input. You should specify input file with matrix' unless filename
        raise 'No input. You should specify input file with collection' unless collection_file
        raise "Collection file #{collection_file} doesn't exist" unless File.exist?(collection_file)

        pvalue = 0.0005
        cutoff = 0.05 # minimal similarity to output

        # NOTE(review): YAML.load_file deserializes whatever the file describes;
        # only pass collection files from a trusted source. Verify how this call
        # behaves with the YAML library version in use.
        collection = YAML.load_file(collection_file)
        collection_background = collection.background
        query_background = collection_background # may be overridden by '-b'

        rough_discretization = collection.rough_discretization
        precise_discretization = collection.precise_discretization
        max_hash_size = 10000000
        max_pair_hash_size = 10000
        pvalue_boundary = :upper

        silent = false
        precision_mode = :rough
        minimal_similarity = nil # set only by '--precise'

        # Options that are not listed here are silently skipped.
        until argv.empty?
          case argv.shift
          when '-b'
            query_background = Bioinform::Background.from_string(argv.shift)
            raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless query_background.symmetric?
          when '-p'
            pvalue = argv.shift.to_f
          when '--max-hash-size'
            max_hash_size = argv.shift.to_i
          when '--max-2d-hash-size'
            max_pair_hash_size = argv.shift.to_i
          when '-c'
            cutoff = argv.shift.to_f
          when '--all'
            cutoff = 0.0
          when '--silent'
            silent = true
          when '--boundary'
            pvalue_boundary = argv.shift.to_sym
            raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
          when '--precise'
            precision_mode = :precise
            begin
              # The level is optional: the next argument is consumed only when
              # it parses as a number.
              Float(argv.first)
              minimal_similarity = argv.shift.to_f
            rescue ArgumentError, TypeError
              # ArgumentError: next argument is not numeric; TypeError: no next argument.
              minimal_similarity = 0.05
            end
          end
        end

        raise "Thresholds for pvalue #{pvalue} aren't presented in collection (#{collection.pvalues.join(', ')}). Use one of listed pvalues or recalculate the collection with needed pvalue" unless collection.pvalues.include? pvalue

        raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
        query_input = File.read(filename)

        query_input = Bioinform::MatrixParser.new.parse!(query_input)
        case data_model
        when :pcm
          query_pcm = Bioinform::MotifModel::PCM.new(query_input[:matrix]).named(query_input[:name])
          query_pwm = Bioinform::ConversionAlgorithms::PCM2PWMConverter.new(pseudocount: :log, background: query_background).convert(query_pcm)
        when :pwm
          query_pwm = Bioinform::MotifModel::PWM.new(query_input[:matrix]).named(query_input[:name])
        end

        query_pwm_rough = query_pwm.discreted(rough_discretization)
        query_pwm_rough_counting = Macroape::PWMCounting.new(query_pwm_rough, background: query_background, max_hash_size: max_hash_size)
        query_pwm_precise = query_pwm.discreted(precise_discretization)
        query_pwm_precise_counting = Macroape::PWMCounting.new(query_pwm_precise, background: query_background, max_hash_size: max_hash_size)

        if pvalue_boundary == :lower
          query_threshold_rough, query_rough_real_pvalue = query_pwm_rough_counting.threshold_and_real_pvalue(pvalue)
          query_threshold_precise, query_precise_real_pvalue = query_pwm_precise_counting.threshold_and_real_pvalue(pvalue)
        else
          query_threshold_rough, query_rough_real_pvalue = query_pwm_rough_counting.weak_threshold_and_real_pvalue(pvalue)
          query_threshold_precise, query_precise_real_pvalue = query_pwm_precise_counting.weak_threshold_and_real_pvalue(pvalue)
        end

        if query_precise_real_pvalue == 0
          $stderr.puts "Query motif #{query_pwm.name} gives 0 recognized words for a given P-value of #{pvalue} with the precise discretization level of #{precise_discretization}. It's impossible to scan collection for this motif"
          return
        end

        if query_rough_real_pvalue == 0
          # The rough pass is run with the precise query model and threshold.
          query_pwm_rough_counting, query_threshold_rough = query_pwm_precise_counting, query_threshold_precise
          $stderr.puts "Query motif #{query_pwm.name} gives 0 recognized words for a given P-value of #{pvalue} with the rough discretization level of #{rough_discretization}. Forcing precise discretization level of #{precise_discretization}"
        end

        similarities = {}

        collection.motifs.each_with_index do |motif_info, index|
          motif = motif_info.model
          $stderr.puts "Testing motif #{motif.name} (#{index+1} of #{collection.size}, #{index*100/collection.size}% complete)" unless silent

          info = nil
          rough_threshold = motif_info.rough[pvalue]

          # First pass: rough discretization, when the collection stores a
          # rough threshold for this P-value.
          if rough_threshold
            collection_pwm_rough = motif.discreted(rough_discretization)
            collection_pwm_rough_counting = Macroape::PWMCounting.new(collection_pwm_rough, background: collection_background, max_hash_size: max_hash_size)

            collection_threshold_rough = rough_threshold * rough_discretization
            rough_comparison = Macroape::PWMCompare.new(query_pwm_rough_counting, collection_pwm_rough_counting)
            rough_comparison.max_pair_hash_size = max_pair_hash_size
            info = rough_comparison.jaccard(query_threshold_rough, collection_threshold_rough)
            info[:precision_mode] = :rough
          end

          # Second pass: precise discretization, when there was no rough result
          # or when precise mode is on and the rough similarity is high enough.
          if !rough_threshold || ((precision_mode == :precise) && (info[:similarity] >= minimal_similarity))
            collection_pwm_precise = motif.discreted(precise_discretization)
            collection_pwm_precise_counting = Macroape::PWMCounting.new(collection_pwm_precise, background: collection_background, max_hash_size: max_hash_size)

            collection_threshold_precise = motif_info.precise[pvalue] * precise_discretization
            precise_comparison = Macroape::PWMCompare.new(query_pwm_precise_counting, collection_pwm_precise_counting)
            precise_comparison.max_pair_hash_size = max_pair_hash_size
            info = precise_comparison.jaccard(query_threshold_precise, collection_threshold_precise)
            info[:precision_mode] = :precise
          end

          info[:name] = motif.name
          # Keyed by name: a later motif with the same name replaces an earlier one.
          similarities[motif.name] = info
        end

        $stderr.puts "100% complete" unless silent

        # Most similar first, keeping only results that reach the cutoff.
        similarities_to_output = similarities.sort_by { |_name, info| info[:similarity] }
                                             .reverse
                                             .select { |_name, info| info[:similarity] >= cutoff }
                                             .map { |_name, info| info }
        puts Helper.scan_collection_infos_string(
          similarities_to_output,
          {
            cutoff: cutoff,
            precision_mode: precision_mode,
            rough_discretization: rough_discretization,
            precise_discretization: precise_discretization,
            minimal_similarity: minimal_similarity,
            pvalue: pvalue,
            pvalue_boundary: pvalue_boundary,
            collection_background: collection_background,
            query_background: query_background
          }
        )
      rescue => err
        $stderr.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
      end

    end
  end
end
@@ -0,0 +1,29 @@
1
module Macroape
  # A list of motifs together with the settings stored alongside them:
  # rough/precise discretization, background and pvalues.
  class Collection
    # Attributes that take part in equality comparison, in comparison order.
    COMPARED_ATTRIBUTES = [:motifs, :rough_discretization, :precise_discretization, :background, :pvalues].freeze

    attr_accessor :motifs, :rough_discretization, :precise_discretization, :background, :pvalues

    # @param options [Hash] optional initial values under the keys :motifs,
    #   :rough_discretization, :precise_discretization, :background, :pvalues.
    #   :motifs defaults to an empty array; the others default to nil.
    def initialize(options = {})
      @motifs = options[:motifs] || []
      @rough_discretization = options[:rough_discretization]
      @precise_discretization = options[:precise_discretization]
      @background = options[:background]
      @pvalues = options[:pvalues]
    end

    # Two collections are equal when all compared attributes are equal.
    #
    # @param other [Object] object to compare with
    # @return [Boolean] false (instead of raising NoMethodError) when +other+
    #   does not provide one of the compared attributes, e.g. when it is nil
    def ==(other)
      COMPARED_ATTRIBUTES.all? do |attribute|
        other.respond_to?(attribute) && public_send(attribute) == other.public_send(attribute)
      end
    end

    # Appends a motif to the collection.
    #
    # @param motif_with_thresholds [Object] element to append to +motifs+
    # @return [Array] the motifs array after appending
    def <<(motif_with_thresholds)
      @motifs << motif_with_thresholds
    end

    # @return [Integer] number of motifs in the collection
    def size
      motifs.size
    end
  end
end
@@ -0,0 +1,18 @@
1
module Macroape
  # Bundles a motif model with its rough and precise threshold data.
  class MotifWithThresholds
    # Attributes that take part in equality comparison, in comparison order.
    COMPARED_ATTRIBUTES = [:model, :rough, :precise].freeze

    attr_accessor :model
    attr_accessor :rough, :precise

    # @param model [Object] the motif model
    # @param options [Hash] optional :rough and :precise values (nil when absent)
    def initialize(model, options = {})
      @model = model
      @rough = options[:rough]
      @precise = options[:precise]
    end

    # Equal when model, rough and precise are all equal.
    #
    # @param other [Object] object to compare with
    # @return [Boolean] false (instead of raising NoMethodError) when +other+
    #   does not provide one of the compared attributes, e.g. when it is nil
    def ==(other)
      COMPARED_ATTRIBUTES.all? do |attribute|
        other.respond_to?(attribute) && public_send(attribute) == other.public_send(attribute)
      end
    end
  end
end
@@ -1,44 +1,39 @@
1
- require 'bioinform/support/parameters'
2
-
3
- module Macroape
4
- class PWMCompare
5
- include Bioinform::Parameters
6
- # sets or gets limit of summary size of calculation hash. It's a defence against overuse CPU resources by non-appropriate data
7
- make_parameters :max_pair_hash_size
8
-
9
- attr_reader :first, :second, :parameters
10
- def initialize(first, second)
11
- @parameters = OpenStruct.new
12
- @first = first
13
- @second = second
14
- end
15
-
16
- def jaccard(threshold_first, threshold_second)
17
- self.map_each_alignment do |alignment|
18
- alignment.alignment_infos.merge( alignment.jaccard(threshold_first, threshold_second) )
19
- end.max_by {|alignment_infos| alignment_infos[:similarity] }
20
- end
21
-
22
- def jaccard_by_pvalue(pvalue)
23
- threshold_first = first.threshold(pvalue)
24
- threshold_second = second.threshold(pvalue)
25
- jaccard(threshold_first, threshold_second)
26
- end
27
-
28
- def jaccard_by_weak_pvalue(pvalue)
29
- threshold_first = first.weak_threshold(pvalue)
30
- threshold_second = second.weak_threshold(pvalue)
31
- jaccard(threshold_first, threshold_second)
32
- end
33
-
34
- def each_alignment
35
- (-second.length..first.length).to_a.product([:direct,:revcomp]) do |shift, orientation|
36
- yield PWMCompareAligned.new(first, second, shift, orientation).set_parameters(max_pair_hash_size: max_pair_hash_size)
37
- end
38
- end
39
-
40
- include Enumerable
41
- alias_method :each, :each_alignment
42
- alias_method :map_each_alignment, :map
43
- end
44
- end
1
module Macroape
  # Compares two motif models over every relative alignment (shift and
  # orientation) and reports the alignment with the highest similarity.
  class PWMCompare
    # sets or gets limit of summary size of calculation hash. It's a defence against overuse CPU resources by non-appropriate data
    attr_accessor :max_pair_hash_size
    attr_reader :first, :second

    # @param first [Object] first model; must respond to +length+
    # @param second [Object] second model; must respond to +length+
    def initialize(first, second)
      @first = first
      @second = second
    end

    # Evaluates every alignment and picks the one with the largest :similarity.
    #
    # @param threshold_first [Numeric] threshold passed through for the first model
    # @param threshold_second [Numeric] threshold passed through for the second model
    # @return [Hash] the alignment infos merged with the alignment's jaccard result
    def jaccard(threshold_first, threshold_second)
      map_each_alignment do |alignment|
        alignment.alignment_infos.merge(alignment.jaccard(threshold_first, threshold_second))
      end.max_by { |alignment_infos| alignment_infos[:similarity] }
    end

    # Same as #jaccard, with both thresholds obtained via +threshold(pvalue)+.
    def jaccard_by_pvalue(pvalue)
      threshold_first = first.threshold(pvalue)
      threshold_second = second.threshold(pvalue)
      jaccard(threshold_first, threshold_second)
    end

    # Same as #jaccard, with both thresholds obtained via +weak_threshold(pvalue)+.
    def jaccard_by_weak_pvalue(pvalue)
      threshold_first = first.weak_threshold(pvalue)
      threshold_second = second.weak_threshold(pvalue)
      jaccard(threshold_first, threshold_second)
    end

    # Yields a PWMCompareAligned for every shift in -second.length..first.length
    # combined with both orientations (:direct, then :revcomp), each configured
    # with this object's max_pair_hash_size.
    #
    # @yieldparam alignment [PWMCompareAligned]
    # @return [Enumerator] when called without a block (previously this raised
    #   LocalJumpError, which also affected the aliased Enumerable#each)
    def each_alignment
      return enum_for(:each_alignment) unless block_given?

      (-second.length..first.length).to_a.product([:direct, :revcomp]) do |shift, orientation|
        yield PWMCompareAligned.new(first, second, shift, orientation).tap { |x| x.max_pair_hash_size = max_pair_hash_size }
      end
    end

    include Enumerable
    alias_method :each, :each_alignment
    alias_method :map_each_alignment, :map
  end
end