macroape 3.3.7 → 3.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. data/README.md +2 -2
  2. data/Rakefile.rb +6 -6
  3. data/TODO.txt +23 -3
  4. data/benchmark/similarity_benchmark.rb +18 -18
  5. data/lib/macroape/aligned_pair_intersection.rb +4 -4
  6. data/lib/macroape/cli/align_motifs.rb +34 -28
  7. data/lib/macroape/cli/eval_alignment.rb +73 -47
  8. data/lib/macroape/cli/eval_similarity.rb +65 -40
  9. data/lib/macroape/cli/find_pvalue.rb +30 -34
  10. data/lib/macroape/cli/find_threshold.rb +52 -41
  11. data/lib/macroape/cli/preprocess_collection.rb +68 -58
  12. data/lib/macroape/cli/scan_collection.rb +89 -73
  13. data/lib/macroape/cli.rb +184 -1
  14. data/lib/macroape/counting.rb +31 -5
  15. data/lib/macroape/pwm_compare.rb +8 -2
  16. data/lib/macroape/pwm_compare_aligned.rb +15 -10
  17. data/lib/macroape/version.rb +2 -1
  18. data/macroape.gemspec +2 -1
  19. data/spec/count_distribution_spec.rb +11 -11
  20. data/test/align_motifs_test.rb +16 -4
  21. data/test/data/{AHR_si.pat → AHR_si.pwm} +0 -0
  22. data/test/data/{KLF3_f1.pat → KLF3_f1.pwm} +0 -0
  23. data/test/data/{KLF4_f2.pat → KLF4_f2.pwm} +0 -0
  24. data/test/data/KLF4_f2_scan_results_all.txt +1 -2
  25. data/test/data/KLF4_f2_scan_results_default_cutoff.txt +1 -2
  26. data/test/data/KLF4_f2_scan_results_precise_mode.txt +1 -2
  27. data/test/data/KLF4_f2_scan_results_weak_threshold.txt +2 -0
  28. data/test/data/{SP1_f1.pat → SP1_f1.pwm} +0 -0
  29. data/test/data/{SP1_f1_revcomp.pat → SP1_f1_revcomp.pwm} +0 -0
  30. data/test/data/collection_pcm_without_thresholds.yaml +186 -183
  31. data/test/data/collection_without_thresholds.yaml +186 -183
  32. data/test/data/{medium_motif.pat → medium_motif.pwm} +0 -0
  33. data/test/data/{short_motif.pat → short_motif.pwm} +0 -0
  34. data/test/data/test_collection/{GABPA_f1.pat → GABPA_f1.pwm} +0 -0
  35. data/test/data/test_collection/{KLF4_f2.pat → KLF4_f2.pwm} +0 -0
  36. data/test/data/test_collection/{SP1_f1.pat → SP1_f1.pwm} +0 -0
  37. data/test/data/test_collection.yaml +179 -176
  38. data/test/data/test_collection_weak.yaml +214 -0
  39. data/test/eval_alignment_test.rb +97 -21
  40. data/test/eval_similarity_test.rb +104 -26
  41. data/test/find_pvalue_test.rb +22 -9
  42. data/test/find_threshold_test.rb +76 -25
  43. data/test/preprocess_collection_test.rb +16 -21
  44. data/test/scan_collection_test.rb +26 -14
  45. data/test/test_helper.rb +96 -12
  46. metadata +44 -24
@@ -4,64 +4,66 @@ require 'yaml'
4
4
  module Macroape
5
5
  module CLI
6
6
  module ScanCollection
7
-
8
7
  def self.main(argv)
9
- help_string = %q{
10
- Command-line format:
11
- ruby scan_collection.rb <pat-file> <collection> [options]
12
- or in linux
13
- cat <pat-file> | ruby scan_collection.rb .stdin <collection> [options]
14
- or on windows
15
- type <pat-file> | ruby scan_collection.rb .stdin <collection> [options]
16
-
17
- Options:
18
- [-p <P-value>]
19
- [-c <similarity cutoff (minimal similarity to be included in output)> ] or [--all], '-c 0.05' by default
20
- [--precise [<level, minimal similarity to check on a more precise discretization level on the second pass>]], off by default, '--precise 0.01' if level is not set
21
- [--silent] - don't show current progress information during scan (by default this information's written into stderr)
22
-
23
- Output format:
24
- <name> <similarity jaccard index> <shift> <overlap> <orientation> * [in case that result calculated on the second pass(in precise mode)]
25
- Attention! Name can contain whitespace characters.
26
- Attention! The shift and orientation are reported for the collection matrix relative to the query matrix.
27
-
28
- Example:
29
- ruby scan_collection.rb motifs/KLF4.pat collection.yaml -p 0.005
30
- or in linux
31
- cat motifs/KLF4.pat | ruby scan_collection.rb .stdin collection.yaml -p 0.005 --precise 0.03
32
- }
33
-
34
- if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
35
- STDERR.puts help_string
8
+ doc = <<-EOS.strip_doc
9
+ Command-line format:
10
+ #{run_tool_cmd} <pat-file> <collection> [options]
11
+
12
+ Options:
13
+ [-p <P-value>]
14
+ [-c <similarity cutoff>] minimal similarity to be included in output, '-c 0.05' by default, [--all] to print all results
15
+ [--precise [<level>]] minimal similarity to check on the second pass in precise mode, off by default, '--precise 0.01' if level is not set
16
+ [--silent] - hide current progress information during scan (printed to stderr by default)
17
+ [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
18
+ [--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
19
+ [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
20
+
21
+ Output format:
22
+ <name> <jaccard index> <shift> <overlap> <orientation> ['*' in case that result was calculated on the second pass (in precise mode), '.' otherwise]
23
+ Attention! Name can contain whitespace characters.
24
+ Attention! The shift and orientation are reported for the collection matrix relative to the query matrix.
25
+
26
+ Example:
27
+ #{run_tool_cmd} motifs/KLF4_f2.pat hocomoco_ad_uniform.yaml
28
+ #{run_tool_cmd} motifs/KLF4_f2.pat hocomoco_ad_uniform.yaml -p 0.0005 --precise 0.03
29
+ EOS
30
+
31
+ if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
32
+ STDERR.puts doc
36
33
  exit
37
34
  end
38
35
 
39
36
  data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
40
37
  filename = argv.shift
41
38
  collection_file = argv.shift
42
- raise "No input. You'd specify input source for pat: filename or .stdin" unless filename
43
- raise "No input. You'd specify input file with collection" unless collection_file
39
+ raise 'No input. You should specify input file with matrix' unless filename
40
+ raise 'No input. You should specify input file with collection' unless collection_file
44
41
  raise "Collection file #{collection_file} doesn't exist" unless File.exist?(collection_file)
45
42
 
46
43
  pvalue = 0.0005
47
44
  cutoff = 0.05 # minimal similarity to output
48
45
  collection = YAML.load_file(collection_file)
49
- background_query = collection.parameters.background
50
- max_hash_size = 1000000
51
- max_pair_hash_size = 1000
52
-
46
+ collection_background = collection.parameters.background
47
+ query_background = collection_background
48
+
49
+ rough_discretization = collection.parameters.rough_discretization
50
+ precise_discretization = collection.parameters.precise_discretization
51
+ max_hash_size = 10000000
52
+ max_pair_hash_size = 10000
53
+ pvalue_boundary = :upper
54
+
53
55
  silent = false
54
56
  precision_mode = :rough
55
57
  until argv.empty?
56
58
  case argv.shift
57
- when '-bq'
58
- background_query = argv.shift(4).map(&:to_f)
59
- raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless background_query == background_query.reverse
59
+ when '-b'
60
+ query_background = argv.shift.split(',').map(&:to_f)
61
+ raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless query_background == query_background.reverse
60
62
  when '-p'
61
63
  pvalue = argv.shift.to_f
62
- when '-m'
63
- max_hash_size = argv.shift.to_i
64
- when '-md'
64
+ when '--max-hash-size'
65
+ max_hash_size = argv.shift.to_i
66
+ when '--max-2d-hash-size'
65
67
  max_pair_hash_size = argv.shift.to_i
66
68
  when '-c'
67
69
  cutoff = argv.shift.to_f
@@ -69,6 +71,9 @@ module Macroape
69
71
  cutoff = 0.0
70
72
  when '--silent'
71
73
  silent = true
74
+ when '--boundary'
75
+ pvalue_boundary = argv.shift.to_sym
76
+ raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
72
77
  when '--precise'
73
78
  precision_mode = :precise
74
79
  begin
@@ -81,7 +86,7 @@ module Macroape
81
86
  end
82
87
 
83
88
  raise "Thresholds for pvalue #{pvalue} aren't presented in collection (#{collection.parameters.pvalues.join(', ')}). Use one of listed pvalues or recalculate the collection with needed pvalue" unless collection.parameters.pvalues.include? pvalue
84
-
89
+
85
90
  if filename == '.stdin'
86
91
  query_input = $stdin.read
87
92
  else
@@ -90,58 +95,69 @@ module Macroape
90
95
  end
91
96
 
92
97
  query_pwm = data_model.new(query_input).to_pwm
93
- query_pwm.set_parameters(background: background_query, max_hash_size: max_hash_size)
94
-
95
- query_pwm_rough = query_pwm.discrete(collection.parameters.rough_discretization)
96
- query_pwm_precise = query_pwm.discrete(collection.parameters.precise_discretization)
97
-
98
- query_threshold_rough, query_rough_real_pvalue = query_pwm_rough.threshold_and_real_pvalue(pvalue)
99
- query_threshold_precise, query_precise_real_pvalue = query_pwm_precise.threshold_and_real_pvalue(pvalue)
100
-
98
+ query_pwm.set_parameters(background: query_background, max_hash_size: max_hash_size)
99
+
100
+ query_pwm_rough = query_pwm.discrete(rough_discretization)
101
+ query_pwm_precise = query_pwm.discrete(precise_discretization)
102
+
103
+ if pvalue_boundary == :lower
104
+ query_threshold_rough, query_rough_real_pvalue = query_pwm_rough.threshold_and_real_pvalue(pvalue)
105
+ query_threshold_precise, query_precise_real_pvalue = query_pwm_precise.threshold_and_real_pvalue(pvalue)
106
+ else
107
+ query_threshold_rough, query_rough_real_pvalue = query_pwm_rough.weak_threshold_and_real_pvalue(pvalue)
108
+ query_threshold_precise, query_precise_real_pvalue = query_pwm_precise.weak_threshold_and_real_pvalue(pvalue)
109
+ end
110
+
101
111
  if query_precise_real_pvalue == 0
102
- $stderr.puts "Query motif #{query_pwm.name} gives 0 recognized words for a given P-value of #{pvalue} with the precise discretization level of #{collection.parameters.precise_discretization}. It's impossible to scan collection for this motif"
112
+ $stderr.puts "Query motif #{query_pwm.name} gives 0 recognized words for a given P-value of #{pvalue} with the precise discretization level of #{precise_discretization}. It's impossible to scan collection for this motif"
103
113
  return
104
114
  end
105
-
115
+
106
116
  if query_rough_real_pvalue == 0
107
117
  query_pwm_rough, query_threshold_rough = query_pwm_precise, query_threshold_precise
108
- $stderr.puts "Query motif #{query_pwm.name} gives 0 recognized words for a given P-value of #{pvalue} with the rough discretization level of #{collection.parameters.rough_discretization}. Forcing precise discretization level of #{collection.parameters.precise_discretization}"
118
+ $stderr.puts "Query motif #{query_pwm.name} gives 0 recognized words for a given P-value of #{pvalue} with the rough discretization level of #{rough_discretization}. Forcing precise discretization level of #{precise_discretization}"
109
119
  end
110
120
 
111
121
  similarities = {}
112
122
  precision_file_mode = {}
113
123
 
114
- collection.each do |collection_pwm, pwm_info|
115
- name = collection_pwm.name
116
- STDERR.puts name unless silent
117
- collection_pwm.set_parameters(background: collection.parameters.background, max_hash_size: max_hash_size)
118
- if pwm_info.rough
119
- collection_pwm_rough = collection_pwm.discrete(collection.parameters.rough_discretization)
120
- collection_threshold_rough = pwm_info.rough[pvalue] * collection.parameters.rough_discretization
124
+ collection.each_with_index do |motif, index|
125
+ name = motif.name
126
+ STDERR.puts "Testing motif #{name} (#{index+1} of #{collection.size}, #{index*100/collection.size}% complete)" unless silent
127
+ motif.set_parameters(background: collection_background, max_hash_size: max_hash_size)
128
+ if motif.rough[pvalue]
129
+ collection_pwm_rough = motif.pwm.discrete(rough_discretization)
130
+ collection_threshold_rough = motif.rough[pvalue] * rough_discretization
121
131
  info = Macroape::PWMCompare.new(query_pwm_rough, collection_pwm_rough).set_parameters(max_pair_hash_size: max_pair_hash_size).jaccard(query_threshold_rough, collection_threshold_rough)
122
- precision_file_mode[name] = :rough
132
+ info[:precision_mode] = :rough
123
133
  end
124
- if !pwm_info.rough || (precision_mode == :precise) && (info[:similarity] >= minimal_similarity)
125
- collection_pwm_precise = collection_pwm.discrete(collection.parameters.precise_discretization)
126
- collection_threshold_precise = pwm_info.precise[pvalue] * collection.parameters.precise_discretization
134
+ if !motif.rough[pvalue] || (precision_mode == :precise) && (info[:similarity] >= minimal_similarity)
135
+ collection_pwm_precise = motif.pwm.discrete(precise_discretization)
136
+ collection_threshold_precise = motif.precise[pvalue] * precise_discretization
127
137
  info = Macroape::PWMCompare.new(query_pwm_precise, collection_pwm_precise).set_parameters(max_pair_hash_size: max_pair_hash_size).jaccard(query_threshold_precise, collection_threshold_precise)
128
- precision_file_mode[name] = :precise
138
+ info[:precision_mode] = :precise
129
139
  end
140
+ info[:name] = name
130
141
  similarities[name] = info
131
142
  end
132
143
 
133
- puts "#pwm\tsimilarity\tshift\toverlap\torientation"
134
- similarities.sort_by do |name, info|
135
- info[:similarity]
136
- end.reverse.each do |name, info|
137
- precision_text = (precision_file_mode[name] == :precise) ? "\t*" : ""
138
- puts "#{name}\t#{info[:similarity]}\t#{info[:shift]}\t#{info[:overlap]}\t#{info[:orientation]}#{precision_text}" if info[:similarity] >= cutoff
139
- end
144
+ STDERR.puts "100% complete" unless silent
140
145
 
146
+ similarities_to_output = similarities.sort_by{|name, info| info[:similarity] }.reverse.select{|name,info| info[:similarity] >= cutoff }.map{|name,info|info}
147
+ puts Helper.scan_collection_infos_string( similarities_to_output,
148
+ {cutoff: cutoff,
149
+ precision_mode: precision_mode,
150
+ rough_discretization: rough_discretization,
151
+ precise_discretization: precise_discretization,
152
+ minimal_similarity: minimal_similarity,
153
+ pvalue: pvalue,
154
+ pvalue_boundary: pvalue_boundary,
155
+ collection_background: collection_background,
156
+ query_background: query_background} )
141
157
  rescue => err
142
- STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
158
+ STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
143
159
  end
144
-
160
+
145
161
  end
146
162
  end
147
163
  end
data/lib/macroape/cli.rb CHANGED
@@ -1,5 +1,188 @@
1
+ require 'bioinform/support/strip_doc'
2
+
3
# Core extension used to derive command-line tool names from module names.
class String
  # Converts a CamelCase identifier to snake_case.
  #
  # Every run of consecutive capital letters is lowercased and prefixed with
  # an underscore; an underscore left at the very start of the string is then
  # removed.
  #
  # @example
  #   'ScanCollection'.snake_case # => "scan_collection"
  # @return [String] a new string; the receiver is not modified
  def snake_case
    # \A (not ^): only the beginning of the whole string may lose its
    # underscore. ^ also matches right after a newline, so it could strip an
    # underscore from the start of a later line instead.
    gsub(/[A-Z]+/) { |capitals| "_#{capitals.downcase}" }.sub(/\A_/, '')
  end
end
8
+
9
# Core extension: lets every CLI module describe how it is invoked, so help
# texts can interpolate the right command line.
class Module
  # Command shown in usage messages for this tool.
  #
  # @return [String] "ruby <tool_name>.rb" when Macroape::STANDALONE is truthy,
  #   otherwise the bare tool name
  def run_tool_cmd
    return tool_name unless Macroape::STANDALONE

    "ruby #{tool_name}.rb"
  end

  # Tool name derived from the module name: the last "::"-separated component
  # converted with String#snake_case.
  #
  # @return [String]
  def tool_name
    unqualified_name = name.split('::').last
    unqualified_name.snake_case
  end
end
21
+
1
22
  module Macroape
2
23
  module CLI
3
-
24
# Collects the pieces of a plain-text report and renders them as one string.
#
# A report has up to three sections, rendered in this order:
#   1. parameters       - "# NAME: description" lines, then "# NAME = value" lines
#   2. resulting values - "# NAME: description" lines, then "NAME\tvalue" lines
#   3. table            - column descriptions, a "# "-prefixed tab-separated
#                         header and one tab-separated row per data item
# Empty sections are left out; the remaining sections are separated by a line
# that contains only "#".
class OutputInformation
  # @param data [#map, nil] items the table rows are built from; each item is
  #   indexed (item[key]) with the keys registered through the
  #   add_table_parameter* methods. When nil (or false) no table is rendered.
  # @yield [self] optional configuration block
  def initialize(data = nil)
    @table_parameter_descriptions = []

    @parameter_descriptions = []
    @parameter_value_infos = []

    @resulting_value_descriptions = []
    @resulting_value_infos = []

    # Three parallel arrays describing the table columns.
    @table_headers = []
    @table_rows = []
    @table_rows_callbacks = []

    @data = data
    yield self if block_given?
  end

  # @return [Array<String>] parameter descriptions followed by parameter values
  def parameters_info
    [*@parameter_descriptions, *@parameter_value_infos]
  end

  # @return [Array<String>] resulting-value descriptions followed by the values
  def resulting_values_info
    [*@resulting_value_descriptions, *@resulting_value_infos]
  end

  # Renders the whole report.
  #
  # @return [String] non-empty sections joined by "\n#\n"; "" if all are empty
  def result
    sections = [parameters_info, resulting_values_info, resulting_table].reject(&:empty?)
    sections.map { |section| section.join("\n") }.join("\n#\n")
  end

  # Registers a parameter: one description line and one "# NAME = value" line.
  # A block given by the caller is ignored.
  def add_parameter(param_name, description, value)
    @parameter_descriptions << parameter_description_string(param_name, description)
    @parameter_value_infos << "# #{param_name} = #{value}"
  end

  # Registers a resulting value: one description line and one "NAME\tvalue" line.
  # A block given by the caller is ignored.
  def add_resulting_value(param_name, description, value)
    @resulting_value_descriptions << parameter_description_string(param_name, description)
    @resulting_value_infos << "#{param_name}\t#{value}"
  end

  # Registers a table column together with its description line.
  #
  # @param key_in_hash [Object] key used to fetch the cell from each data item
  # @yield [cell] optional formatter applied to every cell of this column
  def add_table_parameter(param_name, description, key_in_hash, &block)
    @table_parameter_descriptions << parameter_description_string(param_name, description)
    add_table_parameter_without_description(param_name, key_in_hash, &block)
  end

  # Registers a table column that gets no description line.
  #
  # @param key_in_hash [Object] key used to fetch the cell from each data item
  # @yield [cell] optional formatter applied to every cell of this column
  def add_table_parameter_without_description(param_name, key_in_hash, &block)
    @table_headers << param_name
    @table_rows << key_in_hash
    @table_rows_callbacks << block
  end

  # @return [String] a "# NAME: description" line
  def parameter_description_string(param_name, description)
    "# #{param_name}: #{description}"
  end

  # @return [Array<String>] one tab-separated line per data item
  def table_content
    columns = @table_rows.zip(@table_rows_callbacks)
    @data.map do |info|
      cells = columns.map { |row, callback| callback ? callback.call(info[row]) : info[row] }
      cells.join("\t")
    end
  end

  # @return [String] the "# "-prefixed, tab-separated table header
  def header_content
    '# ' + @table_headers.join("\t")
  end

  # @return [Array<String>] table lines (column descriptions, header, rows),
  #   or [] when no data was supplied
  def resulting_table
    @data ? [*@table_parameter_descriptions, header_content, *table_content] : []
  end

  # Registers a background parameter, rendered as a comma-joined list.
  # Nothing is registered when the background is the wordwise one, [1,1,1,1].
  # A block given by the caller is ignored.
  def background_parameter(param_name, description, value)
    add_parameter(param_name, description, value.join(',')) unless value == [1, 1, 1, 1]
  end
end
96
+
97
# Builders for the textual reports printed by the command-line tools.
# Each method fills an OutputInformation and returns its rendered result.
module Helper
  # Report for a comparison of two matrices.
  #
  # @param info [Hash] comparison data; the keys read here are :discretization,
  #   :requested_pvalue, :predefined_threshold_first/_second, :pvalue_boundary,
  #   :first_background, :second_background, :similarity, :tanimoto,
  #   :alignment_length, :shift, :orientation, :text, :recognized_by_both,
  #   :recognized_by_first/_second, :real_pvalue_first/_second and
  #   :threshold_first/_second
  # @return [String] rendered report
  def self.similarity_info_string(info)
    # First and last line of the alignment text are reported as A1 and A2.
    alignment_lines = info[:text].lines.to_a

    OutputInformation.new { |infos|
      infos.add_parameter('V', 'discretization', info[:discretization])
      # The requested P-value is shown unless both thresholds were predefined.
      infos.add_parameter('P', 'requested P-value', info[:requested_pvalue]) unless info[:predefined_threshold_first] && info[:predefined_threshold_second]
      infos.add_parameter('T1', 'threshold for the 1st matrix', info[:predefined_threshold_first]) if info[:predefined_threshold_first]
      infos.add_parameter('T2', 'threshold for the 2nd matrix', info[:predefined_threshold_second]) if info[:predefined_threshold_second]
      infos.add_parameter('PB', 'P-value boundary', info[:pvalue_boundary])
      if info[:first_background] == info[:second_background]
        infos.background_parameter('B', 'background', info[:first_background])
      else
        infos.background_parameter('B1', 'background for the 1st model', info[:first_background])
        infos.background_parameter('B2', 'background for the 2nd model', info[:second_background])
      end

      infos.add_resulting_value('S', 'similarity', info[:similarity])
      infos.add_resulting_value('D', 'distance (1-similarity)', info[:tanimoto])
      infos.add_resulting_value('L', 'length of the alignment', info[:alignment_length])
      infos.add_resulting_value('SH', 'shift of the 2nd PWM relative to the 1st', info[:shift])
      infos.add_resulting_value('OR', 'orientation of the 2nd PWM relative to the 1st', info[:orientation])
      infos.add_resulting_value('A1', 'aligned 1st matrix', alignment_lines.first.strip)
      infos.add_resulting_value('A2', 'aligned 2nd matrix', alignment_lines.last.strip)
      infos.add_resulting_value('W', 'number of words recognized by both models (model = PWM + threshold)', info[:recognized_by_both])
      infos.add_resulting_value('W1', 'number of words recognized by the 1st model', info[:recognized_by_first])
      infos.add_resulting_value('P1', 'P-value for the 1st matrix', info[:real_pvalue_first])
      # Calculated thresholds are results; predefined ones were already listed as parameters.
      infos.add_resulting_value('T1', 'threshold for the 1st matrix', info[:threshold_first]) unless info[:predefined_threshold_first]
      infos.add_resulting_value('W2', 'number of words recognized by the 2nd model', info[:recognized_by_second])
      infos.add_resulting_value('P2', 'P-value for the 2nd matrix', info[:real_pvalue_second])
      infos.add_resulting_value('T2', 'threshold for the 2nd matrix', info[:threshold_second]) unless info[:predefined_threshold_second]
    }.result
  end

  ############################################

  # Report with a table of thresholds.
  #
  # @param data [#map] table items, indexed with :expected_pvalue,
  #   :real_pvalue, :recognized_words and :threshold
  # @param parameters [Hash] reads :discretization, :pvalue_boundary, :background
  # @return [String] rendered report
  def self.threshold_infos_string(data, parameters)
    OutputInformation.new(data) { |infos|
      infos.add_parameter('V', 'discretization value', parameters[:discretization])
      infos.add_parameter('PB', 'P-value boundary', parameters[:pvalue_boundary])
      infos.background_parameter('B', 'background', parameters[:background])

      infos.add_table_parameter('P', 'requested P-value', :expected_pvalue)
      infos.add_table_parameter('AP', 'actual P-value', :real_pvalue)
      # The word-count column is only added for the [1, 1, 1, 1] background.
      infos.add_table_parameter('W', 'number of recognized words', :recognized_words) if parameters[:background] == [1, 1, 1, 1]
      infos.add_table_parameter('T', 'threshold', :threshold)
    }.result
  end

  ############################################

  # Report with a table of collection scan results.
  #
  # @param data [#map] table items, indexed with :name, :similarity, :shift,
  #   :overlap, :orientation and (in precise mode) :precision_mode
  # @param parameters [Hash] reads :cutoff, :pvalue, :pvalue_boundary,
  #   :precision_mode, :rough_discretization, :precise_discretization,
  #   :minimal_similarity, :query_background, :collection_background
  # @return [String] rendered report
  def self.scan_collection_infos_string(data, parameters)
    precise = parameters[:precision_mode] == :precise

    OutputInformation.new(data) { |infos|
      infos.add_parameter('MS', 'minimal similarity to output', parameters[:cutoff])
      infos.add_parameter('P', 'P-value', parameters[:pvalue])
      infos.add_parameter('PB', 'P-value boundary', parameters[:pvalue_boundary])
      if precise
        infos.add_parameter('VR', 'discretization value, rough', parameters[:rough_discretization])
        infos.add_parameter('VP', 'discretization value, precise', parameters[:precise_discretization])
        infos.add_parameter('MP', "minimal similarity for the 2nd pass in 'precise' mode", parameters[:minimal_similarity])
      else
        infos.add_parameter('V', 'discretization value', parameters[:rough_discretization])
      end
      infos.background_parameter('BQ', 'background for query matrix', parameters[:query_background])
      infos.background_parameter('BC', 'background for collection', parameters[:collection_background])

      infos.add_table_parameter_without_description('motif', :name)
      infos.add_table_parameter_without_description('similarity', :similarity)
      infos.add_table_parameter_without_description('shift', :shift)
      infos.add_table_parameter_without_description('overlap', :overlap)
      infos.add_table_parameter_without_description('orientation', :orientation)
      if precise
        # '*' marks rows whose :precision_mode is :precise, '.' all the others.
        infos.add_table_parameter_without_description('precise mode', :precision_mode) { |precision| precision == :precise ? '*' : '.' }
      end
    }.result
  end

  ############################################

  # Report with a table of P-values.
  #
  # @param data [#map] table items, indexed with :threshold,
  #   :number_of_recognized_words and :pvalue
  # @param parameters [Hash] reads :discretization and :background
  # @return [String] rendered report
  def self.find_pvalue_info_string(data, parameters)
    OutputInformation.new(data) { |infos|
      infos.add_parameter('V', 'discretization value', parameters[:discretization])
      infos.background_parameter('B', 'background', parameters[:background])

      infos.add_table_parameter('T', 'threshold', :threshold)
      # The word-count column is only added for the [1, 1, 1, 1] background.
      infos.add_table_parameter('W', 'number of recognized words', :number_of_recognized_words) if parameters[:background] == [1, 1, 1, 1]
      infos.add_table_parameter('P', 'P-value', :pvalue)
    }.result
  end
end
4
187
  end
5
188
  end
@@ -4,13 +4,19 @@ module Bioinform
4
4
  class PWM
5
5
  # sets or gets the size limit of the calculation hash. It's a defence against overuse of CPU resources by inappropriate data
6
6
  make_parameters :max_hash_size
7
-
7
+
8
8
  def threshold(pvalue)
9
9
  thresholds(pvalue){|_, thresh, _| return thresh }
10
10
  end
11
11
  def threshold_and_real_pvalue(pvalue)
12
12
  thresholds(pvalue){|_, thresh, real_pv| return thresh, real_pv }
13
13
  end
14
+ def weak_threshold(pvalue)
15
+ weak_thresholds(pvalue){|_, thresh, _| return thresh }
16
+ end
17
+ def weak_threshold_and_real_pvalue(pvalue)
18
+ weak_thresholds(pvalue){|_, thresh, real_pv| return thresh, real_pv }
19
+ end
14
20
 
15
21
  def thresholds(*pvalues)
16
22
  thresholds_by_pvalues(*pvalues).each do |pvalue,(thresholds, counts)|
@@ -20,11 +26,26 @@ module Bioinform
20
26
  end
21
27
  end
22
28
 
29
+ # "weak" means that the threshold's real P-value is not less than the given P-value, whereas the usual threshold's real P-value is not greater than it
30
+ def weak_thresholds(*pvalues)
31
+ thresholds_by_pvalues(*pvalues).each do |pvalue,(thresholds, counts)|
32
+ threshold = thresholds.begin.to_f
33
+ real_pvalue = counts.begin.to_f / vocabulary_volume
34
+ yield pvalue, threshold, real_pvalue
35
+ end
36
+ end
37
+
38
+
23
39
  def count_distribution_under_pvalue(max_pvalue)
24
40
  cnt_distribution = {}
25
41
  look_for_count = max_pvalue * vocabulary_volume
26
42
  until cnt_distribution.inject(0.0){|sum,(score,count)| sum + count} >= look_for_count
27
- cnt_distribution = count_distribution_after_threshold(threshold_gauss_estimation(max_pvalue))
43
+ begin
44
+ approximate_threshold = threshold_gauss_estimation(max_pvalue)
45
+ rescue
46
+ approximate_threshold = worst_score
47
+ end
48
+ cnt_distribution = count_distribution_after_threshold(approximate_threshold)
28
49
  max_pvalue *=2 # if estimation counted too small amount of words - try to lower threshold estimation by doubling pvalue
29
50
  end
30
51
 
@@ -83,13 +104,18 @@ module Bioinform
83
104
 
84
105
  def counts_by_thresholds(*thresholds)
85
106
  scores = count_distribution_after_threshold(thresholds.min)
86
- thresholds.map{ |threshold|
87
- scores.inject(0.0){|sum,(score,count)| (score >= threshold) ? sum + count : sum}
107
+ thresholds.inject({}){ |hsh, threshold|
108
+ hsh[threshold] = scores.inject(0.0){|sum,(score,count)| (score >= threshold) ? sum + count : sum}
109
+ hsh
88
110
  }
89
111
  end
90
112
 
113
+ def count_by_threshold(threshold)
114
+ counts_by_thresholds(threshold)[threshold]
115
+ end
116
+
91
117
  def pvalue_by_threshold(threshold)
92
- counts_by_thresholds(threshold).first / vocabulary_volume
118
+ count_by_threshold(threshold) / vocabulary_volume
93
119
  end
94
120
  end
95
121
  end
@@ -2,7 +2,7 @@ require 'bioinform/support/parameters'
2
2
 
3
3
  module Macroape
4
4
  class PWMCompare
5
- include Parameters
5
+ include Bioinform::Parameters
6
6
  # sets or gets the limit of the summary size of the calculation hash. It's a defence against overuse of CPU resources by inappropriate data
7
7
  make_parameters :max_pair_hash_size
8
8
 
@@ -18,13 +18,19 @@ module Macroape
18
18
  alignment.alignment_infos.merge( alignment.jaccard(threshold_first, threshold_second) )
19
19
  end.max_by {|alignment_infos| alignment_infos[:similarity] }
20
20
  end
21
-
21
+
22
22
  def jaccard_by_pvalue(pvalue)
23
23
  threshold_first = first.threshold(pvalue)
24
24
  threshold_second = second.threshold(pvalue)
25
25
  jaccard(threshold_first, threshold_second)
26
26
  end
27
27
 
28
+ def jaccard_by_weak_pvalue(pvalue)
29
+ threshold_first = first.weak_threshold(pvalue)
30
+ threshold_second = second.weak_threshold(pvalue)
31
+ jaccard(threshold_first, threshold_second)
32
+ end
33
+
28
34
  def each_alignment
29
35
  (-second.length..first.length).to_a.product([:direct,:revcomp]) do |shift, orientation|
30
36
  yield PWMCompareAligned.new(first, second, shift, orientation).set_parameters(max_pair_hash_size: max_pair_hash_size)
@@ -1,14 +1,14 @@
1
1
  require 'bioinform/support/parameters'
2
- require_relative './aligned_pair_intersection'
2
+ require_relative 'aligned_pair_intersection'
3
3
 
4
4
  module Macroape
5
5
  class PWMCompareAligned
6
- include Parameters
6
+ include Bioinform::Parameters
7
7
  # sets or gets the limit of the summary size of the calculation hash. It's a defence against overuse of CPU resources by inappropriate data
8
8
  make_parameters :max_pair_hash_size
9
9
 
10
10
  attr_reader :first, :second, :length, :shift, :orientation, :first_length, :second_length, :parameters
11
-
11
+
12
12
  def initialize(first_unaligned, second_unaligned, shift, orientation)
13
13
  @parameters = OpenStruct.new
14
14
  @shift, @orientation = shift, orientation
@@ -18,7 +18,7 @@ module Macroape
18
18
 
19
19
  first, second = first_unaligned, second_unaligned
20
20
  second = second.reverse_complement if revcomp?
21
-
21
+
22
22
  if shift > 0
23
23
  second = second.left_augment(shift)
24
24
  else
@@ -28,8 +28,6 @@ module Macroape
28
28
  @first = first.right_augment(@length - first.length)
29
29
  @second = second.right_augment(@length - second.length)
30
30
  end
31
-
32
-
33
31
 
34
32
  def direct?
35
33
  orientation == :direct
@@ -90,8 +88,8 @@ module Macroape
90
88
  end
91
89
 
92
90
  def jaccard(first_threshold, second_threshold)
93
- f = first.counts_by_thresholds(first_threshold).first
94
- s = second.counts_by_thresholds(second_threshold).first
91
+ f = first.count_by_threshold(first_threshold)
92
+ s = second.count_by_threshold(second_threshold)
95
93
  if f == 0 || s == 0
96
94
  return {similarity: -1, tanimoto: -1, recognized_by_both: 0,
97
95
  recognized_by_first: f,
@@ -104,15 +102,22 @@ module Macroape
104
102
  union = f + s - intersect
105
103
  similarity = intersect.to_f / union
106
104
  { similarity: similarity, tanimoto: 1.0 - similarity, recognized_by_both: intersect,
107
- recognized_by_first: f, recognized_by_second: s }
105
+ recognized_by_first: f, recognized_by_second: s,
106
+ real_pvalue_first: f / first.vocabulary_volume, real_pvalue_second: s / second.vocabulary_volume }
108
107
  end
109
-
108
+
110
109
  def jaccard_by_pvalue(pvalue)
111
110
  threshold_first = first.threshold(pvalue)
112
111
  threshold_second = second.threshold(pvalue)
113
112
  jaccard(threshold_first, threshold_second)
114
113
  end
115
114
 
115
+ def jaccard_by_weak_pvalue(pvalue)
116
+ threshold_first = first.weak_threshold(pvalue)
117
+ threshold_second = second.weak_threshold(pvalue)
118
+ jaccard(threshold_first, threshold_second)
119
+ end
120
+
116
121
  def self.calculate_alignment_length(first_len, second_len, shift)
117
122
  if shift > 0
118
123
  [first_len, second_len + shift].max
@@ -1,3 +1,4 @@
1
1
  module Macroape
2
- VERSION = "3.3.7"
2
+ VERSION = "3.3.8"
3
+ STANDALONE = false
3
4
  end
data/macroape.gemspec CHANGED
@@ -15,5 +15,6 @@ Gem::Specification.new do |gem|
15
15
  gem.require_paths = ["lib"]
16
16
  gem.version = Macroape::VERSION
17
17
 
18
- gem.add_dependency('bioinform', '= 0.1.8')
18
+ gem.add_dependency('bioinform', '= 0.1.9')
19
+ gem.add_dependency('docopt', '= 0.5.0')
19
20
  end