macroape 3.3.7 → 3.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. data/README.md +2 -2
  2. data/Rakefile.rb +6 -6
  3. data/TODO.txt +23 -3
  4. data/benchmark/similarity_benchmark.rb +18 -18
  5. data/lib/macroape/aligned_pair_intersection.rb +4 -4
  6. data/lib/macroape/cli/align_motifs.rb +34 -28
  7. data/lib/macroape/cli/eval_alignment.rb +73 -47
  8. data/lib/macroape/cli/eval_similarity.rb +65 -40
  9. data/lib/macroape/cli/find_pvalue.rb +30 -34
  10. data/lib/macroape/cli/find_threshold.rb +52 -41
  11. data/lib/macroape/cli/preprocess_collection.rb +68 -58
  12. data/lib/macroape/cli/scan_collection.rb +89 -73
  13. data/lib/macroape/cli.rb +184 -1
  14. data/lib/macroape/counting.rb +31 -5
  15. data/lib/macroape/pwm_compare.rb +8 -2
  16. data/lib/macroape/pwm_compare_aligned.rb +15 -10
  17. data/lib/macroape/version.rb +2 -1
  18. data/macroape.gemspec +2 -1
  19. data/spec/count_distribution_spec.rb +11 -11
  20. data/test/align_motifs_test.rb +16 -4
  21. data/test/data/{AHR_si.pat → AHR_si.pwm} +0 -0
  22. data/test/data/{KLF3_f1.pat → KLF3_f1.pwm} +0 -0
  23. data/test/data/{KLF4_f2.pat → KLF4_f2.pwm} +0 -0
  24. data/test/data/KLF4_f2_scan_results_all.txt +1 -2
  25. data/test/data/KLF4_f2_scan_results_default_cutoff.txt +1 -2
  26. data/test/data/KLF4_f2_scan_results_precise_mode.txt +1 -2
  27. data/test/data/KLF4_f2_scan_results_weak_threshold.txt +2 -0
  28. data/test/data/{SP1_f1.pat → SP1_f1.pwm} +0 -0
  29. data/test/data/{SP1_f1_revcomp.pat → SP1_f1_revcomp.pwm} +0 -0
  30. data/test/data/collection_pcm_without_thresholds.yaml +186 -183
  31. data/test/data/collection_without_thresholds.yaml +186 -183
  32. data/test/data/{medium_motif.pat → medium_motif.pwm} +0 -0
  33. data/test/data/{short_motif.pat → short_motif.pwm} +0 -0
  34. data/test/data/test_collection/{GABPA_f1.pat → GABPA_f1.pwm} +0 -0
  35. data/test/data/test_collection/{KLF4_f2.pat → KLF4_f2.pwm} +0 -0
  36. data/test/data/test_collection/{SP1_f1.pat → SP1_f1.pwm} +0 -0
  37. data/test/data/test_collection.yaml +179 -176
  38. data/test/data/test_collection_weak.yaml +214 -0
  39. data/test/eval_alignment_test.rb +97 -21
  40. data/test/eval_similarity_test.rb +104 -26
  41. data/test/find_pvalue_test.rb +22 -9
  42. data/test/find_threshold_test.rb +76 -25
  43. data/test/preprocess_collection_test.rb +16 -21
  44. data/test/scan_collection_test.rb +26 -14
  45. data/test/test_helper.rb +96 -12
  46. metadata +44 -24
@@ -3,43 +3,31 @@ require_relative '../../macroape'
3
3
  module Macroape
4
4
  module CLI
5
5
  module FindPValue
6
-
6
+
7
7
  def self.main(argv)
8
- help_string = %q{
8
+ doc = <<-EOS.strip_doc
9
9
  Command-line format:
10
- ruby find_pvalue.rb <pat-file> <threshold list> [options]
11
- or in linux
12
- cat <pat-file> | ruby find_pvalue.rb .stdin <threshold> [options]
13
- or on windows
14
- type <pat-file> | ruby find_pvalue.rb .stdin <threshold> [options]
10
+ #{run_tool_cmd} <pat-file> <threshold list>... [options]
15
11
 
16
12
  Options:
17
13
  [-d <discretization level>]
18
- [-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]
19
-
20
- Output format:
21
- threshold_1 count_1 pvalue_1
22
- threshold_2 count_2 pvalue_2
23
- threshold_3 count_3 pvalue_3
24
- The results are printed out in the same order as in the given threshold list.
14
+ [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
15
+ [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
25
16
 
26
17
  Examples:
27
- ruby find_pvalue.rb motifs/KLF4.pat 7.32 -d 1000 -b 0.2 0.3 0.2 0.3
28
- or on windows
29
- type motifs/KLF4.pat | ruby find_pvalue.rb .stdin 7.32 4.31 5.42
30
- or in linux
31
- cat motifs/KLF4.pat | ruby find_pvalue.rb .stdin 7.32 4.31 5.42
32
- }
33
-
34
- if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
35
- STDERR.puts help_string
18
+ #{run_tool_cmd} motifs/KLF4_f2.pat 7.32
19
+ #{run_tool_cmd} motifs/KLF4_f2.pat 7.32 4.31 5.42 -d 1000 -b 0.2,0.3,0.3,0.2
20
+ EOS
21
+
22
+ if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
23
+ STDERR.puts doc
36
24
  exit
37
25
  end
38
26
 
39
27
  discretization = 10000
40
28
  background = [1,1,1,1]
41
29
  thresholds = []
42
- max_hash_size = 1000000
30
+ max_hash_size = 10000000
43
31
 
44
32
  data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
45
33
  filename = argv.shift
@@ -53,21 +41,21 @@ module Macroape
53
41
  end
54
42
  end
55
43
 
56
- raise "No input. You'd specify input source: filename or .stdin" unless filename
44
+ raise 'No input. You should specify input file' unless filename
57
45
  raise 'You should specify at least one threshold' if thresholds.empty?
58
46
 
59
47
  until argv.empty?
60
48
  case argv.shift
61
49
  when '-b'
62
- background = argv.shift(4).map(&:to_f)
50
+ background = argv.shift.split(',').map(&:to_f)
63
51
  when '-d'
64
52
  discretization = argv.shift.to_f
65
- when '-m'
53
+ when '--max-hash-size'
66
54
  max_hash_size = argv.shift.to_i
67
55
  end
68
56
  end
69
57
 
70
-
58
+
71
59
  if filename == '.stdin'
72
60
  input = $stdin.read
73
61
  else
@@ -78,14 +66,22 @@ module Macroape
78
66
  pwm.set_parameters(background: background, max_hash_size: max_hash_size).discrete!(discretization)
79
67
 
80
68
  counts = pwm.counts_by_thresholds(* thresholds.map{|count| count * discretization})
81
- pvalues = counts.map{|count| count.to_f / pwm.vocabulary_volume}
82
- pvalues.zip(thresholds,counts).each{|pvalue,threshold,count|
83
- puts "#{threshold}\t#{count}\t#{pvalue}"
84
- }
69
+ infos = []
70
+ thresholds.each do |threshold|
71
+ count = counts[threshold * discretization]
72
+ pvalue = count.to_f / pwm.vocabulary_volume
73
+ infos << {threshold: threshold,
74
+ number_of_recognized_words: count,
75
+ pvalue: pvalue}
76
+ end
77
+
78
+ puts Helper.find_pvalue_info_string( infos,
79
+ {discretization: discretization,
80
+ background: background} )
85
81
  rescue => err
86
- STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
82
+ STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
87
83
  end
88
-
84
+
89
85
  end
90
86
  end
91
87
  end
@@ -3,65 +3,64 @@ require_relative '../../macroape'
3
3
  module Macroape
4
4
  module CLI
5
5
  module FindThreshold
6
-
6
+
7
7
  def self.main(argv)
8
- help_string = %q{
9
- Command-line format::
10
- ruby find_threshold.rb <pat-file> [options]
11
- or in linux
12
- cat <pat-file> | ruby find_threshold.rb .stdin [options]
13
- or on windows
14
- type <pat-file> | ruby find_threshold.rb .stdin [options]
8
+ doc = <<-EOS.strip_doc
9
+ Command-line format:
10
+ #{run_tool_cmd} <pat-file> [<list of P-values>...] [options]
15
11
 
16
12
  Options:
17
- [-p <list of P-values>]
18
13
  [-d <discretization level>]
19
- [-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]
20
-
21
- Output format:
22
- requested_pvalue_1 threshold_1 achieved_pvalue_1
23
- requested_pvalue_2 threshold_2 achieved_pvalue_2
24
-
14
+ [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
15
+ [--boundary lower|upper] Lower boundary (default) means that the obtained P-value is less than or equal to the requested P-value
16
+ [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
25
17
 
26
18
  Example:
27
- ruby find_threshold.rb motifs/KLF4.pat -p 0.001 0.0001 0.0005 -d 1000 -b 0.4 0.3 0.2 0.1
28
- }
29
-
30
- if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
31
- STDERR.puts help_string
19
+ #{run_tool_cmd} motifs/KLF4_f2.pat
20
+ #{run_tool_cmd} motifs/KLF4_f2.pat 0.001 0.0001 0.0005 -d 1000 -b 0.4,0.3,0.2,0.1
21
+ EOS
22
+
23
+ if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
24
+ STDERR.puts doc
32
25
  exit
33
26
  end
34
-
27
+
35
28
  background = [1,1,1,1]
36
29
  default_pvalues = [0.0005]
37
30
  discretization = 10000
38
- max_hash_size = 1000000
39
- data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
31
+ max_hash_size = 10000000
32
+ data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
33
+
34
+ pvalue_boundary = :lower
35
+
40
36
 
41
37
  filename = argv.shift
42
- raise "No input. You'd specify input source: filename or .stdin" unless filename
38
+ raise 'No input. You should specify input file' unless filename
43
39
 
44
40
  pvalues = []
41
+ loop do
42
+ begin
43
+ Float(argv.first)
44
+ pvalues << argv.shift.to_f
45
+ rescue
46
+ raise StopIteration
47
+ end
48
+ end
49
+ pvalues = default_pvalues if pvalues.empty?
50
+
45
51
  until argv.empty?
46
52
  case argv.shift
47
53
  when '-b'
48
- background = argv.shift(4).map(&:to_f)
49
- when '-m'
54
+ background = argv.shift.split(',').map(&:to_f)
55
+ when '--max-hash-size'
50
56
  max_hash_size = argv.shift.to_i
51
- when '-p'
52
- loop do
53
- begin
54
- Float(argv.first)
55
- pvalues << argv.shift.to_f
56
- rescue
57
- raise StopIteration
58
- end
59
- end
60
57
  when '-d'
61
58
  discretization = argv.shift.to_f
59
+ when '--boundary'
60
+ pvalue_boundary = argv.shift.to_sym
61
+ raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
62
62
  end
63
63
  end
64
- pvalues = default_pvalues if pvalues.empty?
65
64
 
66
65
  if filename == '.stdin'
67
66
  input = $stdin.read
@@ -72,14 +71,26 @@ module Macroape
72
71
  pwm = data_model.new(input).to_pwm
73
72
  pwm.set_parameters(background: background, max_hash_size: max_hash_size).discrete!(discretization)
74
73
 
75
- pwm.thresholds(*pvalues) do |pvalue, threshold, real_pvalue|
76
- puts "#{pvalue}\t#{threshold / discretization}\t#{real_pvalue}"
74
+ infos = []
75
+ collect_infos_proc = ->(pvalue, threshold, real_pvalue) do
76
+ infos << {expected_pvalue: pvalue,
77
+ threshold: threshold / discretization,
78
+ real_pvalue: real_pvalue,
79
+ recognized_words: pwm.vocabulary_volume * real_pvalue }
77
80
  end
78
-
81
+ if pvalue_boundary == :lower
82
+ pwm.thresholds(*pvalues, &collect_infos_proc)
83
+ else
84
+ pwm.weak_thresholds(*pvalues, &collect_infos_proc)
85
+ end
86
+ puts Helper.threshold_infos_string(infos,
87
+ {discretization: discretization,
88
+ background: background,
89
+ pvalue_boundary: pvalue_boundary} )
79
90
  rescue => err
80
- STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
91
+ STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
81
92
  end
82
-
93
+
83
94
  end
84
95
  end
85
96
  end
@@ -5,87 +5,76 @@ require 'shellwords'
5
5
  module Macroape
6
6
  module CLI
7
7
  module PreprocessCollection
8
-
8
+
9
9
  def self.main(argv)
10
- help_string = %q{
11
- Command-line format:
12
- ruby preprocess_collection.rb <file or folder with PWMs or .stdin with filenames> [options]
13
-
14
- Options:
15
- [-p <list of P-values>]
16
- [-d <rough discretization> <precise discretization>]
17
- [-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]
18
- [-o <output file>]
19
- [-n <name>] - specify name for a collection. Default filename is based on this parameter
20
- [--silent] - don't show current progress information during scan (by default this information's written into stderr)
21
- [--pcm] - treats your input motifs as PCM-s. Motifs are converted to PWMs internally so output is the same as for according PWMs
22
-
23
- The tool stores preprocessed Macroape collection to the specified YAML-file.
24
-
25
- Example:
26
- ruby preprocess_collection.rb ./motifs -p 0.001 0.0005 0.0001 -d 1 10 -b 0.2 0.3 0.2 0.3 -o collection.yaml
27
- }
28
-
29
- if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
30
- STDERR.puts help_string
10
+ doc = <<-EOS.strip_doc
11
+ Command-line format:
12
+ #{run_tool_cmd} <file or folder with PWMs or .stdin with filenames> <output file> [options]
13
+
14
+ Options:
15
+ [-p <list of P-values>] - comma separated(no spaces allowed) list of P-values to precalculate thresholds
16
+ [-d <rough discretization>,<precise discretization>] - set discretization rates, comma delimited (no spaces allowed), order doesn't matter
17
+ [--silent] - hide current progress information during scan (printed to stderr by default)
18
+ [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
19
+ [--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
20
+ [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
21
+
22
+ The tool preprocesses and stores Macroape motif collection in the specified YAML-file.
23
+
24
+ Example:
25
+ #{run_tool_cmd} ./motifs collection.yaml -p 0.001,0.0005,0.0001 -d 1,10 -b 0.2,0.3,0.3,0.2
26
+ EOS
27
+
28
+ if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
29
+ STDERR.puts doc
31
30
  exit
32
31
  end
33
32
 
34
33
  data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
35
-
34
+
36
35
  default_pvalues = [0.0005]
37
36
  background = [1,1,1,1]
38
37
  rough_discretization = 1
39
38
  precise_discretization = 10
40
- output_file = 'collection.yaml'
41
- max_hash_size = 1000000
42
-
39
+ max_hash_size = 10000000
40
+
43
41
  data_source = argv.shift
44
-
45
- raise "No input. You'd specify file or folder with pwms" unless data_source
42
+ output_file = argv.shift
43
+
44
+ raise 'No input. You should specify file or folder with pwms' unless data_source
46
45
  raise "Error! File or folder #{data_source} doesn't exist" unless Dir.exist?(data_source) || File.exist?(data_source) || data_source == '.stdin'
46
+ raise 'You should specify output file' unless output_file
47
47
 
48
48
  pvalues = []
49
49
  silent = false
50
- output_file_specified = false
50
+ pvalue_boundary = :upper
51
+
51
52
  until argv.empty?
52
53
  case argv.shift
53
54
  when '-b'
54
- background = argv.shift(4).map(&:to_f)
55
+ background = argv.shift.split(',').map(&:to_f)
55
56
  raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless background == background.reverse
56
57
  when '-p'
57
- loop do
58
- begin
59
- Float(argv.first)
60
- pvalues << argv.shift.to_f
61
- rescue
62
- raise StopIteration
63
- end
64
- end
58
+ pvalues = argv.shift.split(',').map(&:to_f)
65
59
  when '-d'
66
- rough_discretization, precise_discretization = argv.shift(2).map(&:to_f).sort
67
- when '-o'
68
- output_file = argv.shift
69
- output_file_specified = true
70
- when '-m'
60
+ rough_discretization, precise_discretization = argv.shift.split(',').map(&:to_f).sort
61
+ when '--max-hash-size'
71
62
  max_hash_size = argv.shift.to_i
72
- when '-n'
73
- collection_name = argv.shift
74
63
  when '--silent'
75
64
  silent = true
65
+ when '--boundary'
66
+ pvalue_boundary = argv.shift.to_sym
67
+ raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
76
68
  end
77
69
  end
78
70
  pvalues = default_pvalues if pvalues.empty?
79
71
 
80
- collection = Bioinform::Collection.new(rough_discretization: rough_discretization,
72
+ collection = Bioinform::Collection.new(rough_discretization: rough_discretization,
81
73
  precise_discretization: precise_discretization,
82
74
  background: background,
83
75
  pvalues: pvalues)
84
- if collection_name
85
- collection.name = collection_name
86
- output_file = "#{collection_name}.yaml" if !output_file_specified
87
- end
88
-
76
+
77
+ data_source = data_source.gsub("\\",'/')
89
78
  if File.directory?(data_source)
90
79
  motifs = Dir.glob(File.join(data_source,'*')).sort.map do |filename|
91
80
  pwm = data_model.new(File.read(filename))
@@ -106,12 +95,12 @@ module Macroape
106
95
  else
107
96
  raise "Specified data source `#{data_source}` is neither directory nor file nor even .stdin"
108
97
  end
109
-
98
+
110
99
  pwms = motifs.map(&:to_pwm)
111
-
100
+
112
101
  pwms.each_with_index do |pwm,index|
113
- STDERR.puts "#{index + 1} -- Name: #{pwm.name}, Length: #{pwm.length}" unless silent
114
-
102
+ STDERR.puts "Motif #{pwm.name}, length: #{pwm.length} (#{index+1} of #{pwms.size}, #{index*100/pwms.size}% complete)" unless silent
103
+
115
104
  # When support of onefile collections is introduced - then here should be check if name exists.
116
105
  # Otherwise it should skip motif and tell you about this
117
106
  # Also two command line options to fail on skipping or to skip silently should be included
@@ -120,7 +109,8 @@ module Macroape
120
109
  pwm.set_parameters(background: background, max_hash_size: max_hash_size)
121
110
  skip_motif = false
122
111
 
123
- pwm.discrete(rough_discretization).thresholds(*pvalues) do |pvalue, threshold, real_pvalue|
112
+
113
+ fill_rough_infos = ->(pvalue, threshold, real_pvalue) do
124
114
  if real_pvalue == 0
125
115
  $stderr.puts "#{pwm.name} at pvalue #{pvalue} has threshold that yields real-pvalue 0 in rough mode. Rough calculation will be skipped"
126
116
  else
@@ -128,7 +118,7 @@ module Macroape
128
118
  end
129
119
  end
130
120
 
131
- pwm.discrete(precise_discretization).thresholds(*pvalues) do |pvalue, threshold, real_pvalue|
121
+ fill_precise_infos = ->(pvalue, threshold, real_pvalue) do
132
122
  if real_pvalue == 0
133
123
  $stderr.puts "#{pwm.name} at pvalue #{pvalue} has threshold that yields real-pvalue 0 in precise mode. Motif will be excluded from collection"
134
124
  skip_motif = true
@@ -136,13 +126,33 @@ module Macroape
136
126
  info.precise[pvalue] = threshold / precise_discretization
137
127
  end
138
128
  end
129
+
130
+ if pvalue_boundary == :lower
131
+ pwm.discrete(rough_discretization).thresholds(*pvalues, &fill_rough_infos)
132
+ else
133
+ pwm.discrete(rough_discretization).weak_thresholds(*pvalues, &fill_rough_infos)
134
+ end
135
+
136
+ if pvalue_boundary == :lower
137
+ pwm.discrete(precise_discretization).thresholds(*pvalues, &fill_precise_infos)
138
+ else
139
+ pwm.discrete(precise_discretization).weak_thresholds(*pvalues,&fill_precise_infos)
140
+ end
139
141
  collection.add_pm(pwm, info) unless skip_motif
140
142
  end
143
+ STDERR.puts "100% complete. Saving results" unless silent
141
144
  File.open(output_file, 'w') do |f|
142
145
  f.puts(collection.to_yaml)
143
146
  end
147
+ puts OutputInformation.new{|infos|
148
+ infos.add_parameter('P', 'P-value list', pvalues.join(','))
149
+ infos.add_parameter('VR', 'discretization value, rough', rough_discretization)
150
+ infos.add_parameter('VP', 'discretization value, precise', precise_discretization)
151
+ infos.add_parameter('PB', 'P-value boundary', pvalue_boundary)
152
+ infos.background_parameter('B', 'background', background)
153
+ }.result
144
154
  rescue => err
145
- STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
155
+ STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
146
156
  end
147
157
 
148
158
  end