macroape 4.0.2 → 4.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (74) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +17 -17
  3. data/Gemfile +4 -4
  4. data/LICENSE +22 -22
  5. data/README.md +70 -70
  6. data/Rakefile.rb +49 -49
  7. data/TODO.txt +46 -46
  8. data/benchmark/benchmark_helper.rb +4 -4
  9. data/benchmark/similarity_benchmark.rb +52 -52
  10. data/bin/align_motifs +4 -4
  11. data/bin/eval_alignment +4 -4
  12. data/bin/eval_similarity +4 -4
  13. data/bin/find_pvalue +4 -4
  14. data/bin/find_threshold +4 -4
  15. data/bin/preprocess_collection +4 -4
  16. data/bin/scan_collection +4 -4
  17. data/lib/macroape.rb +14 -11
  18. data/lib/macroape/aligned_pair_intersection.rb +61 -62
  19. data/lib/macroape/cli.rb +191 -188
  20. data/lib/macroape/cli/align_motifs.rb +120 -100
  21. data/lib/macroape/cli/eval_alignment.rb +157 -156
  22. data/lib/macroape/cli/eval_similarity.rb +138 -137
  23. data/lib/macroape/cli/find_pvalue.rb +93 -87
  24. data/lib/macroape/cli/find_threshold.rb +103 -96
  25. data/lib/macroape/cli/preprocess_collection.rb +169 -161
  26. data/lib/macroape/cli/scan_collection.rb +171 -163
  27. data/lib/macroape/collection.rb +29 -0
  28. data/lib/macroape/motif_with_thresholds.rb +18 -0
  29. data/lib/macroape/pwm_compare.rb +39 -44
  30. data/lib/macroape/pwm_compare_aligned.rb +139 -130
  31. data/lib/macroape/{counting.rb → pwm_counting.rb} +175 -121
  32. data/lib/macroape/support/inverf.rb +13 -0
  33. data/lib/macroape/support/partial_sums.rb +17 -0
  34. data/lib/macroape/version.rb +4 -4
  35. data/macroape.gemspec +19 -19
  36. data/spec/count_distribution_spec.rb +112 -109
  37. data/spec/inverf_spec.rb +23 -0
  38. data/spec/partial_sums_spec.rb +28 -0
  39. data/spec/spec_helper.rb +11 -11
  40. data/test/align_motifs_test.rb +42 -43
  41. data/test/data/AHR_si.pwm +10 -10
  42. data/test/data/KLF3_f1.pcm +16 -16
  43. data/test/data/KLF3_f1.pwm +16 -16
  44. data/test/data/KLF4_f2.pcm +11 -11
  45. data/test/data/KLF4_f2.pwm +11 -11
  46. data/test/data/KLF4_f2_scan_results_all.txt +2 -2
  47. data/test/data/KLF4_f2_scan_results_default_cutoff.txt +1 -1
  48. data/test/data/KLF4_f2_scan_results_precise_mode.txt +2 -2
  49. data/test/data/SP1_f1.pcm +12 -12
  50. data/test/data/SP1_f1.pwm +12 -12
  51. data/test/data/SP1_f1_revcomp.pcm +12 -12
  52. data/test/data/SP1_f1_revcomp.pwm +12 -12
  53. data/test/data/medium_motif.pwm +8 -8
  54. data/test/data/short_motif.pwm +7 -7
  55. data/test/data/test_collection.yaml +231 -214
  56. data/test/data/test_collection/GABPA_f1.pwm +14 -14
  57. data/test/data/test_collection/KLF4_f2.pwm +10 -10
  58. data/test/data/test_collection/SP1_f1.pwm +12 -12
  59. data/test/data/test_collection_pcm/GABPA_f1.pcm +14 -14
  60. data/test/data/test_collection_pcm/KLF4_f2.pcm +11 -11
  61. data/test/data/test_collection_pcm/SP1_f1.pcm +12 -12
  62. data/test/data/test_collection_single_file.txt +38 -38
  63. data/test/data/test_collection_single_file_pcm.txt +37 -37
  64. data/test/data/test_collection_weak.yaml +231 -214
  65. data/test/eval_alignment_test.rb +90 -111
  66. data/test/eval_similarity_test.rb +105 -123
  67. data/test/find_pvalue_test.rb +34 -39
  68. data/test/find_threshold_test.rb +87 -91
  69. data/test/preprocess_collection_test.rb +56 -65
  70. data/test/scan_collection_test.rb +42 -48
  71. data/test/test_helper.rb +159 -160
  72. metadata +14 -10
  73. data/test/data/collection_pcm_without_thresholds.yaml +0 -188
  74. data/test/data/collection_without_thresholds.yaml +0 -188
@@ -1,96 +1,103 @@
1
- require_relative '../../macroape'
2
-
3
- module Macroape
4
- module CLI
5
- module FindThreshold
6
-
7
- def self.main(argv)
8
- doc = <<-EOS.strip_doc
9
- Command-line format:
10
- #{run_tool_cmd} <pat-file> [<list of P-values>...] [options]
11
-
12
- Options:
13
- [-d <discretization level>]
14
- [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
15
- [--boundary lower|upper] Lower boundary (default) means that the obtained P-value is less than or equal to the requested P-value
16
- [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
17
-
18
- Example:
19
- #{run_tool_cmd} motifs/KLF4_f2.pat
20
- #{run_tool_cmd} motifs/KLF4_f2.pat 0.001 0.0001 0.0005 -d 1000 -b 0.4,0.3,0.2,0.1
21
- EOS
22
-
23
- if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
24
- $stderr.puts doc
25
- exit
26
- end
27
-
28
- background = [1,1,1,1]
29
- default_pvalues = [0.0005]
30
- discretization = 10000
31
- max_hash_size = 10000000
32
- data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
33
-
34
- pvalue_boundary = :lower
35
-
36
-
37
- filename = argv.shift
38
- raise 'No input. You should specify input file' unless filename
39
-
40
- pvalues = []
41
- loop do
42
- begin
43
- Float(argv.first)
44
- pvalues << argv.shift.to_f
45
- rescue
46
- raise StopIteration
47
- end
48
- end
49
- pvalues = default_pvalues if pvalues.empty?
50
-
51
- until argv.empty?
52
- case argv.shift
53
- when '-b'
54
- background = argv.shift.split(',').map(&:to_f)
55
- when '--max-hash-size'
56
- max_hash_size = argv.shift.to_i
57
- when '-d'
58
- discretization = argv.shift.to_f
59
- when '--boundary'
60
- pvalue_boundary = argv.shift.to_sym
61
- raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
62
- end
63
- end
64
-
65
- if filename == '.stdin'
66
- input = $stdin.read
67
- else
68
- raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
69
- input = File.read(filename)
70
- end
71
- pwm = data_model.new(input).set_parameters(background: background).to_pwm
72
- pwm.set_parameters(background: background, max_hash_size: max_hash_size).discrete!(discretization)
73
-
74
- infos = []
75
- collect_infos_proc = ->(pvalue, threshold, real_pvalue) do
76
- infos << {expected_pvalue: pvalue,
77
- threshold: threshold / discretization,
78
- real_pvalue: real_pvalue,
79
- recognized_words: pwm.vocabulary_volume * real_pvalue }
80
- end
81
- if pvalue_boundary == :lower
82
- pwm.thresholds(*pvalues, &collect_infos_proc)
83
- else
84
- pwm.weak_thresholds(*pvalues, &collect_infos_proc)
85
- end
86
- puts Helper.threshold_infos_string(infos,
87
- {discretization: discretization,
88
- background: background,
89
- pvalue_boundary: pvalue_boundary} )
90
- rescue => err
91
- $stderr.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
92
- end
93
-
94
- end
95
- end
96
- end
1
+ require_relative '../../macroape'
2
+
3
+ module Macroape
4
+ module CLI
5
+ module FindThreshold
6
+
7
+ def self.main(argv)
8
+ doc = <<-EOS.strip_doc
9
+ Command-line format:
10
+ #{run_tool_cmd} <pat-file> [<list of P-values>...] [options]
11
+
12
+ Options:
13
+ [-d <discretization level>]
14
+ [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
15
+ [--boundary lower|upper] Lower boundary (default) means that the obtained P-value is less than or equal to the requested P-value
16
+ [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
17
+
18
+ Example:
19
+ #{run_tool_cmd} motifs/KLF4_f2.pat
20
+ #{run_tool_cmd} motifs/KLF4_f2.pat 0.001 0.0001 0.0005 -d 1000 -b 0.4,0.3,0.2,0.1
21
+ EOS
22
+
23
+ if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
24
+ $stderr.puts doc
25
+ exit
26
+ end
27
+
28
+ background = Bioinform::Background::Wordwise
29
+ default_pvalues = [0.0005]
30
+ discretization = 10000
31
+ max_hash_size = 10000000
32
+ data_model = argv.delete('--pcm') ? :pcm : :pwm
33
+
34
+ pvalue_boundary = :lower
35
+
36
+
37
+ filename = argv.shift
38
+ raise 'No input. You should specify input file' unless filename
39
+
40
+ pvalues = []
41
+ loop do
42
+ begin
43
+ Float(argv.first)
44
+ pvalues << argv.shift.to_f
45
+ rescue
46
+ raise StopIteration
47
+ end
48
+ end
49
+ pvalues = default_pvalues if pvalues.empty?
50
+
51
+ until argv.empty?
52
+ case argv.shift
53
+ when '-b'
54
+ background = Bioinform::Background.from_string(argv.shift)
55
+ when '--max-hash-size'
56
+ max_hash_size = argv.shift.to_i
57
+ when '-d'
58
+ discretization = argv.shift.to_f
59
+ when '--boundary'
60
+ pvalue_boundary = argv.shift.to_sym
61
+ raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
62
+ end
63
+ end
64
+
65
+ raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
66
+ input = File.read(filename)
67
+
68
+ parser = Bioinform::MatrixParser.new
69
+ motif_data = parser.parse!(input)
70
+ case data_model
71
+ when :pcm
72
+ pcm = Bioinform::MotifModel::PCM.new(motif_data[:matrix]).named(motif_data[:name])
73
+ pwm = Bioinform::ConversionAlgorithms::PCM2PWMConverter.new(pseudocount: :log, background: background).convert(pcm)
74
+ when :pwm
75
+ pwm = Bioinform::MotifModel::PWM.new(motif_data[:matrix]).named(motif_data[:name])
76
+ end
77
+
78
+ pwm = pwm.discreted(discretization)
79
+ counting = PWMCounting.new(pwm, background: background, max_hash_size: max_hash_size)
80
+
81
+ infos = []
82
+ collect_infos_proc = ->(pvalue, threshold, real_pvalue) do
83
+ infos << {expected_pvalue: pvalue,
84
+ threshold: threshold / discretization,
85
+ real_pvalue: real_pvalue,
86
+ recognized_words: real_pvalue * counting.vocabulary_volume }
87
+ end
88
+ if pvalue_boundary == :lower
89
+ counting.thresholds(*pvalues, &collect_infos_proc)
90
+ else
91
+ counting.weak_thresholds(*pvalues, &collect_infos_proc)
92
+ end
93
+ puts Helper.threshold_infos_string(infos,
94
+ {discretization: discretization,
95
+ background: background,
96
+ pvalue_boundary: pvalue_boundary} )
97
+ rescue => err
98
+ $stderr.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
99
+ end
100
+
101
+ end
102
+ end
103
+ end
@@ -1,161 +1,169 @@
1
- require_relative '../../macroape'
2
- require 'yaml'
3
- require 'shellwords'
4
-
5
- module Macroape
6
- module CLI
7
- module PreprocessCollection
8
-
9
- def self.main(argv)
10
- doc = <<-EOS.strip_doc
11
- Command-line format:
12
- #{run_tool_cmd} <file or folder with PWMs or .stdin with filenames> <output file> [options]
13
-
14
- Options:
15
- [-p <list of P-values>] - comma separated(no spaces allowed) list of P-values to precalculate thresholds
16
- [-d <rough discretization>,<precise discretization>] - set discretization rates, comma delimited (no spaces allowed), order doesn't matter
17
- [--silent] - hide current progress information during scan (printed to stderr by default)
18
- [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
19
- [--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
20
- [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
21
-
22
- The tool preprocesses and stores Macroape motif collection in the specified YAML-file.
23
-
24
- Example:
25
- #{run_tool_cmd} ./motifs collection.yaml -p 0.001,0.0005,0.0001 -d 1,10 -b 0.2,0.3,0.3,0.2
26
- EOS
27
-
28
- if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
29
- $stderr.puts doc
30
- exit
31
- end
32
-
33
- data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
34
-
35
- default_pvalues = [0.0005]
36
- background = [1,1,1,1]
37
- rough_discretization = 1
38
- precise_discretization = 10
39
- max_hash_size = 10000000
40
-
41
- data_source = argv.shift
42
- output_file = argv.shift
43
-
44
- raise 'No input. You should specify file or folder with pwms' unless data_source
45
- raise "Error! File or folder #{data_source} doesn't exist" unless Dir.exist?(data_source) || File.exist?(data_source) || data_source == '.stdin'
46
- raise 'You should specify output file' unless output_file
47
-
48
- pvalues = []
49
- silent = false
50
- pvalue_boundary = :upper
51
-
52
- until argv.empty?
53
- case argv.shift
54
- when '-b'
55
- background = argv.shift.split(',').map(&:to_f)
56
- raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless background == background.reverse
57
- when '-p'
58
- pvalues = argv.shift.split(',').map(&:to_f)
59
- when '-d'
60
- rough_discretization, precise_discretization = argv.shift.split(',').map(&:to_f).sort
61
- when '--max-hash-size'
62
- max_hash_size = argv.shift.to_i
63
- when '--silent'
64
- silent = true
65
- when '--boundary'
66
- pvalue_boundary = argv.shift.to_sym
67
- raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
68
- end
69
- end
70
- pvalues = default_pvalues if pvalues.empty?
71
-
72
- collection = Bioinform::Collection.new(rough_discretization: rough_discretization,
73
- precise_discretization: precise_discretization,
74
- background: background,
75
- pvalues: pvalues)
76
-
77
- data_source = data_source.gsub("\\",'/')
78
- if File.directory?(data_source)
79
- motifs = Dir.glob(File.join(data_source,'*')).sort.map do |filename|
80
- pwm = data_model.new(File.read(filename))
81
- pwm.name ||= File.basename(filename, File.extname(filename))
82
- pwm
83
- end
84
- elsif File.file?(data_source)
85
- input = File.read(data_source)
86
- motifs = data_model.split_on_motifs(input)
87
- elsif data_source == '.stdin'
88
- filelist = $stdin.read.shellsplit
89
- motifs = []
90
- filelist.each do |filename|
91
- motif = data_model.new(File.read(filename))
92
- motif.name ||= File.basename(filename, File.extname(filename))
93
- motif.set_parameters(background: background)
94
- motifs << motif
95
- end
96
- else
97
- raise "Specified data source `#{data_source}` is neither directory nor file nor even .stdin"
98
- end
99
-
100
- pwms = motifs.map(&:to_pwm)
101
-
102
- pwms.each_with_index do |pwm,index|
103
- $stderr.puts "Motif #{pwm.name}, length: #{pwm.length} (#{index+1} of #{pwms.size}, #{index*100/pwms.size}% complete)" unless silent
104
-
105
- # When support of onefile collections is introduced - then here should be check if name exists.
106
- # Otherwise it should skip motif and tell you about this
107
- # Also two command line options to fail on skipping or to skip silently should be included
108
-
109
- info = OpenStruct.new(rough: {}, precise: {})
110
- pwm.set_parameters(background: background, max_hash_size: max_hash_size)
111
- skip_motif = false
112
-
113
-
114
- fill_rough_infos = ->(pvalue, threshold, real_pvalue) do
115
- if real_pvalue == 0
116
- $stderr.puts "#{pwm.name} at pvalue #{pvalue} has threshold that yields real-pvalue 0 in rough mode. Rough calculation will be skipped"
117
- else
118
- info.rough[pvalue] = threshold / rough_discretization
119
- end
120
- end
121
-
122
- fill_precise_infos = ->(pvalue, threshold, real_pvalue) do
123
- if real_pvalue == 0
124
- $stderr.puts "#{pwm.name} at pvalue #{pvalue} has threshold that yields real-pvalue 0 in precise mode. Motif will be excluded from collection"
125
- skip_motif = true
126
- else
127
- info.precise[pvalue] = threshold / precise_discretization
128
- end
129
- end
130
-
131
- if pvalue_boundary == :lower
132
- pwm.discrete(rough_discretization).thresholds(*pvalues, &fill_rough_infos)
133
- else
134
- pwm.discrete(rough_discretization).weak_thresholds(*pvalues, &fill_rough_infos)
135
- end
136
-
137
- if pvalue_boundary == :lower
138
- pwm.discrete(precise_discretization).thresholds(*pvalues, &fill_precise_infos)
139
- else
140
- pwm.discrete(precise_discretization).weak_thresholds(*pvalues,&fill_precise_infos)
141
- end
142
- collection.add_pm(pwm, info) unless skip_motif
143
- end
144
- $stderr.puts "100% complete. Saving results" unless silent
145
- File.open(output_file, 'w') do |f|
146
- f.puts(collection.to_yaml)
147
- end
148
- puts OutputInformation.new{|infos|
149
- infos.add_parameter('P', 'P-value list', pvalues.join(','))
150
- infos.add_parameter('VR', 'discretization value, rough', rough_discretization)
151
- infos.add_parameter('VP', 'discretization value, precise', precise_discretization)
152
- infos.add_parameter('PB', 'P-value boundary', pvalue_boundary)
153
- infos.background_parameter('B', 'background', background)
154
- }.result
155
- rescue => err
156
- $stderr.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
157
- end
158
-
159
- end
160
- end
161
- end
1
+ require_relative '../../macroape'
2
+ require 'yaml'
3
+ require 'shellwords'
4
+
5
+ module Macroape
6
+ module CLI
7
+ module PreprocessCollection
8
+
9
+ def self.motif_infos_from_file(filename)
10
+ input = File.read(filename)
11
+ motif_input = Bioinform::MatrixParser.new.parse(input)
12
+ { matrix: motif_input[:matrix],
13
+ name: motif_input[:name] || File.basename(filename, File.extname(filename)) }
14
+ end
15
+
16
+ def self.main(argv)
17
+ doc = <<-EOS.strip_doc
18
+ Command-line format:
19
+ #{run_tool_cmd} <file or folder with PWMs or .stdin with filenames> <output file> [options]
20
+
21
+ Options:
22
+ [-p <list of P-values>] - comma separated(no spaces allowed) list of P-values to precalculate thresholds
23
+ [-d <rough discretization>,<precise discretization>] - set discretization rates, comma delimited (no spaces allowed), order doesn't matter
24
+ [--silent] - hide current progress information during scan (printed to stderr by default)
25
+ [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
26
+ [--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
27
+ [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
28
+
29
+ The tool preprocesses and stores Macroape motif collection in the specified YAML-file.
30
+
31
+ Example:
32
+ #{run_tool_cmd} ./motifs collection.yaml -p 0.001,0.0005,0.0001 -d 1,10 -b 0.2,0.3,0.3,0.2
33
+ EOS
34
+
35
+ if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
36
+ $stderr.puts doc
37
+ exit
38
+ end
39
+
40
+ data_model = argv.delete('--pcm') ? :pcm : :pwm
41
+ default_pvalues = [0.0005]
42
+ background = Bioinform::Background::Wordwise
43
+ rough_discretization = 1
44
+ precise_discretization = 10
45
+ max_hash_size = 10000000
46
+
47
+ data_source = argv.shift
48
+ output_file = argv.shift
49
+
50
+ raise 'No input. You should specify file or folder with pwms' unless data_source
51
+ raise "Error! File or folder #{data_source} doesn't exist" unless Dir.exist?(data_source) || File.exist?(data_source) || data_source == '.stdin'
52
+ raise 'You should specify output file' unless output_file
53
+
54
+ pvalues = []
55
+ silent = false
56
+ pvalue_boundary = :upper
57
+
58
+ until argv.empty?
59
+ case argv.shift
60
+ when '-b'
61
+ background = Bioinform::Background.from_string(argv.shift)
62
+ raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless background.symmetric?
63
+ when '-p'
64
+ pvalues = argv.shift.split(',').map(&:to_f)
65
+ when '-d'
66
+ rough_discretization, precise_discretization = argv.shift.split(',').map(&:to_f).sort
67
+ when '--max-hash-size'
68
+ max_hash_size = argv.shift.to_i
69
+ when '--silent'
70
+ silent = true
71
+ when '--boundary'
72
+ pvalue_boundary = argv.shift.to_sym
73
+ raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
74
+ end
75
+ end
76
+ pvalues = default_pvalues if pvalues.empty?
77
+
78
+ data_source = data_source.gsub("\\",'/')
79
+
80
+ pcm2pwm_converter = Bioinform::ConversionAlgorithms::PCM2PWMConverter.new(pseudocount: :log, background: background)
81
+
82
+ if File.directory?(data_source)
83
+ motif_inputs = Dir.glob(File.join(data_source,'*')).sort.map{|filename| motif_infos_from_file(filename) }
84
+ elsif File.file?(data_source)
85
+ input = File.read(data_source)
86
+ motif_inputs = Bioinform::MotifSplitter.new.split(input).map{|motif_input| Bioinform::MatrixParser.new.parse(motif_input) }
87
+ elsif data_source == '.stdin'
88
+ filelist = $stdin.read.shellsplit
89
+ motif_inputs = filelist.map{|filename| motif_infos_from_file(filename) }
90
+ else
91
+ raise "Specified data source `#{data_source}` is neither directory nor file nor even .stdin"
92
+ end
93
+
94
+ pwms = motif_inputs.map{|motif_input|
95
+ if data_model == :pwm
96
+ pwm = Bioinform::MotifModel::PWM.new(motif_input[:matrix]).named(motif_input[:name])
97
+ elsif data_model == :pcm
98
+ pcm = Bioinform::MotifModel::PCM.new(motif_input[:matrix]).named(motif_input[:name])
99
+ pwm = pcm2pwm_converter.convert(pcm)
100
+ end
101
+ }
102
+
103
+ collection = Macroape::Collection.new(rough_discretization: rough_discretization,
104
+ precise_discretization: precise_discretization,
105
+ background: background,
106
+ pvalues: pvalues)
107
+
108
+ pwms.each_with_index do |pwm,index|
109
+ $stderr.puts "Motif #{pwm.name}, length: #{pwm.length} (#{index+1} of #{pwms.size}, #{index*100/pwms.size}% complete)" unless silent
110
+
111
+ # When support of onefile collections is introduced - then here should be check if name exists.
112
+ # Otherwise it should skip motif and tell you about this
113
+ # Also two command line options to fail on skipping or to skip silently should be included
114
+
115
+ info = {rough: {}, precise: {}, background: background}
116
+ skip_motif = false
117
+
118
+ fill_rough_infos = ->(pvalue, threshold, real_pvalue) do
119
+ if real_pvalue == 0
120
+ $stderr.puts "#{pwm.name} at pvalue #{pvalue} has threshold that yields real-pvalue 0 in rough mode. Rough calculation will be skipped"
121
+ else
122
+ info[:rough][pvalue] = threshold / rough_discretization
123
+ end
124
+ end
125
+
126
+ fill_precise_infos = ->(pvalue, threshold, real_pvalue) do
127
+ if real_pvalue == 0
128
+ $stderr.puts "#{pwm.name} at pvalue #{pvalue} has threshold that yields real-pvalue 0 in precise mode. Motif will be excluded from collection"
129
+ skip_motif = true
130
+ else
131
+ info[:precise][pvalue] = threshold / precise_discretization
132
+ end
133
+ end
134
+
135
+ rough_counting = PWMCounting.new(pwm.discreted(rough_discretization), background: background, max_hash_size: max_hash_size)
136
+ precise_counting = PWMCounting.new(pwm.discreted(precise_discretization), background: background, max_hash_size: max_hash_size)
137
+
138
+ if pvalue_boundary == :lower
139
+ rough_counting.thresholds(*pvalues, &fill_rough_infos)
140
+ else
141
+ rough_counting.weak_thresholds(*pvalues, &fill_rough_infos)
142
+ end
143
+
144
+ if pvalue_boundary == :lower
145
+ precise_counting.thresholds(*pvalues, &fill_precise_infos)
146
+ else
147
+ precise_counting.weak_thresholds(*pvalues,&fill_precise_infos)
148
+ end
149
+
150
+ collection << Macroape::MotifWithThresholds.new(pwm, info) unless skip_motif
151
+ end
152
+ $stderr.puts "100% complete. Saving results" unless silent
153
+ File.open(output_file, 'w') do |f|
154
+ f.puts(collection.to_yaml)
155
+ end
156
+ puts OutputInformation.new{|infos|
157
+ infos.add_parameter('P', 'P-value list', pvalues.join(','))
158
+ infos.add_parameter('VR', 'discretization value, rough', rough_discretization)
159
+ infos.add_parameter('VP', 'discretization value, precise', precise_discretization)
160
+ infos.add_parameter('PB', 'P-value boundary', pvalue_boundary)
161
+ infos.background_parameter('B', 'background', background)
162
+ }.result
163
+ rescue => err
164
+ $stderr.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
165
+ end
166
+
167
+ end
168
+ end
169
+ end