macroape 4.0.2 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. checksums.yaml +4 -4
  2. data/.gitignore +17 -17
  3. data/Gemfile +4 -4
  4. data/LICENSE +22 -22
  5. data/README.md +70 -70
  6. data/Rakefile.rb +49 -49
  7. data/TODO.txt +46 -46
  8. data/benchmark/benchmark_helper.rb +4 -4
  9. data/benchmark/similarity_benchmark.rb +52 -52
  10. data/bin/align_motifs +4 -4
  11. data/bin/eval_alignment +4 -4
  12. data/bin/eval_similarity +4 -4
  13. data/bin/find_pvalue +4 -4
  14. data/bin/find_threshold +4 -4
  15. data/bin/preprocess_collection +4 -4
  16. data/bin/scan_collection +4 -4
  17. data/lib/macroape.rb +14 -11
  18. data/lib/macroape/aligned_pair_intersection.rb +61 -62
  19. data/lib/macroape/cli.rb +191 -188
  20. data/lib/macroape/cli/align_motifs.rb +120 -100
  21. data/lib/macroape/cli/eval_alignment.rb +157 -156
  22. data/lib/macroape/cli/eval_similarity.rb +138 -137
  23. data/lib/macroape/cli/find_pvalue.rb +93 -87
  24. data/lib/macroape/cli/find_threshold.rb +103 -96
  25. data/lib/macroape/cli/preprocess_collection.rb +169 -161
  26. data/lib/macroape/cli/scan_collection.rb +171 -163
  27. data/lib/macroape/collection.rb +29 -0
  28. data/lib/macroape/motif_with_thresholds.rb +18 -0
  29. data/lib/macroape/pwm_compare.rb +39 -44
  30. data/lib/macroape/pwm_compare_aligned.rb +139 -130
  31. data/lib/macroape/{counting.rb → pwm_counting.rb} +175 -121
  32. data/lib/macroape/support/inverf.rb +13 -0
  33. data/lib/macroape/support/partial_sums.rb +17 -0
  34. data/lib/macroape/version.rb +4 -4
  35. data/macroape.gemspec +19 -19
  36. data/spec/count_distribution_spec.rb +112 -109
  37. data/spec/inverf_spec.rb +23 -0
  38. data/spec/partial_sums_spec.rb +28 -0
  39. data/spec/spec_helper.rb +11 -11
  40. data/test/align_motifs_test.rb +42 -43
  41. data/test/data/AHR_si.pwm +10 -10
  42. data/test/data/KLF3_f1.pcm +16 -16
  43. data/test/data/KLF3_f1.pwm +16 -16
  44. data/test/data/KLF4_f2.pcm +11 -11
  45. data/test/data/KLF4_f2.pwm +11 -11
  46. data/test/data/KLF4_f2_scan_results_all.txt +2 -2
  47. data/test/data/KLF4_f2_scan_results_default_cutoff.txt +1 -1
  48. data/test/data/KLF4_f2_scan_results_precise_mode.txt +2 -2
  49. data/test/data/SP1_f1.pcm +12 -12
  50. data/test/data/SP1_f1.pwm +12 -12
  51. data/test/data/SP1_f1_revcomp.pcm +12 -12
  52. data/test/data/SP1_f1_revcomp.pwm +12 -12
  53. data/test/data/medium_motif.pwm +8 -8
  54. data/test/data/short_motif.pwm +7 -7
  55. data/test/data/test_collection.yaml +231 -214
  56. data/test/data/test_collection/GABPA_f1.pwm +14 -14
  57. data/test/data/test_collection/KLF4_f2.pwm +10 -10
  58. data/test/data/test_collection/SP1_f1.pwm +12 -12
  59. data/test/data/test_collection_pcm/GABPA_f1.pcm +14 -14
  60. data/test/data/test_collection_pcm/KLF4_f2.pcm +11 -11
  61. data/test/data/test_collection_pcm/SP1_f1.pcm +12 -12
  62. data/test/data/test_collection_single_file.txt +38 -38
  63. data/test/data/test_collection_single_file_pcm.txt +37 -37
  64. data/test/data/test_collection_weak.yaml +231 -214
  65. data/test/eval_alignment_test.rb +90 -111
  66. data/test/eval_similarity_test.rb +105 -123
  67. data/test/find_pvalue_test.rb +34 -39
  68. data/test/find_threshold_test.rb +87 -91
  69. data/test/preprocess_collection_test.rb +56 -65
  70. data/test/scan_collection_test.rb +42 -48
  71. data/test/test_helper.rb +159 -160
  72. metadata +14 -10
  73. data/test/data/collection_pcm_without_thresholds.yaml +0 -188
  74. data/test/data/collection_without_thresholds.yaml +0 -188
@@ -1,96 +1,103 @@
1
- require_relative '../../macroape'
2
-
3
- module Macroape
4
- module CLI
5
- module FindThreshold
6
-
7
- def self.main(argv)
8
- doc = <<-EOS.strip_doc
9
- Command-line format:
10
- #{run_tool_cmd} <pat-file> [<list of P-values>...] [options]
11
-
12
- Options:
13
- [-d <discretization level>]
14
- [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
15
- [--boundary lower|upper] Lower boundary (default) means that the obtained P-value is less than or equal to the requested P-value
16
- [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
17
-
18
- Example:
19
- #{run_tool_cmd} motifs/KLF4_f2.pat
20
- #{run_tool_cmd} motifs/KLF4_f2.pat 0.001 0.0001 0.0005 -d 1000 -b 0.4,0.3,0.2,0.1
21
- EOS
22
-
23
- if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
24
- $stderr.puts doc
25
- exit
26
- end
27
-
28
- background = [1,1,1,1]
29
- default_pvalues = [0.0005]
30
- discretization = 10000
31
- max_hash_size = 10000000
32
- data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
33
-
34
- pvalue_boundary = :lower
35
-
36
-
37
- filename = argv.shift
38
- raise 'No input. You should specify input file' unless filename
39
-
40
- pvalues = []
41
- loop do
42
- begin
43
- Float(argv.first)
44
- pvalues << argv.shift.to_f
45
- rescue
46
- raise StopIteration
47
- end
48
- end
49
- pvalues = default_pvalues if pvalues.empty?
50
-
51
- until argv.empty?
52
- case argv.shift
53
- when '-b'
54
- background = argv.shift.split(',').map(&:to_f)
55
- when '--max-hash-size'
56
- max_hash_size = argv.shift.to_i
57
- when '-d'
58
- discretization = argv.shift.to_f
59
- when '--boundary'
60
- pvalue_boundary = argv.shift.to_sym
61
- raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
62
- end
63
- end
64
-
65
- if filename == '.stdin'
66
- input = $stdin.read
67
- else
68
- raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
69
- input = File.read(filename)
70
- end
71
- pwm = data_model.new(input).set_parameters(background: background).to_pwm
72
- pwm.set_parameters(background: background, max_hash_size: max_hash_size).discrete!(discretization)
73
-
74
- infos = []
75
- collect_infos_proc = ->(pvalue, threshold, real_pvalue) do
76
- infos << {expected_pvalue: pvalue,
77
- threshold: threshold / discretization,
78
- real_pvalue: real_pvalue,
79
- recognized_words: pwm.vocabulary_volume * real_pvalue }
80
- end
81
- if pvalue_boundary == :lower
82
- pwm.thresholds(*pvalues, &collect_infos_proc)
83
- else
84
- pwm.weak_thresholds(*pvalues, &collect_infos_proc)
85
- end
86
- puts Helper.threshold_infos_string(infos,
87
- {discretization: discretization,
88
- background: background,
89
- pvalue_boundary: pvalue_boundary} )
90
- rescue => err
91
- $stderr.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
92
- end
93
-
94
- end
95
- end
96
- end
1
+ require_relative '../../macroape'
2
+
3
+ module Macroape
4
+ module CLI
5
+ module FindThreshold
6
+
7
+ def self.main(argv)
8
+ doc = <<-EOS.strip_doc
9
+ Command-line format:
10
+ #{run_tool_cmd} <pat-file> [<list of P-values>...] [options]
11
+
12
+ Options:
13
+ [-d <discretization level>]
14
+ [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
15
+ [--boundary lower|upper] Lower boundary (default) means that the obtained P-value is less than or equal to the requested P-value
16
+ [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
17
+
18
+ Example:
19
+ #{run_tool_cmd} motifs/KLF4_f2.pat
20
+ #{run_tool_cmd} motifs/KLF4_f2.pat 0.001 0.0001 0.0005 -d 1000 -b 0.4,0.3,0.2,0.1
21
+ EOS
22
+
23
+ if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
24
+ $stderr.puts doc
25
+ exit
26
+ end
27
+
28
+ background = Bioinform::Background::Wordwise
29
+ default_pvalues = [0.0005]
30
+ discretization = 10000
31
+ max_hash_size = 10000000
32
+ data_model = argv.delete('--pcm') ? :pcm : :pwm
33
+
34
+ pvalue_boundary = :lower
35
+
36
+
37
+ filename = argv.shift
38
+ raise 'No input. You should specify input file' unless filename
39
+
40
+ pvalues = []
41
+ loop do
42
+ begin
43
+ Float(argv.first)
44
+ pvalues << argv.shift.to_f
45
+ rescue
46
+ raise StopIteration
47
+ end
48
+ end
49
+ pvalues = default_pvalues if pvalues.empty?
50
+
51
+ until argv.empty?
52
+ case argv.shift
53
+ when '-b'
54
+ background = Bioinform::Background.from_string(argv.shift)
55
+ when '--max-hash-size'
56
+ max_hash_size = argv.shift.to_i
57
+ when '-d'
58
+ discretization = argv.shift.to_f
59
+ when '--boundary'
60
+ pvalue_boundary = argv.shift.to_sym
61
+ raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
62
+ end
63
+ end
64
+
65
+ raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
66
+ input = File.read(filename)
67
+
68
+ parser = Bioinform::MatrixParser.new
69
+ motif_data = parser.parse!(input)
70
+ case data_model
71
+ when :pcm
72
+ pcm = Bioinform::MotifModel::PCM.new(motif_data[:matrix]).named(motif_data[:name])
73
+ pwm = Bioinform::ConversionAlgorithms::PCM2PWMConverter.new(pseudocount: :log, background: background).convert(pcm)
74
+ when :pwm
75
+ pwm = Bioinform::MotifModel::PWM.new(motif_data[:matrix]).named(motif_data[:name])
76
+ end
77
+
78
+ pwm = pwm.discreted(discretization)
79
+ counting = PWMCounting.new(pwm, background: background, max_hash_size: max_hash_size)
80
+
81
+ infos = []
82
+ collect_infos_proc = ->(pvalue, threshold, real_pvalue) do
83
+ infos << {expected_pvalue: pvalue,
84
+ threshold: threshold / discretization,
85
+ real_pvalue: real_pvalue,
86
+ recognized_words: real_pvalue * counting.vocabulary_volume }
87
+ end
88
+ if pvalue_boundary == :lower
89
+ counting.thresholds(*pvalues, &collect_infos_proc)
90
+ else
91
+ counting.weak_thresholds(*pvalues, &collect_infos_proc)
92
+ end
93
+ puts Helper.threshold_infos_string(infos,
94
+ {discretization: discretization,
95
+ background: background,
96
+ pvalue_boundary: pvalue_boundary} )
97
+ rescue => err
98
+ $stderr.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
99
+ end
100
+
101
+ end
102
+ end
103
+ end
@@ -1,161 +1,169 @@
1
- require_relative '../../macroape'
2
- require 'yaml'
3
- require 'shellwords'
4
-
5
- module Macroape
6
- module CLI
7
- module PreprocessCollection
8
-
9
- def self.main(argv)
10
- doc = <<-EOS.strip_doc
11
- Command-line format:
12
- #{run_tool_cmd} <file or folder with PWMs or .stdin with filenames> <output file> [options]
13
-
14
- Options:
15
- [-p <list of P-values>] - comma separated(no spaces allowed) list of P-values to precalculate thresholds
16
- [-d <rough discretization>,<precise discretization>] - set discretization rates, comma delimited (no spaces allowed), order doesn't matter
17
- [--silent] - hide current progress information during scan (printed to stderr by default)
18
- [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
19
- [--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
20
- [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
21
-
22
- The tool preprocesses and stores Macroape motif collection in the specified YAML-file.
23
-
24
- Example:
25
- #{run_tool_cmd} ./motifs collection.yaml -p 0.001,0.0005,0.0001 -d 1,10 -b 0.2,0.3,0.3,0.2
26
- EOS
27
-
28
- if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
29
- $stderr.puts doc
30
- exit
31
- end
32
-
33
- data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
34
-
35
- default_pvalues = [0.0005]
36
- background = [1,1,1,1]
37
- rough_discretization = 1
38
- precise_discretization = 10
39
- max_hash_size = 10000000
40
-
41
- data_source = argv.shift
42
- output_file = argv.shift
43
-
44
- raise 'No input. You should specify file or folder with pwms' unless data_source
45
- raise "Error! File or folder #{data_source} doesn't exist" unless Dir.exist?(data_source) || File.exist?(data_source) || data_source == '.stdin'
46
- raise 'You should specify output file' unless output_file
47
-
48
- pvalues = []
49
- silent = false
50
- pvalue_boundary = :upper
51
-
52
- until argv.empty?
53
- case argv.shift
54
- when '-b'
55
- background = argv.shift.split(',').map(&:to_f)
56
- raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless background == background.reverse
57
- when '-p'
58
- pvalues = argv.shift.split(',').map(&:to_f)
59
- when '-d'
60
- rough_discretization, precise_discretization = argv.shift.split(',').map(&:to_f).sort
61
- when '--max-hash-size'
62
- max_hash_size = argv.shift.to_i
63
- when '--silent'
64
- silent = true
65
- when '--boundary'
66
- pvalue_boundary = argv.shift.to_sym
67
- raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
68
- end
69
- end
70
- pvalues = default_pvalues if pvalues.empty?
71
-
72
- collection = Bioinform::Collection.new(rough_discretization: rough_discretization,
73
- precise_discretization: precise_discretization,
74
- background: background,
75
- pvalues: pvalues)
76
-
77
- data_source = data_source.gsub("\\",'/')
78
- if File.directory?(data_source)
79
- motifs = Dir.glob(File.join(data_source,'*')).sort.map do |filename|
80
- pwm = data_model.new(File.read(filename))
81
- pwm.name ||= File.basename(filename, File.extname(filename))
82
- pwm
83
- end
84
- elsif File.file?(data_source)
85
- input = File.read(data_source)
86
- motifs = data_model.split_on_motifs(input)
87
- elsif data_source == '.stdin'
88
- filelist = $stdin.read.shellsplit
89
- motifs = []
90
- filelist.each do |filename|
91
- motif = data_model.new(File.read(filename))
92
- motif.name ||= File.basename(filename, File.extname(filename))
93
- motif.set_parameters(background: background)
94
- motifs << motif
95
- end
96
- else
97
- raise "Specified data source `#{data_source}` is neither directory nor file nor even .stdin"
98
- end
99
-
100
- pwms = motifs.map(&:to_pwm)
101
-
102
- pwms.each_with_index do |pwm,index|
103
- $stderr.puts "Motif #{pwm.name}, length: #{pwm.length} (#{index+1} of #{pwms.size}, #{index*100/pwms.size}% complete)" unless silent
104
-
105
- # When support of onefile collections is introduced - then here should be check if name exists.
106
- # Otherwise it should skip motif and tell you about this
107
- # Also two command line options to fail on skipping or to skip silently should be included
108
-
109
- info = OpenStruct.new(rough: {}, precise: {})
110
- pwm.set_parameters(background: background, max_hash_size: max_hash_size)
111
- skip_motif = false
112
-
113
-
114
- fill_rough_infos = ->(pvalue, threshold, real_pvalue) do
115
- if real_pvalue == 0
116
- $stderr.puts "#{pwm.name} at pvalue #{pvalue} has threshold that yields real-pvalue 0 in rough mode. Rough calculation will be skipped"
117
- else
118
- info.rough[pvalue] = threshold / rough_discretization
119
- end
120
- end
121
-
122
- fill_precise_infos = ->(pvalue, threshold, real_pvalue) do
123
- if real_pvalue == 0
124
- $stderr.puts "#{pwm.name} at pvalue #{pvalue} has threshold that yields real-pvalue 0 in precise mode. Motif will be excluded from collection"
125
- skip_motif = true
126
- else
127
- info.precise[pvalue] = threshold / precise_discretization
128
- end
129
- end
130
-
131
- if pvalue_boundary == :lower
132
- pwm.discrete(rough_discretization).thresholds(*pvalues, &fill_rough_infos)
133
- else
134
- pwm.discrete(rough_discretization).weak_thresholds(*pvalues, &fill_rough_infos)
135
- end
136
-
137
- if pvalue_boundary == :lower
138
- pwm.discrete(precise_discretization).thresholds(*pvalues, &fill_precise_infos)
139
- else
140
- pwm.discrete(precise_discretization).weak_thresholds(*pvalues,&fill_precise_infos)
141
- end
142
- collection.add_pm(pwm, info) unless skip_motif
143
- end
144
- $stderr.puts "100% complete. Saving results" unless silent
145
- File.open(output_file, 'w') do |f|
146
- f.puts(collection.to_yaml)
147
- end
148
- puts OutputInformation.new{|infos|
149
- infos.add_parameter('P', 'P-value list', pvalues.join(','))
150
- infos.add_parameter('VR', 'discretization value, rough', rough_discretization)
151
- infos.add_parameter('VP', 'discretization value, precise', precise_discretization)
152
- infos.add_parameter('PB', 'P-value boundary', pvalue_boundary)
153
- infos.background_parameter('B', 'background', background)
154
- }.result
155
- rescue => err
156
- $stderr.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
157
- end
158
-
159
- end
160
- end
161
- end
1
+ require_relative '../../macroape'
2
+ require 'yaml'
3
+ require 'shellwords'
4
+
5
+ module Macroape
6
+ module CLI
7
+ module PreprocessCollection
8
+
9
+ def self.motif_infos_from_file(filename)
10
+ input = File.read(filename)
11
+ motif_input = Bioinform::MatrixParser.new.parse(input)
12
+ { matrix: motif_input[:matrix],
13
+ name: motif_input[:name] || File.basename(filename, File.extname(filename)) }
14
+ end
15
+
16
+ def self.main(argv)
17
+ doc = <<-EOS.strip_doc
18
+ Command-line format:
19
+ #{run_tool_cmd} <file or folder with PWMs or .stdin with filenames> <output file> [options]
20
+
21
+ Options:
22
+ [-p <list of P-values>] - comma separated(no spaces allowed) list of P-values to precalculate thresholds
23
+ [-d <rough discretization>,<precise discretization>] - set discretization rates, comma delimited (no spaces allowed), order doesn't matter
24
+ [--silent] - hide current progress information during scan (printed to stderr by default)
25
+ [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
26
+ [--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
27
+ [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
28
+
29
+ The tool preprocesses and stores Macroape motif collection in the specified YAML-file.
30
+
31
+ Example:
32
+ #{run_tool_cmd} ./motifs collection.yaml -p 0.001,0.0005,0.0001 -d 1,10 -b 0.2,0.3,0.3,0.2
33
+ EOS
34
+
35
+ if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
36
+ $stderr.puts doc
37
+ exit
38
+ end
39
+
40
+ data_model = argv.delete('--pcm') ? :pcm : :pwm
41
+ default_pvalues = [0.0005]
42
+ background = Bioinform::Background::Wordwise
43
+ rough_discretization = 1
44
+ precise_discretization = 10
45
+ max_hash_size = 10000000
46
+
47
+ data_source = argv.shift
48
+ output_file = argv.shift
49
+
50
+ raise 'No input. You should specify file or folder with pwms' unless data_source
51
+ raise "Error! File or folder #{data_source} doesn't exist" unless Dir.exist?(data_source) || File.exist?(data_source) || data_source == '.stdin'
52
+ raise 'You should specify output file' unless output_file
53
+
54
+ pvalues = []
55
+ silent = false
56
+ pvalue_boundary = :upper
57
+
58
+ until argv.empty?
59
+ case argv.shift
60
+ when '-b'
61
+ background = Bioinform::Background.from_string(argv.shift)
62
+ raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless background.symmetric?
63
+ when '-p'
64
+ pvalues = argv.shift.split(',').map(&:to_f)
65
+ when '-d'
66
+ rough_discretization, precise_discretization = argv.shift.split(',').map(&:to_f).sort
67
+ when '--max-hash-size'
68
+ max_hash_size = argv.shift.to_i
69
+ when '--silent'
70
+ silent = true
71
+ when '--boundary'
72
+ pvalue_boundary = argv.shift.to_sym
73
+ raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
74
+ end
75
+ end
76
+ pvalues = default_pvalues if pvalues.empty?
77
+
78
+ data_source = data_source.gsub("\\",'/')
79
+
80
+ pcm2pwm_converter = Bioinform::ConversionAlgorithms::PCM2PWMConverter.new(pseudocount: :log, background: background)
81
+
82
+ if File.directory?(data_source)
83
+ motif_inputs = Dir.glob(File.join(data_source,'*')).sort.map{|filename| motif_infos_from_file(filename) }
84
+ elsif File.file?(data_source)
85
+ input = File.read(data_source)
86
+ motif_inputs = Bioinform::MotifSplitter.new.split(input).map{|motif_input| Bioinform::MatrixParser.new.parse(motif_input) }
87
+ elsif data_source == '.stdin'
88
+ filelist = $stdin.read.shellsplit
89
+ motif_inputs = filelist.map{|filename| motif_infos_from_file(filename) }
90
+ else
91
+ raise "Specified data source `#{data_source}` is neither directory nor file nor even .stdin"
92
+ end
93
+
94
+ pwms = motif_inputs.map{|motif_input|
95
+ if data_model == :pwm
96
+ pwm = Bioinform::MotifModel::PWM.new(motif_input[:matrix]).named(motif_input[:name])
97
+ elsif data_model == :pcm
98
+ pcm = Bioinform::MotifModel::PCM.new(motif_input[:matrix]).named(motif_input[:name])
99
+ pwm = pcm2pwm_converter.convert(pcm)
100
+ end
101
+ }
102
+
103
+ collection = Macroape::Collection.new(rough_discretization: rough_discretization,
104
+ precise_discretization: precise_discretization,
105
+ background: background,
106
+ pvalues: pvalues)
107
+
108
+ pwms.each_with_index do |pwm,index|
109
+ $stderr.puts "Motif #{pwm.name}, length: #{pwm.length} (#{index+1} of #{pwms.size}, #{index*100/pwms.size}% complete)" unless silent
110
+
111
+ # When support of onefile collections is introduced - then here should be check if name exists.
112
+ # Otherwise it should skip motif and tell you about this
113
+ # Also two command line options to fail on skipping or to skip silently should be included
114
+
115
+ info = {rough: {}, precise: {}, background: background}
116
+ skip_motif = false
117
+
118
+ fill_rough_infos = ->(pvalue, threshold, real_pvalue) do
119
+ if real_pvalue == 0
120
+ $stderr.puts "#{pwm.name} at pvalue #{pvalue} has threshold that yields real-pvalue 0 in rough mode. Rough calculation will be skipped"
121
+ else
122
+ info[:rough][pvalue] = threshold / rough_discretization
123
+ end
124
+ end
125
+
126
+ fill_precise_infos = ->(pvalue, threshold, real_pvalue) do
127
+ if real_pvalue == 0
128
+ $stderr.puts "#{pwm.name} at pvalue #{pvalue} has threshold that yields real-pvalue 0 in precise mode. Motif will be excluded from collection"
129
+ skip_motif = true
130
+ else
131
+ info[:precise][pvalue] = threshold / precise_discretization
132
+ end
133
+ end
134
+
135
+ rough_counting = PWMCounting.new(pwm.discreted(rough_discretization), background: background, max_hash_size: max_hash_size)
136
+ precise_counting = PWMCounting.new(pwm.discreted(precise_discretization), background: background, max_hash_size: max_hash_size)
137
+
138
+ if pvalue_boundary == :lower
139
+ rough_counting.thresholds(*pvalues, &fill_rough_infos)
140
+ else
141
+ rough_counting.weak_thresholds(*pvalues, &fill_rough_infos)
142
+ end
143
+
144
+ if pvalue_boundary == :lower
145
+ precise_counting.thresholds(*pvalues, &fill_precise_infos)
146
+ else
147
+ precise_counting.weak_thresholds(*pvalues,&fill_precise_infos)
148
+ end
149
+
150
+ collection << Macroape::MotifWithThresholds.new(pwm, info) unless skip_motif
151
+ end
152
+ $stderr.puts "100% complete. Saving results" unless silent
153
+ File.open(output_file, 'w') do |f|
154
+ f.puts(collection.to_yaml)
155
+ end
156
+ puts OutputInformation.new{|infos|
157
+ infos.add_parameter('P', 'P-value list', pvalues.join(','))
158
+ infos.add_parameter('VR', 'discretization value, rough', rough_discretization)
159
+ infos.add_parameter('VP', 'discretization value, precise', precise_discretization)
160
+ infos.add_parameter('PB', 'P-value boundary', pvalue_boundary)
161
+ infos.background_parameter('B', 'background', background)
162
+ }.result
163
+ rescue => err
164
+ $stderr.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
165
+ end
166
+
167
+ end
168
+ end
169
+ end