macroape 4.0.2 → 4.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (74)
  1. checksums.yaml +4 -4
  2. data/.gitignore +17 -17
  3. data/Gemfile +4 -4
  4. data/LICENSE +22 -22
  5. data/README.md +70 -70
  6. data/Rakefile.rb +49 -49
  7. data/TODO.txt +46 -46
  8. data/benchmark/benchmark_helper.rb +4 -4
  9. data/benchmark/similarity_benchmark.rb +52 -52
  10. data/bin/align_motifs +4 -4
  11. data/bin/eval_alignment +4 -4
  12. data/bin/eval_similarity +4 -4
  13. data/bin/find_pvalue +4 -4
  14. data/bin/find_threshold +4 -4
  15. data/bin/preprocess_collection +4 -4
  16. data/bin/scan_collection +4 -4
  17. data/lib/macroape.rb +14 -11
  18. data/lib/macroape/aligned_pair_intersection.rb +61 -62
  19. data/lib/macroape/cli.rb +191 -188
  20. data/lib/macroape/cli/align_motifs.rb +120 -100
  21. data/lib/macroape/cli/eval_alignment.rb +157 -156
  22. data/lib/macroape/cli/eval_similarity.rb +138 -137
  23. data/lib/macroape/cli/find_pvalue.rb +93 -87
  24. data/lib/macroape/cli/find_threshold.rb +103 -96
  25. data/lib/macroape/cli/preprocess_collection.rb +169 -161
  26. data/lib/macroape/cli/scan_collection.rb +171 -163
  27. data/lib/macroape/collection.rb +29 -0
  28. data/lib/macroape/motif_with_thresholds.rb +18 -0
  29. data/lib/macroape/pwm_compare.rb +39 -44
  30. data/lib/macroape/pwm_compare_aligned.rb +139 -130
  31. data/lib/macroape/{counting.rb → pwm_counting.rb} +175 -121
  32. data/lib/macroape/support/inverf.rb +13 -0
  33. data/lib/macroape/support/partial_sums.rb +17 -0
  34. data/lib/macroape/version.rb +4 -4
  35. data/macroape.gemspec +19 -19
  36. data/spec/count_distribution_spec.rb +112 -109
  37. data/spec/inverf_spec.rb +23 -0
  38. data/spec/partial_sums_spec.rb +28 -0
  39. data/spec/spec_helper.rb +11 -11
  40. data/test/align_motifs_test.rb +42 -43
  41. data/test/data/AHR_si.pwm +10 -10
  42. data/test/data/KLF3_f1.pcm +16 -16
  43. data/test/data/KLF3_f1.pwm +16 -16
  44. data/test/data/KLF4_f2.pcm +11 -11
  45. data/test/data/KLF4_f2.pwm +11 -11
  46. data/test/data/KLF4_f2_scan_results_all.txt +2 -2
  47. data/test/data/KLF4_f2_scan_results_default_cutoff.txt +1 -1
  48. data/test/data/KLF4_f2_scan_results_precise_mode.txt +2 -2
  49. data/test/data/SP1_f1.pcm +12 -12
  50. data/test/data/SP1_f1.pwm +12 -12
  51. data/test/data/SP1_f1_revcomp.pcm +12 -12
  52. data/test/data/SP1_f1_revcomp.pwm +12 -12
  53. data/test/data/medium_motif.pwm +8 -8
  54. data/test/data/short_motif.pwm +7 -7
  55. data/test/data/test_collection.yaml +231 -214
  56. data/test/data/test_collection/GABPA_f1.pwm +14 -14
  57. data/test/data/test_collection/KLF4_f2.pwm +10 -10
  58. data/test/data/test_collection/SP1_f1.pwm +12 -12
  59. data/test/data/test_collection_pcm/GABPA_f1.pcm +14 -14
  60. data/test/data/test_collection_pcm/KLF4_f2.pcm +11 -11
  61. data/test/data/test_collection_pcm/SP1_f1.pcm +12 -12
  62. data/test/data/test_collection_single_file.txt +38 -38
  63. data/test/data/test_collection_single_file_pcm.txt +37 -37
  64. data/test/data/test_collection_weak.yaml +231 -214
  65. data/test/eval_alignment_test.rb +90 -111
  66. data/test/eval_similarity_test.rb +105 -123
  67. data/test/find_pvalue_test.rb +34 -39
  68. data/test/find_threshold_test.rb +87 -91
  69. data/test/preprocess_collection_test.rb +56 -65
  70. data/test/scan_collection_test.rb +42 -48
  71. data/test/test_helper.rb +159 -160
  72. metadata +14 -10
  73. data/test/data/collection_pcm_without_thresholds.yaml +0 -188
  74. data/test/data/collection_without_thresholds.yaml +0 -188
@@ -1,137 +1,138 @@
1
- require_relative '../../macroape'
2
-
3
- module Macroape
4
- module CLI
5
- module EvalSimilarity
6
-
7
- def self.main(argv)
8
- doc = <<-EOS.strip_doc
9
- Command-line format:
10
- #{run_tool_cmd} <1st matrix pat-file> <2nd matrix pat-file> [options]
11
-
12
- Options:
13
- [-p <P-value>]
14
- [-d <discretization level>]
15
- [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
16
- [--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
17
- [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
18
- [--first-threshold <threshold for the first matrix>]
19
- [--second-threshold <threshold for the second matrix>]
20
-
21
- Examples:
22
- #{run_tool_cmd} motifs/KLF4_f2.pat motifs/SP1_f1.pat -p 0.0005 -d 100 -b 0.3,0.2,0.2,0.3
23
- EOS
24
-
25
- if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
26
- $stderr.puts doc
27
- exit
28
- end
29
-
30
- pvalue = 0.0005
31
- discretization = 10.0
32
-
33
- first_background = [1,1,1,1]
34
- second_background = [1,1,1,1]
35
-
36
- max_hash_size = 10000000
37
- max_pair_hash_size = 10000
38
- pvalue_boundary = :upper
39
-
40
- data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
41
- first_file = argv.shift
42
- second_file = argv.shift
43
- raise 'You should specify two input files' unless first_file and second_file
44
-
45
- until argv.empty?
46
- case argv.shift
47
- when '-p'
48
- pvalue = argv.shift.to_f
49
- when '-d'
50
- discretization = argv.shift.to_f
51
- when '--max-hash-size'
52
- max_hash_size = argv.shift.to_i
53
- when '--max-2d-hash-size'
54
- max_pair_hash_size = argv.shift.to_i
55
- when '-b'
56
- second_background = first_background = argv.shift.split(',').map(&:to_f)
57
- when '-b1'
58
- first_background = argv.shift.split(',').map(&:to_f)
59
- when '-b2'
60
- second_background = argv.shift.split(',').map(&:to_f)
61
- when '--boundary'
62
- pvalue_boundary = argv.shift.to_sym
63
- raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
64
- when '--first-threshold'
65
- predefined_threshold_first = argv.shift.to_f
66
- when '--second-threshold'
67
- predefined_threshold_second = argv.shift.to_f
68
- end
69
- end
70
- raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background == first_background.reverse
71
- raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless second_background == second_background.reverse
72
-
73
- if first_file == '.stdin' || second_file == '.stdin'
74
- input = $stdin.read
75
- parser = data_model.choose_parser(input).new(input)
76
- end
77
-
78
- if first_file == '.stdin'
79
- input_first = parser.parse
80
- else
81
- raise "Error! File #{first_file} don't exist" unless File.exist?(first_file)
82
- input_first = File.read(first_file)
83
- end
84
- pwm_first = data_model.new(input_first).set_parameters(background: first_background).to_pwm
85
-
86
- if second_file == '.stdin'
87
- input_second = parser.parse
88
- else
89
- raise "Error! File #{second_file} don't exist" unless File.exist?(second_file)
90
- input_second = File.read(second_file)
91
- end
92
- pwm_second = data_model.new(input_second).set_parameters(background: second_background).to_pwm
93
-
94
- pwm_first.set_parameters(background: first_background, max_hash_size: max_hash_size).discrete!(discretization)
95
- pwm_second.set_parameters(background: second_background, max_hash_size: max_hash_size).discrete!(discretization)
96
-
97
- cmp = Macroape::PWMCompare.new(pwm_first, pwm_second).set_parameters(max_pair_hash_size: max_pair_hash_size)
98
-
99
- if predefined_threshold_first
100
- threshold_first = predefined_threshold_first * discretization
101
- else
102
- if pvalue_boundary == :lower
103
- threshold_first = pwm_first.threshold(pvalue)
104
- else
105
- threshold_first = pwm_first.weak_threshold(pvalue)
106
- end
107
- end
108
-
109
- if predefined_threshold_second
110
- threshold_second = predefined_threshold_second * discretization
111
- else
112
- if pvalue_boundary == :lower
113
- threshold_second = pwm_second.threshold(pvalue)
114
- else
115
- threshold_second = pwm_second.weak_threshold(pvalue)
116
- end
117
- end
118
-
119
- info = cmp.jaccard(threshold_first, threshold_second)
120
- info.merge!(predefined_threshold_first: predefined_threshold_first,
121
- predefined_threshold_second: predefined_threshold_second,
122
- threshold_first: threshold_first.to_f / discretization,
123
- threshold_second: threshold_second.to_f / discretization,
124
- discretization: discretization,
125
- first_background: first_background,
126
- second_background: second_background,
127
- requested_pvalue: pvalue,
128
- pvalue_boundary: pvalue_boundary)
129
- puts Helper.similarity_info_string(info)
130
-
131
- rescue => err
132
- $stderr.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
133
- end
134
-
135
- end
136
- end
137
- end
1
+ require_relative '../../macroape'
2
+
3
+ module Macroape
4
+ module CLI
5
+ module EvalSimilarity
6
+
7
+ def self.main(argv)
8
+ doc = <<-EOS.strip_doc
9
+ Command-line format:
10
+ #{run_tool_cmd} <1st matrix pat-file> <2nd matrix pat-file> [options]
11
+
12
+ Options:
13
+ [-p <P-value>]
14
+ [-d <discretization level>]
15
+ [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
16
+ [--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
17
+ [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
18
+ [--first-threshold <threshold for the first matrix>]
19
+ [--second-threshold <threshold for the second matrix>]
20
+
21
+ Examples:
22
+ #{run_tool_cmd} motifs/KLF4_f2.pat motifs/SP1_f1.pat -p 0.0005 -d 100 -b 0.3,0.2,0.2,0.3
23
+ EOS
24
+
25
+ if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
26
+ $stderr.puts doc
27
+ exit
28
+ end
29
+
30
+ pvalue = 0.0005
31
+ discretization = 10.0
32
+
33
+ first_background = Bioinform::Background::Wordwise
34
+ second_background = Bioinform::Background::Wordwise
35
+
36
+ max_hash_size = 10000000
37
+ max_pair_hash_size = 10000
38
+ pvalue_boundary = :upper
39
+
40
+ data_model = argv.delete('--pcm') ? :pcm : :pwm
41
+ first_file = argv.shift
42
+ second_file = argv.shift
43
+ raise 'You should specify two input files' unless first_file and second_file
44
+
45
+ until argv.empty?
46
+ case argv.shift
47
+ when '-p'
48
+ pvalue = argv.shift.to_f
49
+ when '-d'
50
+ discretization = argv.shift.to_f
51
+ when '--max-hash-size'
52
+ max_hash_size = argv.shift.to_i
53
+ when '--max-2d-hash-size'
54
+ max_pair_hash_size = argv.shift.to_i
55
+ when '-b'
56
+ second_background = first_background = Bioinform::Background.from_string(argv.shift)
57
+ when '-b1'
58
+ first_background = Bioinform::Background.from_string(argv.shift)
59
+ when '-b2'
60
+ second_background = Bioinform::Background.from_string(argv.shift)
61
+ when '--boundary'
62
+ pvalue_boundary = argv.shift.to_sym
63
+ raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
64
+ when '--first-threshold'
65
+ predefined_threshold_first = argv.shift.to_f
66
+ when '--second-threshold'
67
+ predefined_threshold_second = argv.shift.to_f
68
+ end
69
+ end
70
+ raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background.symmetric?
71
+ raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless second_background.symmetric?
72
+
73
+ raise "Error! File #{first_file} don't exist" unless File.exist?(first_file)
74
+ input_first = File.read(first_file)
75
+ input_first = Bioinform::MatrixParser.new.parse!(input_first)
76
+
77
+ raise "Error! File #{second_file} don't exist" unless File.exist?(second_file)
78
+ input_second = File.read(second_file)
79
+ input_second = Bioinform::MatrixParser.new.parse!(input_second)
80
+
81
+ case data_model
82
+ when :pcm
83
+ pcm_first = Bioinform::MotifModel::PCM.new(input_first[:matrix]).named(input_first[:name])
84
+ pwm_first = Bioinform::ConversionAlgorithms::PCM2PWMConverter.new(pseudocount: :log, background: first_background).convert(pcm_first)
85
+ pcm_second = Bioinform::MotifModel::PCM.new(input_second[:matrix]).named(input_second[:name])
86
+ pwm_second = Bioinform::ConversionAlgorithms::PCM2PWMConverter.new(pseudocount: :log, background: second_background).convert(pcm_second)
87
+ when :pwm
88
+ pwm_first = Bioinform::MotifModel::PWM.new(input_first[:matrix]).named(input_first[:name])
89
+ pwm_second = Bioinform::MotifModel::PWM.new(input_second[:matrix]).named(input_second[:name])
90
+ end
91
+
92
+ pwm_first = pwm_first.discreted(discretization)
93
+ pwm_second = pwm_second.discreted(discretization)
94
+
95
+ counting_first = PWMCounting.new(pwm_first, background: first_background, max_hash_size: max_hash_size)
96
+ counting_second = PWMCounting.new(pwm_second, background: second_background, max_hash_size: max_hash_size)
97
+
98
+ cmp = Macroape::PWMCompare.new(counting_first, counting_second).tap{|x| x.max_pair_hash_size = max_pair_hash_size }
99
+
100
+ if predefined_threshold_first
101
+ threshold_first = predefined_threshold_first * discretization
102
+ else
103
+ if pvalue_boundary == :lower
104
+ threshold_first = counting_first.threshold(pvalue)
105
+ else
106
+ threshold_first = counting_first.weak_threshold(pvalue)
107
+ end
108
+ end
109
+
110
+ if predefined_threshold_second
111
+ threshold_second = predefined_threshold_second * discretization
112
+ else
113
+ if pvalue_boundary == :lower
114
+ threshold_second = counting_second.threshold(pvalue)
115
+ else
116
+ threshold_second = counting_second.weak_threshold(pvalue)
117
+ end
118
+ end
119
+
120
+ info = cmp.jaccard(threshold_first, threshold_second)
121
+ info.merge!(predefined_threshold_first: predefined_threshold_first,
122
+ predefined_threshold_second: predefined_threshold_second,
123
+ threshold_first: threshold_first.to_f / discretization,
124
+ threshold_second: threshold_second.to_f / discretization,
125
+ discretization: discretization,
126
+ first_background: first_background,
127
+ second_background: second_background,
128
+ requested_pvalue: pvalue,
129
+ pvalue_boundary: pvalue_boundary)
130
+ puts Helper.similarity_info_string(info)
131
+
132
+ rescue => err
133
+ $stderr.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
134
+ end
135
+
136
+ end
137
+ end
138
+ end
@@ -1,87 +1,93 @@
1
- require_relative '../../macroape'
2
-
3
- module Macroape
4
- module CLI
5
- module FindPValue
6
-
7
- def self.main(argv)
8
- doc = <<-EOS.strip_doc
9
- Command-line format:
10
- #{run_tool_cmd} <pat-file> <threshold list>... [options]
11
-
12
- Options:
13
- [-d <discretization level>]
14
- [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
15
- [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
16
-
17
- Examples:
18
- #{run_tool_cmd} motifs/KLF4_f2.pat 7.32
19
- #{run_tool_cmd} motifs/KLF4_f2.pat 7.32 4.31 5.42 -d 1000 -b 0.2,0.3,0.3,0.2
20
- EOS
21
-
22
- if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
23
- $stderr.puts doc
24
- exit
25
- end
26
-
27
- discretization = 10000
28
- background = [1,1,1,1]
29
- thresholds = []
30
- max_hash_size = 10000000
31
-
32
- data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
33
- filename = argv.shift
34
-
35
- loop do
36
- begin
37
- Float(argv.first)
38
- thresholds << argv.shift.to_f
39
- rescue
40
- raise StopIteration
41
- end
42
- end
43
-
44
- raise 'No input. You should specify input file' unless filename
45
- raise 'You should specify at least one threshold' if thresholds.empty?
46
-
47
- until argv.empty?
48
- case argv.shift
49
- when '-b'
50
- background = argv.shift.split(',').map(&:to_f)
51
- when '-d'
52
- discretization = argv.shift.to_f
53
- when '--max-hash-size'
54
- max_hash_size = argv.shift.to_i
55
- end
56
- end
57
-
58
-
59
- if filename == '.stdin'
60
- input = $stdin.read
61
- else
62
- raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
63
- input = File.read(filename)
64
- end
65
- pwm = data_model.new(input).set_parameters(background: background).to_pwm
66
- pwm.set_parameters(background: background, max_hash_size: max_hash_size).discrete!(discretization)
67
-
68
- counts = pwm.counts_by_thresholds(* thresholds.map{|count| count * discretization})
69
- infos = []
70
- thresholds.each do |threshold|
71
- count = counts[threshold * discretization]
72
- pvalue = count.to_f / pwm.vocabulary_volume
73
- infos << {threshold: threshold,
74
- number_of_recognized_words: count,
75
- pvalue: pvalue}
76
- end
77
-
78
- puts Helper.find_pvalue_info_string( infos,
79
- {discretization: discretization,
80
- background: background} )
81
- rescue => err
82
- $stderr.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
83
- end
84
-
85
- end
86
- end
87
- end
1
+ require_relative '../../macroape'
2
+
3
+ module Macroape
4
+ module CLI
5
+ module FindPValue
6
+
7
+ def self.main(argv)
8
+ doc = <<-EOS.strip_doc
9
+ Command-line format:
10
+ #{run_tool_cmd} <pat-file> <threshold list>... [options]
11
+
12
+ Options:
13
+ [-d <discretization level>]
14
+ [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
15
+ [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
16
+
17
+ Examples:
18
+ #{run_tool_cmd} motifs/KLF4_f2.pat 7.32
19
+ #{run_tool_cmd} motifs/KLF4_f2.pat 7.32 4.31 5.42 -d 1000 -b 0.2,0.3,0.3,0.2
20
+ EOS
21
+
22
+ if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
23
+ $stderr.puts doc
24
+ exit
25
+ end
26
+
27
+ discretization = 10000
28
+ background = Bioinform::Background::Wordwise
29
+ thresholds = []
30
+ max_hash_size = 10000000
31
+
32
+ data_model = argv.delete('--pcm') ? :pcm : :pwm
33
+ filename = argv.shift
34
+
35
+ loop do
36
+ begin
37
+ Float(argv.first)
38
+ thresholds << argv.shift.to_f
39
+ rescue
40
+ raise StopIteration
41
+ end
42
+ end
43
+
44
+ raise 'No input. You should specify input file' unless filename
45
+ raise 'You should specify at least one threshold' if thresholds.empty?
46
+
47
+ until argv.empty?
48
+ case argv.shift
49
+ when '-b'
50
+ background = Bioinform::Background.from_string(argv.shift)
51
+ when '-d'
52
+ discretization = argv.shift.to_f
53
+ when '--max-hash-size'
54
+ max_hash_size = argv.shift.to_i
55
+ end
56
+ end
57
+
58
+ raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
59
+ input = File.read(filename)
60
+
61
+ parser = Bioinform::MatrixParser.new
62
+ motif_data = parser.parse!(input)
63
+ case data_model
64
+ when :pcm
65
+ pcm = Bioinform::MotifModel::PCM.new(motif_data[:matrix]).named(motif_data[:name])
66
+ pwm = Bioinform::ConversionAlgorithms::PCM2PWMConverter.new(pseudocount: :log, background: background).convert(pcm)
67
+ when :pwm
68
+ pwm = Bioinform::MotifModel::PWM.new(motif_data[:matrix]).named(motif_data[:name])
69
+ end
70
+
71
+ pwm = pwm.discreted(discretization)
72
+ counting = PWMCounting.new(pwm, background: background, max_hash_size: max_hash_size)
73
+
74
+ counts = counting.counts_by_thresholds(* thresholds.map{|count| count * discretization})
75
+ infos = []
76
+ thresholds.each do |threshold|
77
+ count = counts[threshold * discretization]
78
+ pvalue = count.to_f / (counting.vocabulary_volume)
79
+ infos << {threshold: threshold,
80
+ number_of_recognized_words: count,
81
+ pvalue: pvalue}
82
+ end
83
+
84
+ puts Helper.find_pvalue_info_string(infos,
85
+ {discretization: discretization,
86
+ background: background} )
87
+ rescue => err
88
+ $stderr.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
89
+ end
90
+
91
+ end
92
+ end
93
+ end