macroape 4.0.2 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. checksums.yaml +4 -4
  2. data/.gitignore +17 -17
  3. data/Gemfile +4 -4
  4. data/LICENSE +22 -22
  5. data/README.md +70 -70
  6. data/Rakefile.rb +49 -49
  7. data/TODO.txt +46 -46
  8. data/benchmark/benchmark_helper.rb +4 -4
  9. data/benchmark/similarity_benchmark.rb +52 -52
  10. data/bin/align_motifs +4 -4
  11. data/bin/eval_alignment +4 -4
  12. data/bin/eval_similarity +4 -4
  13. data/bin/find_pvalue +4 -4
  14. data/bin/find_threshold +4 -4
  15. data/bin/preprocess_collection +4 -4
  16. data/bin/scan_collection +4 -4
  17. data/lib/macroape.rb +14 -11
  18. data/lib/macroape/aligned_pair_intersection.rb +61 -62
  19. data/lib/macroape/cli.rb +191 -188
  20. data/lib/macroape/cli/align_motifs.rb +120 -100
  21. data/lib/macroape/cli/eval_alignment.rb +157 -156
  22. data/lib/macroape/cli/eval_similarity.rb +138 -137
  23. data/lib/macroape/cli/find_pvalue.rb +93 -87
  24. data/lib/macroape/cli/find_threshold.rb +103 -96
  25. data/lib/macroape/cli/preprocess_collection.rb +169 -161
  26. data/lib/macroape/cli/scan_collection.rb +171 -163
  27. data/lib/macroape/collection.rb +29 -0
  28. data/lib/macroape/motif_with_thresholds.rb +18 -0
  29. data/lib/macroape/pwm_compare.rb +39 -44
  30. data/lib/macroape/pwm_compare_aligned.rb +139 -130
  31. data/lib/macroape/{counting.rb → pwm_counting.rb} +175 -121
  32. data/lib/macroape/support/inverf.rb +13 -0
  33. data/lib/macroape/support/partial_sums.rb +17 -0
  34. data/lib/macroape/version.rb +4 -4
  35. data/macroape.gemspec +19 -19
  36. data/spec/count_distribution_spec.rb +112 -109
  37. data/spec/inverf_spec.rb +23 -0
  38. data/spec/partial_sums_spec.rb +28 -0
  39. data/spec/spec_helper.rb +11 -11
  40. data/test/align_motifs_test.rb +42 -43
  41. data/test/data/AHR_si.pwm +10 -10
  42. data/test/data/KLF3_f1.pcm +16 -16
  43. data/test/data/KLF3_f1.pwm +16 -16
  44. data/test/data/KLF4_f2.pcm +11 -11
  45. data/test/data/KLF4_f2.pwm +11 -11
  46. data/test/data/KLF4_f2_scan_results_all.txt +2 -2
  47. data/test/data/KLF4_f2_scan_results_default_cutoff.txt +1 -1
  48. data/test/data/KLF4_f2_scan_results_precise_mode.txt +2 -2
  49. data/test/data/SP1_f1.pcm +12 -12
  50. data/test/data/SP1_f1.pwm +12 -12
  51. data/test/data/SP1_f1_revcomp.pcm +12 -12
  52. data/test/data/SP1_f1_revcomp.pwm +12 -12
  53. data/test/data/medium_motif.pwm +8 -8
  54. data/test/data/short_motif.pwm +7 -7
  55. data/test/data/test_collection.yaml +231 -214
  56. data/test/data/test_collection/GABPA_f1.pwm +14 -14
  57. data/test/data/test_collection/KLF4_f2.pwm +10 -10
  58. data/test/data/test_collection/SP1_f1.pwm +12 -12
  59. data/test/data/test_collection_pcm/GABPA_f1.pcm +14 -14
  60. data/test/data/test_collection_pcm/KLF4_f2.pcm +11 -11
  61. data/test/data/test_collection_pcm/SP1_f1.pcm +12 -12
  62. data/test/data/test_collection_single_file.txt +38 -38
  63. data/test/data/test_collection_single_file_pcm.txt +37 -37
  64. data/test/data/test_collection_weak.yaml +231 -214
  65. data/test/eval_alignment_test.rb +90 -111
  66. data/test/eval_similarity_test.rb +105 -123
  67. data/test/find_pvalue_test.rb +34 -39
  68. data/test/find_threshold_test.rb +87 -91
  69. data/test/preprocess_collection_test.rb +56 -65
  70. data/test/scan_collection_test.rb +42 -48
  71. data/test/test_helper.rb +159 -160
  72. metadata +14 -10
  73. data/test/data/collection_pcm_without_thresholds.yaml +0 -188
  74. data/test/data/collection_without_thresholds.yaml +0 -188
@@ -1,137 +1,138 @@
1
- require_relative '../../macroape'
2
-
3
- module Macroape
4
- module CLI
5
- module EvalSimilarity
6
-
7
- def self.main(argv)
8
- doc = <<-EOS.strip_doc
9
- Command-line format:
10
- #{run_tool_cmd} <1st matrix pat-file> <2nd matrix pat-file> [options]
11
-
12
- Options:
13
- [-p <P-value>]
14
- [-d <discretization level>]
15
- [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
16
- [--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
17
- [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
18
- [--first-threshold <threshold for the first matrix>]
19
- [--second-threshold <threshold for the second matrix>]
20
-
21
- Examples:
22
- #{run_tool_cmd} motifs/KLF4_f2.pat motifs/SP1_f1.pat -p 0.0005 -d 100 -b 0.3,0.2,0.2,0.3
23
- EOS
24
-
25
- if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
26
- $stderr.puts doc
27
- exit
28
- end
29
-
30
- pvalue = 0.0005
31
- discretization = 10.0
32
-
33
- first_background = [1,1,1,1]
34
- second_background = [1,1,1,1]
35
-
36
- max_hash_size = 10000000
37
- max_pair_hash_size = 10000
38
- pvalue_boundary = :upper
39
-
40
- data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
41
- first_file = argv.shift
42
- second_file = argv.shift
43
- raise 'You should specify two input files' unless first_file and second_file
44
-
45
- until argv.empty?
46
- case argv.shift
47
- when '-p'
48
- pvalue = argv.shift.to_f
49
- when '-d'
50
- discretization = argv.shift.to_f
51
- when '--max-hash-size'
52
- max_hash_size = argv.shift.to_i
53
- when '--max-2d-hash-size'
54
- max_pair_hash_size = argv.shift.to_i
55
- when '-b'
56
- second_background = first_background = argv.shift.split(',').map(&:to_f)
57
- when '-b1'
58
- first_background = argv.shift.split(',').map(&:to_f)
59
- when '-b2'
60
- second_background = argv.shift.split(',').map(&:to_f)
61
- when '--boundary'
62
- pvalue_boundary = argv.shift.to_sym
63
- raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
64
- when '--first-threshold'
65
- predefined_threshold_first = argv.shift.to_f
66
- when '--second-threshold'
67
- predefined_threshold_second = argv.shift.to_f
68
- end
69
- end
70
- raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background == first_background.reverse
71
- raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless second_background == second_background.reverse
72
-
73
- if first_file == '.stdin' || second_file == '.stdin'
74
- input = $stdin.read
75
- parser = data_model.choose_parser(input).new(input)
76
- end
77
-
78
- if first_file == '.stdin'
79
- input_first = parser.parse
80
- else
81
- raise "Error! File #{first_file} don't exist" unless File.exist?(first_file)
82
- input_first = File.read(first_file)
83
- end
84
- pwm_first = data_model.new(input_first).set_parameters(background: first_background).to_pwm
85
-
86
- if second_file == '.stdin'
87
- input_second = parser.parse
88
- else
89
- raise "Error! File #{second_file} don't exist" unless File.exist?(second_file)
90
- input_second = File.read(second_file)
91
- end
92
- pwm_second = data_model.new(input_second).set_parameters(background: second_background).to_pwm
93
-
94
- pwm_first.set_parameters(background: first_background, max_hash_size: max_hash_size).discrete!(discretization)
95
- pwm_second.set_parameters(background: second_background, max_hash_size: max_hash_size).discrete!(discretization)
96
-
97
- cmp = Macroape::PWMCompare.new(pwm_first, pwm_second).set_parameters(max_pair_hash_size: max_pair_hash_size)
98
-
99
- if predefined_threshold_first
100
- threshold_first = predefined_threshold_first * discretization
101
- else
102
- if pvalue_boundary == :lower
103
- threshold_first = pwm_first.threshold(pvalue)
104
- else
105
- threshold_first = pwm_first.weak_threshold(pvalue)
106
- end
107
- end
108
-
109
- if predefined_threshold_second
110
- threshold_second = predefined_threshold_second * discretization
111
- else
112
- if pvalue_boundary == :lower
113
- threshold_second = pwm_second.threshold(pvalue)
114
- else
115
- threshold_second = pwm_second.weak_threshold(pvalue)
116
- end
117
- end
118
-
119
- info = cmp.jaccard(threshold_first, threshold_second)
120
- info.merge!(predefined_threshold_first: predefined_threshold_first,
121
- predefined_threshold_second: predefined_threshold_second,
122
- threshold_first: threshold_first.to_f / discretization,
123
- threshold_second: threshold_second.to_f / discretization,
124
- discretization: discretization,
125
- first_background: first_background,
126
- second_background: second_background,
127
- requested_pvalue: pvalue,
128
- pvalue_boundary: pvalue_boundary)
129
- puts Helper.similarity_info_string(info)
130
-
131
- rescue => err
132
- $stderr.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
133
- end
134
-
135
- end
136
- end
137
- end
1
+ require_relative '../../macroape'
2
+
3
+ module Macroape
4
+ module CLI
5
+ module EvalSimilarity
6
+
7
+ def self.main(argv)
8
+ doc = <<-EOS.strip_doc
9
+ Command-line format:
10
+ #{run_tool_cmd} <1st matrix pat-file> <2nd matrix pat-file> [options]
11
+
12
+ Options:
13
+ [-p <P-value>]
14
+ [-d <discretization level>]
15
+ [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
16
+ [--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
17
+ [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
18
+ [--first-threshold <threshold for the first matrix>]
19
+ [--second-threshold <threshold for the second matrix>]
20
+
21
+ Examples:
22
+ #{run_tool_cmd} motifs/KLF4_f2.pat motifs/SP1_f1.pat -p 0.0005 -d 100 -b 0.3,0.2,0.2,0.3
23
+ EOS
24
+
25
+ if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
26
+ $stderr.puts doc
27
+ exit
28
+ end
29
+
30
+ pvalue = 0.0005
31
+ discretization = 10.0
32
+
33
+ first_background = Bioinform::Background::Wordwise
34
+ second_background = Bioinform::Background::Wordwise
35
+
36
+ max_hash_size = 10000000
37
+ max_pair_hash_size = 10000
38
+ pvalue_boundary = :upper
39
+
40
+ data_model = argv.delete('--pcm') ? :pcm : :pwm
41
+ first_file = argv.shift
42
+ second_file = argv.shift
43
+ raise 'You should specify two input files' unless first_file and second_file
44
+
45
+ until argv.empty?
46
+ case argv.shift
47
+ when '-p'
48
+ pvalue = argv.shift.to_f
49
+ when '-d'
50
+ discretization = argv.shift.to_f
51
+ when '--max-hash-size'
52
+ max_hash_size = argv.shift.to_i
53
+ when '--max-2d-hash-size'
54
+ max_pair_hash_size = argv.shift.to_i
55
+ when '-b'
56
+ second_background = first_background = Bioinform::Background.from_string(argv.shift)
57
+ when '-b1'
58
+ first_background = Bioinform::Background.from_string(argv.shift)
59
+ when '-b2'
60
+ second_background = Bioinform::Background.from_string(argv.shift)
61
+ when '--boundary'
62
+ pvalue_boundary = argv.shift.to_sym
63
+ raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
64
+ when '--first-threshold'
65
+ predefined_threshold_first = argv.shift.to_f
66
+ when '--second-threshold'
67
+ predefined_threshold_second = argv.shift.to_f
68
+ end
69
+ end
70
+ raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background.symmetric?
71
+ raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless second_background.symmetric?
72
+
73
+ raise "Error! File #{first_file} don't exist" unless File.exist?(first_file)
74
+ input_first = File.read(first_file)
75
+ input_first = Bioinform::MatrixParser.new.parse!(input_first)
76
+
77
+ raise "Error! File #{second_file} don't exist" unless File.exist?(second_file)
78
+ input_second = File.read(second_file)
79
+ input_second = Bioinform::MatrixParser.new.parse!(input_second)
80
+
81
+ case data_model
82
+ when :pcm
83
+ pcm_first = Bioinform::MotifModel::PCM.new(input_first[:matrix]).named(input_first[:name])
84
+ pwm_first = Bioinform::ConversionAlgorithms::PCM2PWMConverter.new(pseudocount: :log, background: first_background).convert(pcm_first)
85
+ pcm_second = Bioinform::MotifModel::PCM.new(input_second[:matrix]).named(input_second[:name])
86
+ pwm_second = Bioinform::ConversionAlgorithms::PCM2PWMConverter.new(pseudocount: :log, background: second_background).convert(pcm_second)
87
+ when :pwm
88
+ pwm_first = Bioinform::MotifModel::PWM.new(input_first[:matrix]).named(input_first[:name])
89
+ pwm_second = Bioinform::MotifModel::PWM.new(input_second[:matrix]).named(input_second[:name])
90
+ end
91
+
92
+ pwm_first = pwm_first.discreted(discretization)
93
+ pwm_second = pwm_second.discreted(discretization)
94
+
95
+ counting_first = PWMCounting.new(pwm_first, background: first_background, max_hash_size: max_hash_size)
96
+ counting_second = PWMCounting.new(pwm_second, background: second_background, max_hash_size: max_hash_size)
97
+
98
+ cmp = Macroape::PWMCompare.new(counting_first, counting_second).tap{|x| x.max_pair_hash_size = max_pair_hash_size }
99
+
100
+ if predefined_threshold_first
101
+ threshold_first = predefined_threshold_first * discretization
102
+ else
103
+ if pvalue_boundary == :lower
104
+ threshold_first = counting_first.threshold(pvalue)
105
+ else
106
+ threshold_first = counting_first.weak_threshold(pvalue)
107
+ end
108
+ end
109
+
110
+ if predefined_threshold_second
111
+ threshold_second = predefined_threshold_second * discretization
112
+ else
113
+ if pvalue_boundary == :lower
114
+ threshold_second = counting_second.threshold(pvalue)
115
+ else
116
+ threshold_second = counting_second.weak_threshold(pvalue)
117
+ end
118
+ end
119
+
120
+ info = cmp.jaccard(threshold_first, threshold_second)
121
+ info.merge!(predefined_threshold_first: predefined_threshold_first,
122
+ predefined_threshold_second: predefined_threshold_second,
123
+ threshold_first: threshold_first.to_f / discretization,
124
+ threshold_second: threshold_second.to_f / discretization,
125
+ discretization: discretization,
126
+ first_background: first_background,
127
+ second_background: second_background,
128
+ requested_pvalue: pvalue,
129
+ pvalue_boundary: pvalue_boundary)
130
+ puts Helper.similarity_info_string(info)
131
+
132
+ rescue => err
133
+ $stderr.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
134
+ end
135
+
136
+ end
137
+ end
138
+ end
@@ -1,87 +1,93 @@
1
- require_relative '../../macroape'
2
-
3
- module Macroape
4
- module CLI
5
- module FindPValue
6
-
7
- def self.main(argv)
8
- doc = <<-EOS.strip_doc
9
- Command-line format:
10
- #{run_tool_cmd} <pat-file> <threshold list>... [options]
11
-
12
- Options:
13
- [-d <discretization level>]
14
- [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
15
- [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
16
-
17
- Examples:
18
- #{run_tool_cmd} motifs/KLF4_f2.pat 7.32
19
- #{run_tool_cmd} motifs/KLF4_f2.pat 7.32 4.31 5.42 -d 1000 -b 0.2,0.3,0.3,0.2
20
- EOS
21
-
22
- if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
23
- $stderr.puts doc
24
- exit
25
- end
26
-
27
- discretization = 10000
28
- background = [1,1,1,1]
29
- thresholds = []
30
- max_hash_size = 10000000
31
-
32
- data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
33
- filename = argv.shift
34
-
35
- loop do
36
- begin
37
- Float(argv.first)
38
- thresholds << argv.shift.to_f
39
- rescue
40
- raise StopIteration
41
- end
42
- end
43
-
44
- raise 'No input. You should specify input file' unless filename
45
- raise 'You should specify at least one threshold' if thresholds.empty?
46
-
47
- until argv.empty?
48
- case argv.shift
49
- when '-b'
50
- background = argv.shift.split(',').map(&:to_f)
51
- when '-d'
52
- discretization = argv.shift.to_f
53
- when '--max-hash-size'
54
- max_hash_size = argv.shift.to_i
55
- end
56
- end
57
-
58
-
59
- if filename == '.stdin'
60
- input = $stdin.read
61
- else
62
- raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
63
- input = File.read(filename)
64
- end
65
- pwm = data_model.new(input).set_parameters(background: background).to_pwm
66
- pwm.set_parameters(background: background, max_hash_size: max_hash_size).discrete!(discretization)
67
-
68
- counts = pwm.counts_by_thresholds(* thresholds.map{|count| count * discretization})
69
- infos = []
70
- thresholds.each do |threshold|
71
- count = counts[threshold * discretization]
72
- pvalue = count.to_f / pwm.vocabulary_volume
73
- infos << {threshold: threshold,
74
- number_of_recognized_words: count,
75
- pvalue: pvalue}
76
- end
77
-
78
- puts Helper.find_pvalue_info_string( infos,
79
- {discretization: discretization,
80
- background: background} )
81
- rescue => err
82
- $stderr.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
83
- end
84
-
85
- end
86
- end
87
- end
1
+ require_relative '../../macroape'
2
+
3
+ module Macroape
4
+ module CLI
5
+ module FindPValue
6
+
7
+ def self.main(argv)
8
+ doc = <<-EOS.strip_doc
9
+ Command-line format:
10
+ #{run_tool_cmd} <pat-file> <threshold list>... [options]
11
+
12
+ Options:
13
+ [-d <discretization level>]
14
+ [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
15
+ [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
16
+
17
+ Examples:
18
+ #{run_tool_cmd} motifs/KLF4_f2.pat 7.32
19
+ #{run_tool_cmd} motifs/KLF4_f2.pat 7.32 4.31 5.42 -d 1000 -b 0.2,0.3,0.3,0.2
20
+ EOS
21
+
22
+ if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
23
+ $stderr.puts doc
24
+ exit
25
+ end
26
+
27
+ discretization = 10000
28
+ background = Bioinform::Background::Wordwise
29
+ thresholds = []
30
+ max_hash_size = 10000000
31
+
32
+ data_model = argv.delete('--pcm') ? :pcm : :pwm
33
+ filename = argv.shift
34
+
35
+ loop do
36
+ begin
37
+ Float(argv.first)
38
+ thresholds << argv.shift.to_f
39
+ rescue
40
+ raise StopIteration
41
+ end
42
+ end
43
+
44
+ raise 'No input. You should specify input file' unless filename
45
+ raise 'You should specify at least one threshold' if thresholds.empty?
46
+
47
+ until argv.empty?
48
+ case argv.shift
49
+ when '-b'
50
+ background = Bioinform::Background.from_string(argv.shift)
51
+ when '-d'
52
+ discretization = argv.shift.to_f
53
+ when '--max-hash-size'
54
+ max_hash_size = argv.shift.to_i
55
+ end
56
+ end
57
+
58
+ raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
59
+ input = File.read(filename)
60
+
61
+ parser = Bioinform::MatrixParser.new
62
+ motif_data = parser.parse!(input)
63
+ case data_model
64
+ when :pcm
65
+ pcm = Bioinform::MotifModel::PCM.new(motif_data[:matrix]).named(motif_data[:name])
66
+ pwm = Bioinform::ConversionAlgorithms::PCM2PWMConverter.new(pseudocount: :log, background: background).convert(pcm)
67
+ when :pwm
68
+ pwm = Bioinform::MotifModel::PWM.new(motif_data[:matrix]).named(motif_data[:name])
69
+ end
70
+
71
+ pwm = pwm.discreted(discretization)
72
+ counting = PWMCounting.new(pwm, background: background, max_hash_size: max_hash_size)
73
+
74
+ counts = counting.counts_by_thresholds(* thresholds.map{|count| count * discretization})
75
+ infos = []
76
+ thresholds.each do |threshold|
77
+ count = counts[threshold * discretization]
78
+ pvalue = count.to_f / (counting.vocabulary_volume)
79
+ infos << {threshold: threshold,
80
+ number_of_recognized_words: count,
81
+ pvalue: pvalue}
82
+ end
83
+
84
+ puts Helper.find_pvalue_info_string(infos,
85
+ {discretization: discretization,
86
+ background: background} )
87
+ rescue => err
88
+ $stderr.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
89
+ end
90
+
91
+ end
92
+ end
93
+ end