macroape 3.3.3 → 3.3.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. data/.gitignore +1 -0
  2. data/Rakefile.rb +7 -22
  3. data/TODO.txt +7 -6
  4. data/bin/align_motifs +4 -0
  5. data/bin/eval_alignment +2 -1
  6. data/bin/eval_similarity +2 -1
  7. data/bin/find_pvalue +2 -1
  8. data/bin/find_threshold +2 -1
  9. data/bin/preprocess_collection +2 -1
  10. data/bin/scan_collection +2 -1
  11. data/lib/macroape/aligned_pair_intersection.rb +2 -3
  12. data/lib/macroape/cli/align_motifs.rb +49 -0
  13. data/lib/macroape/cli/eval_alignment.rb +124 -0
  14. data/lib/macroape/cli/eval_similarity.rb +107 -0
  15. data/lib/macroape/cli/find_pvalue.rb +89 -0
  16. data/lib/macroape/cli/find_threshold.rb +84 -0
  17. data/lib/macroape/cli/preprocess_collection.rb +123 -0
  18. data/lib/macroape/cli/scan_collection.rb +141 -0
  19. data/lib/macroape/cli.rb +5 -0
  20. data/lib/macroape/counting.rb +15 -1
  21. data/lib/macroape/pwm_compare.rb +21 -1
  22. data/lib/macroape/pwm_compare_aligned.rb +21 -0
  23. data/lib/macroape/version.rb +1 -1
  24. data/macroape.gemspec +1 -1
  25. data/test/align_motifs_test.rb +12 -0
  26. data/test/data/KLF3_f1.pat +16 -0
  27. data/test/data/KLF3_f1.pcm +16 -0
  28. data/test/data/KLF4_f2.pcm +11 -0
  29. data/test/data/SP1_f1.pat +11 -11
  30. data/test/data/SP1_f1.pcm +12 -0
  31. data/test/data/SP1_f1_revcomp.pat +11 -11
  32. data/test/data/SP1_f1_revcomp.pcm +12 -0
  33. data/test/data/test_collection/SP1_f1.pat +11 -11
  34. data/test/data/test_collection.yaml +49 -109
  35. data/test/data/test_collection_pcm/GABPA_f1.pcm +14 -0
  36. data/test/data/test_collection_pcm/KLF4_f2.pcm +11 -0
  37. data/test/data/test_collection_pcm/SP1_f1.pcm +12 -0
  38. data/test/data/test_collection_single_file.txt +38 -0
  39. data/test/data/test_collection_single_file_pcm.txt +38 -0
  40. data/test/eval_alignment_test.rb +31 -0
  41. data/test/eval_similarity_test.rb +28 -13
  42. data/test/find_pvalue_test.rb +10 -13
  43. data/test/find_threshold_test.rb +10 -5
  44. data/test/preprocess_collection_test.rb +36 -2
  45. data/test/scan_collection_test.rb +9 -4
  46. data/test/test_helper.rb +61 -2
  47. metadata +38 -12
  48. data/lib/macroape/exec/eval_alignment.rb +0 -125
  49. data/lib/macroape/exec/eval_similarity.rb +0 -108
  50. data/lib/macroape/exec/find_pvalue.rb +0 -81
  51. data/lib/macroape/exec/find_threshold.rb +0 -77
  52. data/lib/macroape/exec/preprocess_collection.rb +0 -101
  53. data/lib/macroape/exec/scan_collection.rb +0 -124
  54. data/test/eval_alignment_similarity_test.rb +0 -20
data/.gitignore CHANGED
@@ -15,3 +15,4 @@ spec/reports
15
15
  test/tmp
16
16
  test/version_tmp
17
17
  tmp
18
+ benchmark/
data/Rakefile.rb CHANGED
@@ -1,34 +1,19 @@
1
1
  #!/usr/bin/env rake
2
2
  require "bundler/gem_tasks"
3
3
  require 'rspec/core/rake_task'
4
+ require 'rake/testtask'
4
5
 
5
6
  namespace :spec do
6
- task :find_threshold do
7
- system("ruby -I ./test test/find_threshold_test.rb")
8
- end
9
- task :find_pvalue do
10
- system("ruby -I ./test test/find_pvalue_test.rb")
11
- end
12
- task :eval_similarity do
13
- system("ruby -I ./test test/eval_similarity_test.rb")
14
- end
15
- task :eval_alignment_similarity do
16
- system("ruby -I ./test test/eval_alignment_similarity_test.rb")
17
- end
18
- task :preprocess_collection do
19
- system("ruby -I ./test test/preprocess_collection_test.rb")
20
- end
21
- task :scan_collection do
22
- system("ruby -I ./test test/scan_collection_test.rb")
23
- end
24
- task :tests => [:find_threshold, :find_pvalue, :eval_similarity,
25
- :eval_alignment_similarity, :scan_collection, :preprocess_collection]
26
-
7
+ Rake::TestTask.new do |t|
8
+ t.libs << "test"
9
+ t.test_files = FileList['test/*_test.rb']
10
+ t.verbose = true
11
+ end
27
12
  RSpec::Core::RakeTask.new
28
13
  end
29
14
 
30
15
  desc 'Test all functionality of gem executables'
31
- task :spec => ['spec:tests', 'spec:spec']
16
+ task :spec => ['spec:test', 'spec:spec']
32
17
 
33
18
  namespace :benchmark do
34
19
  task :run do
data/TODO.txt CHANGED
@@ -1,16 +1,13 @@
1
1
  Absolutely necessary:
2
- Repair obtaining matrix not only from files but from stdin
3
- Make it available to load collections in preprocess_collection from single file (and from stdin of certainly)
4
- Make it available to load PCM files with (it should be first preprocessed to PWMs in a standardized way) -- may be it's better to use pipeline
2
+ (already works in preprocess_collection and align_motifs) Make it possible to load PCM files (they should first be preprocessed to PWMs in a standardized way) -- maybe it's better to use a pipeline
5
3
 
6
4
  Specs and tests:
7
5
  create spec on use of MaxHashSize, MaxHashSizeDouble
8
6
  create spec for testing case when {real_pvalue == 0, threshold == best_score + 1}
9
- create test for getting PWMs from stdin
10
7
  create test for nonuniform word-wise background([1,1,1,1]) and for different backgrounds
11
8
 
12
- Ideas to inctrease perfomance:
13
- - Add shifting matrix elements to zero after discreeting - in such case worst suffix is zero at all positions
9
+ Ideas to increase performance:
10
+ - Shift matrix elements to zero after discretization - in that case the worst suffix is zero at all positions (??! it can significantly obscure the code because thresholds will change too, and I can't tell which is better: a slight performance optimization or conciseness of code)
14
11
  - (?) Rearrange rows by decreasing DIC in the aligned pair of matrices before counting
15
12
  - Create a Java extension for alignment_intersection methods in order to increase performance
16
13
  - Possibly the algorithm shouldn't use a hash but instead make two passes: first it determines the possible hash scores for every length (if the worst suffix is always zero, it's a flat space of scores at all PWM prefix lengths) of each PWM separately; after that we can work with arrays that use such scores as indices via an additional substructure
@@ -18,3 +15,7 @@ Ideas to inctrease perfomance:
18
15
  Usability issues:
19
16
  review the Collection class. Right now it's completely useless. Maybe it should even live in another gem (with blackjack and clustering)
20
17
 
18
+ remove .stdin placeholder. Use tty? method instead
19
+
20
+ use OptionParser (??? can OptionParser get stub ARGV ???)
21
+ make options more uniform so that some of them are reusable (and the question: can I apply two option parsers consecutively?)
data/bin/align_motifs ADDED
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'macroape/cli/align_motifs'
4
+ Macroape::CLI::AlignMotifs.main(ARGV)
data/bin/eval_alignment CHANGED
@@ -1,3 +1,4 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require 'macroape/exec/eval_alignment.rb'
3
+ require 'macroape/cli/eval_alignment'
4
+ Macroape::CLI::EvalAlignment.main(ARGV)
data/bin/eval_similarity CHANGED
@@ -1,3 +1,4 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require 'macroape/exec/eval_similarity.rb'
3
+ require 'macroape/cli/eval_similarity'
4
+ Macroape::CLI::EvalSimilarity.main(ARGV)
data/bin/find_pvalue CHANGED
@@ -1,3 +1,4 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require 'macroape/exec/find_pvalue.rb'
3
+ require 'macroape/cli/find_pvalue'
4
+ Macroape::CLI::FindPValue.main(ARGV)
data/bin/find_threshold CHANGED
@@ -1,3 +1,4 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require 'macroape/exec/find_threshold.rb'
3
+ require 'macroape/cli/find_threshold'
4
+ Macroape::CLI::FindThreshold.main(ARGV)
data/bin/preprocess_collection CHANGED
@@ -1,3 +1,4 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require 'macroape/exec/preprocess_collection.rb'
3
+ require 'macroape/cli/preprocess_collection'
4
+ Macroape::CLI::PreprocessCollection.main(ARGV)
data/bin/scan_collection CHANGED
@@ -1,3 +1,4 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require 'macroape/exec/scan_collection.rb'
3
+ require 'macroape/cli/scan_collection'
4
+ Macroape::CLI::ScanCollection.main(ARGV)
data/lib/macroape/aligned_pair_intersection.rb CHANGED
@@ -1,6 +1,5 @@
1
1
  module Macroape
2
2
  class PWMCompareAligned
3
-
4
3
  # unoptimized version of this and related methods
5
4
  def counts_for_two_matrices(threshold_first, threshold_second)
6
5
  # just not to call method each time
@@ -27,11 +26,11 @@ module Macroape
27
26
  scores = { 0 => {0 => 1} }
28
27
  length.times do |column|
29
28
  new_scores = recalc_score_hash(scores,
30
- @first.matrix[column], @second.matrix[column],
29
+ first.matrix[column], second.matrix[column],
31
30
  threshold_first - first.best_suffix(column + 1),
32
31
  threshold_second - second.best_suffix(column + 1), &count_contribution_block)
33
32
  scores.replace(new_scores)
34
- if defined?(MaxHashSizeDouble) && scores.inject(0){|sum,hsh|sum + hsh.size} > MaxHashSizeDouble
33
+ if max_hash_size && scores.inject(0){|sum,hsh|sum + hsh.size} > max_hash_size
35
34
  raise 'Hash overflow in Macroape::AlignedPairIntersection#counts_for_two_matrices_with_different_probabilities'
36
35
  end
37
36
  end
data/lib/macroape/cli/align_motifs.rb ADDED
@@ -0,0 +1,49 @@
1
+ require 'macroape'
2
+
3
+ module Macroape
4
+ module CLI
5
+ module AlignMotifs
6
+
7
+ def self.main(argv)
8
+ help_string = %q{
9
+ Usage:
10
+ ruby align_motifs pwm1_file pwm2_file pwm3_file
11
+ ruby align_motifs pcm1_file pcm2_file pcm3_file --pcm
12
+ Output:
13
+ pwm_1_file shift_1 orientation_1
14
+ pwm_2_file shift_2 orientation_2
15
+ pwm_3_file shift_3 orientation_3
16
+ }
17
+
18
+ if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
19
+ STDERR.puts help_string
20
+ exit
21
+ end
22
+
23
+
24
+ data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
25
+ leader = argv.shift
26
+ background = [1,1,1,1]
27
+ discretization = 10
28
+ pvalue = 0.0005
29
+
30
+ shifts = {leader => [0,:direct]}
31
+ pwm_first = data_model.new(File.read(leader)).to_pwm.background!(background).discrete!(discretization)
32
+ argv.each do |motif_name|
33
+ pwm_second = data_model.new(File.read(motif_name)).to_pwm.background!(background).discrete!(discretization)
34
+ cmp = Macroape::PWMCompare.new(pwm_first, pwm_second)
35
+ info = cmp.jaccard_by_pvalue(pvalue)
36
+ shifts[motif_name] = [info[:shift], info[:orientation]]
37
+ end
38
+
39
+ shifts.each do |motif_name, (shift,orientation)|
40
+ puts "#{motif_name}\t#{shift}\t#{orientation}"
41
+ end
42
+
43
+ rescue => err
44
+ STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
45
+ end
46
+
47
+ end
48
+ end
49
+ end
data/lib/macroape/cli/eval_alignment.rb ADDED
@@ -0,0 +1,124 @@
1
+ require 'macroape'
2
+
3
+ module Macroape
4
+ module CLI
5
+ module EvalAlignment
6
+
7
+ def self.main(argv)
8
+ help_string = %q{
9
+ Command-line format:
10
+ ruby eval_alignment.rb <1st matrix pat-file> <2nd matrix pat-file> <shift> <orientation(direct/revcomp)> [options]
11
+ type <1st matrix pat-file> <2nd matrix pat-file> | ruby eval_alignment.rb .stdin .stdin <shift> <orientation(direct/revcomp)> [options]
12
+ or in linux
13
+ cat <1st matrix pat-file> <2nd matrix pat-file> | ruby eval_alignment.rb .stdin .stdin <shift> <orientation(direct/revcomp)> [options]
14
+
15
+ Options:
16
+ [-p <P-value>]
17
+ [-d <discretization level>]
18
+ [-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]
19
+
20
+ Output format:
21
+ <jaccard similarity coefficient>
22
+ <number of words recognized by both 1st and 2nd matrices | probability to draw a word recognized by both 1st and 2nd matrices> <length of the given alignment>
23
+ <aligned 1st matrix>
24
+ <aligned 2nd matrix>
25
+ <shift> <orientation>
26
+
27
+ Examples:
28
+ ruby eval_alignment.rb motifs/KLF4_f2.pat motifs/SP1_f1.pat -1 direct -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
29
+ or on windows
30
+ type motifs/SP1.pat | ruby eval_alignment.rb motifs/KLF4.pat .stdin 0 revcomp -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
31
+ or in linux
32
+ cat motifs/KLF4.pat motifs/SP1.pat | ruby eval_alignment.rb .stdin .stdin 3 direct -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
33
+ }
34
+
35
+ if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
36
+ STDERR.puts help_string
37
+ exit
38
+ end
39
+
40
+ pvalue = 0.0005
41
+ discretization = 10
42
+
43
+ first_background = [1,1,1,1]
44
+ second_background = [1,1,1,1]
45
+ max_hash_size = 1000000
46
+ max_pair_hash_size = 1000
47
+
48
+
49
+ first_file = argv.shift
50
+ second_file = argv.shift
51
+
52
+ shift = argv.shift
53
+ orientation = argv.shift
54
+
55
+ raise "You'd specify two input sources (each is filename or .stdin)" unless first_file and second_file
56
+ raise 'You\'d specify shift' unless shift
57
+ raise 'You\'d specify orientation' unless orientation
58
+
59
+ shift = shift.to_i
60
+ orientation = orientation.to_sym
61
+
62
+ case orientation
63
+ when :direct
64
+ reverse = false
65
+ when :revcomp
66
+ reverse = true
67
+ else
68
+ raise 'Unknown orientation(direct/revcomp)'
69
+ end
70
+
71
+
72
+ until argv.empty?
73
+ case argv.shift
74
+ when '-p'
75
+ pvalue = argv.shift.to_f
76
+ when '-d'
77
+ discretization = argv.shift.to_f
78
+ when '-m'
79
+ max_hash_size = argv.shift.to_i
80
+ when '-md'
81
+ max_pair_hash_size = argv.shift.to_i
82
+ when '-b'
83
+ second_background = first_background = argv.shift(4).map(&:to_f)
84
+ when '-b1'
85
+ first_background = argv.shift(4).map(&:to_f)
86
+ when '-b2'
87
+ second_background = argv.shift(4).map(&:to_f)
88
+ end
89
+ end
90
+ raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background == first_background.reverse
91
+ raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless second_background == second_background.reverse
92
+
93
+ parser = Bioinform::StringParser.new($stdin.read) if first_file == '.stdin' || second_file == '.stdin'
94
+
95
+ if first_file == '.stdin'
96
+ pwm_first = Bioinform::PWM.new( parser.parse )
97
+ else
98
+ raise "Error! File #{first_file} don't exist" unless File.exist?(first_file)
99
+ pwm_first = Bioinform::PWM.new(File.read(first_file))
100
+ end
101
+
102
+ if second_file == '.stdin'
103
+ pwm_second = Bioinform::PWM.new( parser.parse )
104
+ else
105
+ raise "Error! File #{second_file} don't exist" unless File.exist?(second_file)
106
+ pwm_second = Bioinform::PWM.new(File.read(second_file))
107
+ end
108
+
109
+ pwm_first.background!(first_background).max_hash_size!(max_hash_size).discrete!(discretization)
110
+ pwm_second.background!(second_background).max_hash_size!(max_hash_size).discrete!(discretization)
111
+
112
+ cmp = Macroape::PWMCompareAligned.new(pwm_first, pwm_second, shift, orientation).max_hash_size(max_pair_hash_size)
113
+
114
+ info = cmp.alignment_infos.merge( cmp.jaccard_by_pvalue(pvalue) )
115
+
116
+ puts "#{info[:similarity]}\n#{info[:recognized_by_both]}\t#{info[:alignment_length]}\n#{info[:text]}\n#{info[:shift]}\t#{info[:orientation]}"
117
+
118
+ rescue => err
119
+ STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
120
+ end
121
+
122
+ end
123
+ end
124
+ end
data/lib/macroape/cli/eval_similarity.rb ADDED
@@ -0,0 +1,107 @@
1
+ require 'macroape'
2
+
3
+ module Macroape
4
+ module CLI
5
+ module EvalSimilarity
6
+
7
+ def self.main(argv)
8
+ help_string = %q{
9
+ Command-line format:
10
+ ruby eval_similarity.rb <1st matrix pat-file> <2nd matrix pat-file> [options]
11
+ or on windows
12
+ type <1st matrix pat-file> <2nd matrix pat-file> | ruby eval_similarity.rb .stdin .stdin [options]
13
+ or in linux
14
+ cat <1st matrix pat-file> <2nd matrix pat-file> | ruby eval_similarity.rb .stdin .stdin [options]
15
+
16
+ Options:
17
+ [-p <P-value>]
18
+ [-d <discretization level>]
19
+ [-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]
20
+
21
+ Output has format:
22
+ <jaccard similarity coefficient>
23
+ <number of words recognized by both 1st and 2nd matrices | probability to draw a word recognized by both 1st and 2nd matrices> <length of the optimal alignment>
24
+ <optimal alignment, the 1st matrix>
25
+ <optimal alignment, the 2nd matrix>
26
+ <shift> <orientation>
27
+
28
+ Examples:
29
+ ruby eval_similarity.rb motifs/KLF4.pat motifs/SP1.pat -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
30
+ or on windows
31
+ type motifs/SP1.pat | ruby eval_similarity.rb motifs/KLF4.pat .stdin -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
32
+ or in linux
33
+ cat motifs/KLF4.pat motifs/SP1.pat | ruby eval_similarity.rb .stdin .stdin -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
34
+ }
35
+
36
+ if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
37
+ STDERR.puts help_string
38
+ exit
39
+ end
40
+
41
+ pvalue = 0.0005
42
+ discretization = 10
43
+
44
+ first_background = [1,1,1,1]
45
+ second_background = [1,1,1,1]
46
+
47
+ max_hash_size = 1000000
48
+ max_pair_hash_size = 1000
49
+
50
+
51
+ first_file = argv.shift
52
+ second_file = argv.shift
53
+ raise "You'd specify two input sources (each is filename or .stdin)" unless first_file and second_file
54
+
55
+ until argv.empty?
56
+ case argv.shift
57
+ when '-p'
58
+ pvalue = argv.shift.to_f
59
+ when '-d'
60
+ discretization = argv.shift.to_f
61
+ when '-m'
62
+ max_hash_size = argv.shift.to_i
63
+ when '-md'
64
+ max_pair_hash_size = argv.shift.to_i
65
+ when '-b'
66
+ second_background = first_background = argv.shift(4).map(&:to_f)
67
+ when '-b1'
68
+ first_background = argv.shift(4).map(&:to_f)
69
+ when '-b2'
70
+ second_background = argv.shift(4).map(&:to_f)
71
+ end
72
+ end
73
+ raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background == first_background.reverse
74
+ raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless second_background == second_background.reverse
75
+
76
+ parser = Bioinform::StringParser.new($stdin.read) if first_file == '.stdin' || second_file == '.stdin'
77
+
78
+ if first_file == '.stdin'
79
+ pwm_first = Bioinform::PWM.new( parser.parse )
80
+ else
81
+ raise "Error! File #{first_file} don't exist" unless File.exist?(first_file)
82
+ pwm_first = Bioinform::PWM.new(File.read(first_file))
83
+ end
84
+
85
+ if second_file == '.stdin'
86
+ pwm_second = Bioinform::PWM.new( parser.parse )
87
+ else
88
+ raise "Error! File #{second_file} don't exist" unless File.exist?(second_file)
89
+ pwm_second = Bioinform::PWM.new(File.read(second_file))
90
+ end
91
+
92
+ pwm_first.background!(first_background).max_hash_size!(max_hash_size).discrete!(discretization)
93
+ pwm_second.background!(second_background).max_hash_size!(max_hash_size).discrete!(discretization)
94
+
95
+ cmp = Macroape::PWMCompare.new(pwm_first, pwm_second).max_hash_size(max_pair_hash_size)
96
+
97
+ info = cmp.jaccard_by_pvalue(pvalue)
98
+
99
+ puts "#{info[:similarity]}\n#{info[:recognized_by_both]}\t#{info[:alignment_length]}\n#{info[:text]}\n#{info[:shift]}\t#{info[:orientation]}"
100
+
101
+ rescue => err
102
+ STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
103
+ end
104
+
105
+ end
106
+ end
107
+ end
data/lib/macroape/cli/find_pvalue.rb ADDED
@@ -0,0 +1,89 @@
1
+ require 'macroape'
2
+
3
+ module Macroape
4
+ module CLI
5
+ module FindPValue
6
+
7
+ def self.main(argv)
8
+ help_string = %q{
9
+ Command-line format:
10
+ ruby find_pvalue.rb <pat-file> <threshold list> [options]
11
+ or in linux
12
+ cat <pat-file> | ruby find_pvalue.rb .stdin <threshold> [options]
13
+ or on windows
14
+ type <pat-file> | ruby find_pvalue.rb .stdin <threshold> [options]
15
+
16
+ Options:
17
+ [-d <discretization level>]
18
+ [-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]
19
+
20
+ Output format:
21
+ threshold_1 count_1 pvalue_1
22
+ threshold_2 count_2 pvalue_2
23
+ threshold_3 count_3 pvalue_3
24
+ The results are printed out in the same order as in the given threshold list.
25
+
26
+ Examples:
27
+ ruby find_pvalue.rb motifs/KLF4.pat 7.32 -d 1000 -b 0.2 0.3 0.2 0.3
28
+ or on windows
29
+ type motifs/KLF4.pat | ruby find_pvalue.rb .stdin 7.32 4.31 5.42
30
+ or in linux
31
+ cat motifs/KLF4.pat | ruby find_pvalue.rb .stdin 7.32 4.31 5.42
32
+ }
33
+
34
+ if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
35
+ STDERR.puts help_string
36
+ exit
37
+ end
38
+
39
+ discretization = 10000
40
+ background = [1,1,1,1]
41
+ thresholds = []
42
+ max_hash_size = 1000000
43
+
44
+ filename = argv.shift
45
+
46
+ loop do
47
+ begin
48
+ Float(argv.first)
49
+ thresholds << argv.shift.to_f
50
+ rescue
51
+ raise StopIteration
52
+ end
53
+ end
54
+
55
+ raise "No input. You'd specify input source: filename or .stdin" unless filename
56
+ raise 'You should specify at least one threshold' if thresholds.empty?
57
+
58
+ until argv.empty?
59
+ case argv.shift
60
+ when '-b'
61
+ background = argv.shift(4).map(&:to_f)
62
+ when '-d'
63
+ discretization = argv.shift.to_f
64
+ when '-m'
65
+ max_hash_size = argv.shift.to_i
66
+ end
67
+ end
68
+
69
+
70
+ if filename == '.stdin'
71
+ pwm = Bioinform::PWM.new( $stdin.read )
72
+ else
73
+ raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
74
+ pwm = Bioinform::PWM.new( File.read(filename) )
75
+ end
76
+ pwm.background!(background).max_hash_size!(max_hash_size).discrete!(discretization)
77
+
78
+ counts = pwm.counts_by_thresholds(* thresholds.map{|count| count * discretization})
79
+ pvalues = counts.map{|count| count.to_f / pwm.vocabulary_volume}
80
+ pvalues.zip(thresholds,counts).each{|pvalue,threshold,count|
81
+ puts "#{threshold}\t#{count}\t#{pvalue}"
82
+ }
83
+ rescue => err
84
+ STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
85
+ end
86
+
87
+ end
88
+ end
89
+ end
data/lib/macroape/cli/find_threshold.rb ADDED
@@ -0,0 +1,84 @@
1
+ require 'macroape'
2
+
3
+ module Macroape
4
+ module CLI
5
+ module FindThreshold
6
+
7
+ def self.main(argv)
8
+ help_string = %q{
9
+ Command-line format::
10
+ ruby find_threshold.rb <pat-file> [options]
11
+ or in linux
12
+ cat <pat-file> | ruby find_threshold.rb .stdin [options]
13
+ or on windows
14
+ type <pat-file> | ruby find_threshold.rb .stdin [options]
15
+
16
+ Options:
17
+ [-p <list of P-values>]
18
+ [-d <discretization level>]
19
+ [-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]
20
+
21
+ Output format:
22
+ requested_pvalue_1 threshold_1 achieved_pvalue_1
23
+ requested_pvalue_2 threshold_2 achieved_pvalue_2
24
+
25
+
26
+ Example:
27
+ ruby find_threshold.rb motifs/KLF4.pat -p 0.001 0.0001 0.0005 -d 1000 -b 0.4 0.3 0.2 0.1
28
+ }
29
+
30
+ if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
31
+ STDERR.puts help_string
32
+ exit
33
+ end
34
+
35
+ background = [1,1,1,1]
36
+ default_pvalues = [0.0005]
37
+ discretization = 10000
38
+ max_hash_size = 1000000
39
+
40
+ filename = argv.shift
41
+ raise "No input. You'd specify input source: filename or .stdin" unless filename
42
+
43
+ pvalues = []
44
+ until argv.empty?
45
+ case argv.shift
46
+ when '-b'
47
+ background = argv.shift(4).map(&:to_f)
48
+ when '-m'
49
+ max_hash_size = argv.shift.to_i
50
+ when '-p'
51
+ loop do
52
+ begin
53
+ Float(argv.first)
54
+ pvalues << argv.shift.to_f
55
+ rescue
56
+ raise StopIteration
57
+ end
58
+ end
59
+ when '-d'
60
+ discretization = argv.shift.to_f
61
+ end
62
+ end
63
+ pvalues = default_pvalues if pvalues.empty?
64
+
65
+ if filename == '.stdin'
66
+ pwm = Bioinform::PWM.new( $stdin.read )
67
+ else
68
+ raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
69
+ pwm = Bioinform::PWM.new( File.read(filename) )
70
+ end
71
+
72
+ pwm.background!(background).max_hash_size!(max_hash_size).discrete!(discretization)
73
+
74
+ pwm.thresholds(*pvalues) do |pvalue, threshold, real_pvalue|
75
+ puts "#{pvalue}\t#{threshold / discretization}\t#{real_pvalue}"
76
+ end
77
+
78
+ rescue => err
79
+ STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
80
+ end
81
+
82
+ end
83
+ end
84
+ end