macroape 3.3.6 → 3.3.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. data/.gitignore +18 -18
  2. data/Gemfile +4 -4
  3. data/LICENSE +21 -21
  4. data/README.md +69 -67
  5. data/TODO.txt +1 -1
  6. data/benchmark/benchmark_helper.rb +5 -0
  7. data/benchmark/similarity_benchmark.rb +1 -4
  8. data/bin/align_motifs +1 -1
  9. data/bin/eval_alignment +1 -1
  10. data/bin/eval_similarity +1 -1
  11. data/bin/find_pvalue +1 -1
  12. data/bin/find_threshold +1 -1
  13. data/bin/preprocess_collection +1 -1
  14. data/bin/scan_collection +1 -1
  15. data/lib/macroape/aligned_pair_intersection.rb +1 -1
  16. data/lib/macroape/cli/align_motifs.rb +4 -4
  17. data/lib/macroape/cli/eval_alignment.rb +9 -6
  18. data/lib/macroape/cli/eval_similarity.rb +9 -6
  19. data/lib/macroape/cli/find_pvalue.rb +3 -3
  20. data/lib/macroape/cli/find_threshold.rb +3 -3
  21. data/lib/macroape/cli/preprocess_collection.rb +37 -13
  22. data/lib/macroape/cli/scan_collection.rb +28 -21
  23. data/lib/macroape/counting.rb +7 -13
  24. data/lib/macroape/pwm_compare.rb +8 -15
  25. data/lib/macroape/pwm_compare_aligned.rb +11 -38
  26. data/lib/macroape/version.rb +1 -1
  27. data/lib/macroape.rb +6 -7
  28. data/macroape.gemspec +1 -1
  29. data/spec/count_distribution_spec.rb +2 -3
  30. data/spec/spec_helper.rb +2 -2
  31. data/test/align_motifs_test.rb +1 -1
  32. data/test/data/KLF4_f2_scan_results_all.txt +4 -4
  33. data/test/data/KLF4_f2_scan_results_precise_mode.txt +4 -4
  34. data/test/data/collection_pcm_without_thresholds.yaml +185 -0
  35. data/test/data/collection_without_thresholds.yaml +185 -0
  36. data/test/data/medium_motif.pat +8 -0
  37. data/test/data/short_motif.pat +7 -0
  38. data/test/data/test_collection.yaml +27 -18
  39. data/test/eval_alignment_test.rb +1 -1
  40. data/test/eval_similarity_test.rb +1 -1
  41. data/test/find_pvalue_test.rb +1 -1
  42. data/test/find_threshold_test.rb +1 -1
  43. data/test/preprocess_collection_test.rb +59 -37
  44. data/test/scan_collection_test.rb +10 -1
  45. data/test/test_helper.rb +13 -9
  46. metadata +15 -6
data/.gitignore CHANGED
@@ -1,18 +1,18 @@
1
- *.gem
2
- *.rbc
3
- .bundle
4
- .config
5
- .yardoc
6
- Gemfile.lock
7
- InstalledFiles
8
- _yardoc
9
- coverage
10
- doc/
11
- lib/bundler/man
12
- pkg
13
- rdoc
14
- spec/reports
15
- test/tmp
16
- test/version_tmp
17
- tmp
18
- benchmark/
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ benchmark/*.log
data/Gemfile CHANGED
@@ -1,4 +1,4 @@
1
- source 'https://rubygems.org'
2
-
3
- # Specify your gem's dependencies in macroape.gemspec
4
- gemspec
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in macroape.gemspec
4
+ gemspec
data/LICENSE CHANGED
@@ -1,22 +1,22 @@
1
- Copyright (c) 2011-2012 Ilya Vorontsov, Ivan Kulakovskiy, Vsevolod Makeev
2
-
3
- MIT License
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining
6
- a copy of this software and associated documentation files (the
7
- "Software"), to deal in the Software without restriction, including
8
- without limitation the rights to use, copy, modify, merge, publish,
9
- distribute, sublicense, and/or sell copies of the Software, and to
10
- permit persons to whom the Software is furnished to do so, subject to
11
- the following conditions:
12
-
13
- The above copyright notice and this permission notice shall be
14
- included in all copies or substantial portions of the Software.
15
-
16
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
1
+ Copyright (c) 2011-2012 Ilya Vorontsov, Ivan Kulakovskiy, Vsevolod Makeev
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
22
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md CHANGED
@@ -1,68 +1,70 @@
1
- # Macroape
2
-
3
- Macroape is abbreviation for MAtrix CompaRisOn by Approximate P-value Estimation. It's a bioinformatic tool for evaluating similarity measure between a pair of Position Weight Matrices. Used approach and application described in manual at https://docs.google.com/document/pub?id=1_jsxhMNzMzy4d2d_byAd3n6Szg5gEcqG_Sf7w9tEqWw
4
-
5
- ## Installation
6
-
7
- Add this line to your application's Gemfile:
8
-
9
- gem 'macroape'
10
-
11
- And then execute:
12
-
13
- $ bundle
14
-
15
- Or install it yourself as:
16
-
17
- $ gem install macroape
18
-
19
- ## Usage
20
- For more information read manual at https://docs.google.com/document/pub?id=1_jsxhMNzMzy4d2d_byAd3n6Szg5gEcqG_Sf7w9tEqWw (not last version but comprehensive description of approach)
21
-
22
- ## Basic usage as a command-line tool
23
- MacroAPE have 7 command line tools:
24
-
25
- ### Tools for calculating thresholds and pvalues:
26
- * find_threshold \<PWM file\> [-p \<pvalue\> (by default: 0.0005)]
27
- * find_pvalue \<PWM file\> \<threshold\>
28
-
29
- ### Tools for evaluating Jaccard similarity measure in the best alignment and in certain alignment:
30
- * eval_similarity \<first PWM file\> \<second PWM file\>
31
- * eval_alignment \<first PWM file\> \<second PWM file\> \<shift of second matrix\> \<orientation of second matrix(direct|revcomp)\>
32
-
33
- ### Tools for looking through collection for the motifs most similar to a query motif
34
- * preprocess_collection \<folder with motif files\> [-o \<collection output file\>]
35
- * scan_collection \<query PWM file\> \<collection file\>
36
-
37
- ### Tool for finding mutual alignment of several motifs relative to first(leader) motif. It's designed to use with sequence_logo to draw logos of clusters
38
- * align_motifs \<pwm_leader\> \<pwm_2\> \<pwm_3\> ...
39
-
40
- Also you can use -h option to print help for a tool in console.
41
- There are lots of different command line options. Most useful option is -d <discretization=1|10|100|1000>. You can vary precision/speed rate by specifing a discretization. For more information look through a manual.
42
- Some of tools also can process PCMs in addition to PWMs.
43
-
44
- ## Basic usage in your code
45
- require 'macroape'
46
- background = [1,1,1,1]
47
- discretization = 10
48
- pwm_first = Bioinform::PWM.new(File.read('first_pwm.pat')).background(background).discrete(discretization)
49
- pwm_second = Bioinform::PWM.new(File.read('first_pwm.pat')).background(background).discrete(discretization)
50
-
51
- cmp = Macroape::PWMCompare.new(pwm_first, pwm_second)
52
- first_threshold = pwm_first.threshold(pvalue)
53
- second_threshold = pwm_second.threshold(pvalue)
54
-
55
- similarity_info = cmp.jaccard(first_threshold, second_threshold)
56
- puts "Jaccard similarity: #{similarity_info[:similarity]}"
57
-
58
- For more details look a source code of utilities in lib/macroape/cli/ folder
59
-
60
- ## Contributing
61
-
62
- 1. Fork it
63
- 2. Create your feature branch (`git checkout -b my-new-feature`)
64
- 3. Commit your changes (`git commit -am 'Added some feature'`)
65
- 4. Push to the branch (`git push origin my-new-feature`)
66
- 5. Create new Pull Request
67
-
1
+ # Macroape
2
+
3
+ Macroape is an abbreviation for MAtrix CompaRisOn by Approximate P-value Estimation. It's a bioinformatics tool for evaluating a similarity measure between a pair of Position Weight Matrices. The approach and its application are described in the manual at https://docs.google.com/document/pub?id=1_jsxhMNzMzy4d2d_byAd3n6Szg5gEcqG_Sf7w9tEqWw
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'macroape'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install macroape
18
+
19
+ ## Usage
20
+ For more information, read the manual at https://docs.google.com/document/pub?id=1_jsxhMNzMzy4d2d_byAd3n6Szg5gEcqG_Sf7w9tEqWw (not the latest version, but a comprehensive description of the approach)
21
+
22
+ ## Basic usage as a command-line tool
23
+ MacroAPE has 7 command-line tools:
24
+
25
+ ### Tools for calculating thresholds and pvalues:
26
+ * find_threshold \<PWM file\> [-p \<pvalue\> (by default: 0.0005)]
27
+ * find_pvalue \<PWM file\> \<threshold\>
28
+
29
+ ### Tools for evaluating Jaccard similarity measure in the best alignment and in certain alignment:
30
+ * eval_similarity \<first PWM file\> \<second PWM file\>
31
+ * eval_alignment \<first PWM file\> \<second PWM file\> \<shift of second matrix\> \<orientation of second matrix(direct|revcomp)\>
32
+
33
+ ### Tools for looking through collection for the motifs most similar to a query motif
34
+ * preprocess_collection \<folder with motif files\> [-o \<collection output file\>]
35
+ * scan_collection \<query PWM file\> \<collection file\>
36
+
37
+ ### Tool for finding mutual alignment of several motifs relative to first(leader) motif. It's designed to use with sequence_logo to draw logos of clusters
38
+ * align_motifs \<pwm_leader\> \<pwm_2\> \<pwm_3\> ...
39
+
40
+ Also you can use -h option to print help for a tool in console.
41
+ There are lots of different command-line options. The most useful option is -d <discretization=1|10|100|1000>. You can trade precision for speed by specifying a discretization. For more information, look through the manual.
42
+ Some of the tools can also process PCMs in addition to PWMs.
43
+
44
+ ## Basic usage in your code
45
+ require 'macroape'
46
+ background = [1,1,1,1]
47
+ discretization = 10
48
+ pwm_first = Bioinform::PWM.new(File.read('first_pwm.pat')).background(background).discrete(discretization)
49
+ pwm_second = Bioinform::PWM.new(File.read('second_pwm.pat')).background(background).discrete(discretization)
50
+
51
+ cmp = Macroape::PWMCompare.new(pwm_first, pwm_second)
52
+ first_threshold = pwm_first.threshold(pvalue)
53
+ second_threshold = pwm_second.threshold(pvalue)
54
+
55
+ similarity_info = cmp.jaccard(first_threshold, second_threshold)
56
+ puts "Jaccard similarity: #{similarity_info[:similarity]}"
57
+
58
+ For more details, see the source code of the utilities in the lib/macroape/cli/ folder
59
+
60
+ ## Contributing
61
+
62
+ 1. Fork it
63
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
64
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
65
+ 4. Push to the branch (`git push origin my-new-feature`)
66
+ 5. Create new Pull Request
67
+
68
+ If you're developing both macroape and bioinform, it may be useful to know that test_helper and spec_helper extend the require path so that, if you have two sibling ("cousin") folders named macroape and bioinform, the macroape specs will require bioinform from the development folder rather than from the installed gem. This saves you from rebuilding and reinstalling the bioinform gem each time it changes.
69
+
68
70
  Copyright (c) 2011-2012 Ilya Vorontsov, Ivan Kulakovskiy, Vsevolod Makeev
data/TODO.txt CHANGED
@@ -10,7 +10,7 @@ Ideas to increase perfomance:
10
10
  - Possibly algorithm shouldn't use hash but had two iterations: at first it determines possible hash scores for every length(if worst suffix is always zero, its flat space of scores at all pwm prefix lengths) of each pwm separately. And after that we can work with arrays which use such scores as indices via additional substructure
11
11
 
12
12
  Usability issues:
13
- review Collection class. Now its completely unuseful. May be it should be even in another gem (with blackjack and clustering)
13
+ make preprocess_collection able to add information to an existing collection of motifs. Make it possible to give a collection a name from the command line
14
14
 
15
15
  remove .stdin placeholder. Use tty? method instead
16
16
 
@@ -0,0 +1,5 @@
1
+ $bioinform_folder = File.expand_path(File.join(File.dirname(__FILE__), '..', '..', 'bioinform', 'lib'))
2
+ $LOAD_PATH.unshift $bioinform_folder
3
+
4
+ require 'benchmark'
5
+ require_relative '../lib/macroape'
@@ -1,7 +1,4 @@
1
- require 'benchmark'
2
-
3
- $:.unshift File.join(File.dirname(__FILE__),'../lib')
4
- require 'macroape'
1
+ require_relative 'benchmark_helper'
5
2
 
6
3
  class TaskToBenchmark
7
4
  def setup
data/bin/align_motifs CHANGED
@@ -1,4 +1,4 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require 'macroape/cli/align_motifs'
3
+ require_relative '../lib/macroape/cli/align_motifs'
4
4
  Macroape::CLI::AlignMotifs.main(ARGV)
data/bin/eval_alignment CHANGED
@@ -1,4 +1,4 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require 'macroape/cli/eval_alignment'
3
+ require_relative '../lib/macroape/cli/eval_alignment'
4
4
  Macroape::CLI::EvalAlignment.main(ARGV)
data/bin/eval_similarity CHANGED
@@ -1,4 +1,4 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require 'macroape/cli/eval_similarity'
3
+ require_relative '../lib/macroape/cli/eval_similarity'
4
4
  Macroape::CLI::EvalSimilarity.main(ARGV)
data/bin/find_pvalue CHANGED
@@ -1,4 +1,4 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require 'macroape/cli/find_pvalue'
3
+ require_relative '../lib/macroape/cli/find_pvalue'
4
4
  Macroape::CLI::FindPValue.main(ARGV)
data/bin/find_threshold CHANGED
@@ -1,4 +1,4 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require 'macroape/cli/find_threshold'
3
+ require_relative '../lib/macroape/cli/find_threshold'
4
4
  Macroape::CLI::FindThreshold.main(ARGV)
@@ -1,4 +1,4 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require 'macroape/cli/preprocess_collection'
3
+ require_relative '../lib/macroape/cli/preprocess_collection'
4
4
  Macroape::CLI::PreprocessCollection.main(ARGV)
data/bin/scan_collection CHANGED
@@ -1,4 +1,4 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require 'macroape/cli/scan_collection'
3
+ require_relative '../lib/macroape/cli/scan_collection'
4
4
  Macroape::CLI::ScanCollection.main(ARGV)
@@ -30,7 +30,7 @@ module Macroape
30
30
  threshold_first - first.best_suffix(column + 1),
31
31
  threshold_second - second.best_suffix(column + 1), &count_contribution_block)
32
32
  scores.replace(new_scores)
33
- if max_hash_size && scores.inject(0){|sum,hsh|sum + hsh.size} > max_hash_size
33
+ if max_pair_hash_size && scores.inject(0){|sum,hsh|sum + hsh.size} > max_pair_hash_size
34
34
  raise 'Hash overflow in Macroape::AlignedPairIntersection#counts_for_two_matrices_with_different_probabilities'
35
35
  end
36
36
  end
@@ -1,4 +1,4 @@
1
- require 'macroape'
1
+ require_relative '../../macroape'
2
2
 
3
3
  module Macroape
4
4
  module CLI
@@ -15,7 +15,7 @@ module Macroape
15
15
  pwm_3_file shift_3 orientation_3
16
16
  }
17
17
 
18
- if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
18
+ if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
19
19
  STDERR.puts help_string
20
20
  exit
21
21
  end
@@ -28,9 +28,9 @@ module Macroape
28
28
  pvalue = 0.0005
29
29
 
30
30
  shifts = {leader => [0,:direct]}
31
- pwm_first = data_model.new(File.read(leader)).to_pwm.background!(background).discrete!(discretization)
31
+ pwm_first = data_model.new(File.read(leader)).to_pwm.set_parameters(background: background).discrete!(discretization)
32
32
  argv.each do |motif_name|
33
- pwm_second = data_model.new(File.read(motif_name)).to_pwm.background!(background).discrete!(discretization)
33
+ pwm_second = data_model.new(File.read(motif_name)).to_pwm.set_parameters(background: background).discrete!(discretization)
34
34
  cmp = Macroape::PWMCompare.new(pwm_first, pwm_second)
35
35
  info = cmp.jaccard_by_pvalue(pvalue)
36
36
  shifts[motif_name] = [info[:shift], info[:orientation]]
@@ -1,4 +1,4 @@
1
- require 'macroape'
1
+ require_relative '../../macroape'
2
2
 
3
3
  module Macroape
4
4
  module CLI
@@ -32,7 +32,7 @@ module Macroape
32
32
  cat motifs/KLF4.pat motifs/SP1.pat | ruby eval_alignment.rb .stdin .stdin 3 direct -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
33
33
  }
34
34
 
35
- if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
35
+ if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
36
36
  STDERR.puts help_string
37
37
  exit
38
38
  end
@@ -91,7 +91,10 @@ module Macroape
91
91
  raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background == first_background.reverse
92
92
  raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless second_background == second_background.reverse
93
93
 
94
- parser = Bioinform::StringParser.new($stdin.read) if first_file == '.stdin' || second_file == '.stdin'
94
+ if first_file == '.stdin' || second_file == '.stdin'
95
+ input = $stdin.read
96
+ parser = data_model.choose_parser(input).new(input)
97
+ end
95
98
 
96
99
  if first_file == '.stdin'
97
100
  input_first = parser.parse
@@ -109,10 +112,10 @@ module Macroape
109
112
  end
110
113
  pwm_second = data_model.new(input_second).to_pwm
111
114
 
112
- pwm_first.background!(first_background).max_hash_size!(max_hash_size).discrete!(discretization)
113
- pwm_second.background!(second_background).max_hash_size!(max_hash_size).discrete!(discretization)
115
+ pwm_first.set_parameters(background: first_background, max_hash_size: max_hash_size).discrete!(discretization)
116
+ pwm_second.set_parameters(background: second_background, max_hash_size: max_hash_size).discrete!(discretization)
114
117
 
115
- cmp = Macroape::PWMCompareAligned.new(pwm_first, pwm_second, shift, orientation).max_hash_size(max_pair_hash_size)
118
+ cmp = Macroape::PWMCompareAligned.new(pwm_first, pwm_second, shift, orientation).set_parameters(max_pair_hash_size: max_pair_hash_size)
116
119
 
117
120
  info = cmp.alignment_infos.merge( cmp.jaccard_by_pvalue(pvalue) )
118
121
 
@@ -1,4 +1,4 @@
1
- require 'macroape'
1
+ require_relative '../../macroape'
2
2
 
3
3
  module Macroape
4
4
  module CLI
@@ -33,7 +33,7 @@ module Macroape
33
33
  cat motifs/KLF4.pat motifs/SP1.pat | ruby eval_similarity.rb .stdin .stdin -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
34
34
  }
35
35
 
36
- if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
36
+ if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
37
37
  STDERR.puts help_string
38
38
  exit
39
39
  end
@@ -73,7 +73,10 @@ module Macroape
73
73
  raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background == first_background.reverse
74
74
  raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless second_background == second_background.reverse
75
75
 
76
- parser = Bioinform::StringParser.new($stdin.read) if first_file == '.stdin' || second_file == '.stdin'
76
+ if first_file == '.stdin' || second_file == '.stdin'
77
+ input = $stdin.read
78
+ parser = data_model.choose_parser(input).new(input)
79
+ end
77
80
 
78
81
  if first_file == '.stdin'
79
82
  input_first = parser.parse
@@ -91,10 +94,10 @@ module Macroape
91
94
  end
92
95
  pwm_second = data_model.new(input_second).to_pwm
93
96
 
94
- pwm_first.background!(first_background).max_hash_size!(max_hash_size).discrete!(discretization)
95
- pwm_second.background!(second_background).max_hash_size!(max_hash_size).discrete!(discretization)
97
+ pwm_first.set_parameters(background: first_background, max_hash_size: max_hash_size).discrete!(discretization)
98
+ pwm_second.set_parameters(background: second_background, max_hash_size: max_hash_size).discrete!(discretization)
96
99
 
97
- cmp = Macroape::PWMCompare.new(pwm_first, pwm_second).max_hash_size(max_pair_hash_size)
100
+ cmp = Macroape::PWMCompare.new(pwm_first, pwm_second).set_parameters(max_pair_hash_size: max_pair_hash_size)
98
101
 
99
102
  info = cmp.jaccard_by_pvalue(pvalue)
100
103
 
@@ -1,4 +1,4 @@
1
- require 'macroape'
1
+ require_relative '../../macroape'
2
2
 
3
3
  module Macroape
4
4
  module CLI
@@ -31,7 +31,7 @@ module Macroape
31
31
  cat motifs/KLF4.pat | ruby find_pvalue.rb .stdin 7.32 4.31 5.42
32
32
  }
33
33
 
34
- if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
34
+ if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
35
35
  STDERR.puts help_string
36
36
  exit
37
37
  end
@@ -75,7 +75,7 @@ module Macroape
75
75
  input = File.read(filename)
76
76
  end
77
77
  pwm = data_model.new(input).to_pwm
78
- pwm.background!(background).max_hash_size!(max_hash_size).discrete!(discretization)
78
+ pwm.set_parameters(background: background, max_hash_size: max_hash_size).discrete!(discretization)
79
79
 
80
80
  counts = pwm.counts_by_thresholds(* thresholds.map{|count| count * discretization})
81
81
  pvalues = counts.map{|count| count.to_f / pwm.vocabulary_volume}
@@ -1,4 +1,4 @@
1
- require 'macroape'
1
+ require_relative '../../macroape'
2
2
 
3
3
  module Macroape
4
4
  module CLI
@@ -27,7 +27,7 @@ module Macroape
27
27
  ruby find_threshold.rb motifs/KLF4.pat -p 0.001 0.0001 0.0005 -d 1000 -b 0.4 0.3 0.2 0.1
28
28
  }
29
29
 
30
- if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
30
+ if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
31
31
  STDERR.puts help_string
32
32
  exit
33
33
  end
@@ -70,7 +70,7 @@ module Macroape
70
70
  input = File.read(filename)
71
71
  end
72
72
  pwm = data_model.new(input).to_pwm
73
- pwm.background!(background).max_hash_size!(max_hash_size).discrete!(discretization)
73
+ pwm.set_parameters(background: background, max_hash_size: max_hash_size).discrete!(discretization)
74
74
 
75
75
  pwm.thresholds(*pvalues) do |pvalue, threshold, real_pvalue|
76
76
  puts "#{pvalue}\t#{threshold / discretization}\t#{real_pvalue}"
@@ -1,5 +1,6 @@
1
- require 'macroape'
1
+ require_relative '../../macroape'
2
2
  require 'yaml'
3
+ require 'shellwords'
3
4
 
4
5
  module Macroape
5
6
  module CLI
@@ -8,13 +9,14 @@ module Macroape
8
9
  def self.main(argv)
9
10
  help_string = %q{
10
11
  Command-line format:
11
- ruby preprocess_collection.rb <file or folder with PWMs or .stdin with PWMs> [options]
12
+ ruby preprocess_collection.rb <file or folder with PWMs or .stdin with filenames> [options]
12
13
 
13
14
  Options:
14
15
  [-p <list of P-values>]
15
16
  [-d <rough discretization> <precise discretization>]
16
17
  [-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]
17
18
  [-o <output file>]
19
+ [-n <name>] - specify name for a collection. Default filename is based on this parameter
18
20
  [--silent] - don't show current progress information during scan (by default this information's written into stderr)
19
21
  [--pcm] - treats your input motifs as PCM-s. Motifs are converted to PWMs internally so output is the same as for according PWMs
20
22
 
@@ -24,7 +26,7 @@ module Macroape
24
26
  ruby preprocess_collection.rb ./motifs -p 0.001 0.0005 0.0001 -d 1 10 -b 0.2 0.3 0.2 0.3 -o collection.yaml
25
27
  }
26
28
 
27
- if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
29
+ if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
28
30
  STDERR.puts help_string
29
31
  exit
30
32
  end
@@ -35,7 +37,7 @@ module Macroape
35
37
  background = [1,1,1,1]
36
38
  rough_discretization = 1
37
39
  precise_discretization = 10
38
- output_file = 'collection.yaml'
40
+ output_file = 'collection.yaml'
39
41
  max_hash_size = 1000000
40
42
 
41
43
  data_source = argv.shift
@@ -45,6 +47,7 @@ module Macroape
45
47
 
46
48
  pvalues = []
47
49
  silent = false
50
+ output_file_specified = false
48
51
  until argv.empty?
49
52
  case argv.shift
50
53
  when '-b'
@@ -63,8 +66,11 @@ module Macroape
63
66
  rough_discretization, precise_discretization = argv.shift(2).map(&:to_f).sort
64
67
  when '-o'
65
68
  output_file = argv.shift
69
+ output_file_specified = true
66
70
  when '-m'
67
71
  max_hash_size = argv.shift.to_i
72
+ when '-n'
73
+ collection_name = argv.shift
68
74
  when '--silent'
69
75
  silent = true
70
76
  end
@@ -75,6 +81,10 @@ module Macroape
75
81
  precise_discretization: precise_discretization,
76
82
  background: background,
77
83
  pvalues: pvalues)
84
+ if collection_name
85
+ collection.name = collection_name
86
+ output_file = "#{collection_name}.yaml" if !output_file_specified
87
+ end
78
88
 
79
89
  if File.directory?(data_source)
80
90
  motifs = Dir.glob(File.join(data_source,'*')).sort.map do |filename|
@@ -84,10 +94,15 @@ module Macroape
84
94
  end
85
95
  elsif File.file?(data_source)
86
96
  input = File.read(data_source)
87
- motifs = data_model.choose_parser(input).split_on_motifs(input, data_model)
97
+ motifs = data_model.split_on_motifs(input)
88
98
  elsif data_source == '.stdin'
89
- input = $stdin.read
90
- motifs = data_model.choose_parser(input).split_on_motifs(input, data_model)
99
+ filelist = $stdin.read.shellsplit
100
+ motifs = []
101
+ filelist.each do |filename|
102
+ motif = data_model.new(File.read(filename))
103
+ motif.name ||= File.basename(filename, File.extname(filename))
104
+ motifs << motif
105
+ end
91
106
  else
92
107
  raise "Specified data source `#{data_source}` is neither directory nor file nor even .stdin"
93
108
  end
@@ -102,19 +117,28 @@ module Macroape
102
117
  # Also two command line options to fail on skipping or to skip silently should be included
103
118
 
104
119
  info = OpenStruct.new(rough: {}, precise: {})
105
- pwm.background!(background).max_hash_size!(max_hash_size)
120
+ pwm.set_parameters(background: background, max_hash_size: max_hash_size)
121
+ skip_motif = false
106
122
 
107
123
  pwm.discrete(rough_discretization).thresholds(*pvalues) do |pvalue, threshold, real_pvalue|
108
- info.rough[pvalue] = threshold / rough_discretization
124
+ if real_pvalue == 0
125
+ $stderr.puts "#{pwm.name} at pvalue #{pvalue} has threshold that yields real-pvalue 0 in rough mode. Rough calculation will be skipped"
126
+ else
127
+ info.rough[pvalue] = threshold / rough_discretization
128
+ end
109
129
  end
110
130
 
111
131
  pwm.discrete(precise_discretization).thresholds(*pvalues) do |pvalue, threshold, real_pvalue|
112
- info.precise[pvalue] = threshold / precise_discretization
132
+ if real_pvalue == 0
133
+ $stderr.puts "#{pwm.name} at pvalue #{pvalue} has threshold that yields real-pvalue 0 in precise mode. Motif will be excluded from collection"
134
+ skip_motif = true
135
+ else
136
+ info.precise[pvalue] = threshold / precise_discretization
137
+ end
113
138
  end
114
-
115
- collection.add_pm(pwm, info)
139
+ collection.add_pm(pwm, info) unless skip_motif
116
140
  end
117
- File.open(output_file,'w') do |f|
141
+ File.open(output_file, 'w') do |f|
118
142
  f.puts(collection.to_yaml)
119
143
  end
120
144
  rescue => err