macroape 4.0.2 → 4.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (74) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +17 -17
  3. data/Gemfile +4 -4
  4. data/LICENSE +22 -22
  5. data/README.md +70 -70
  6. data/Rakefile.rb +49 -49
  7. data/TODO.txt +46 -46
  8. data/benchmark/benchmark_helper.rb +4 -4
  9. data/benchmark/similarity_benchmark.rb +52 -52
  10. data/bin/align_motifs +4 -4
  11. data/bin/eval_alignment +4 -4
  12. data/bin/eval_similarity +4 -4
  13. data/bin/find_pvalue +4 -4
  14. data/bin/find_threshold +4 -4
  15. data/bin/preprocess_collection +4 -4
  16. data/bin/scan_collection +4 -4
  17. data/lib/macroape.rb +14 -11
  18. data/lib/macroape/aligned_pair_intersection.rb +61 -62
  19. data/lib/macroape/cli.rb +191 -188
  20. data/lib/macroape/cli/align_motifs.rb +120 -100
  21. data/lib/macroape/cli/eval_alignment.rb +157 -156
  22. data/lib/macroape/cli/eval_similarity.rb +138 -137
  23. data/lib/macroape/cli/find_pvalue.rb +93 -87
  24. data/lib/macroape/cli/find_threshold.rb +103 -96
  25. data/lib/macroape/cli/preprocess_collection.rb +169 -161
  26. data/lib/macroape/cli/scan_collection.rb +171 -163
  27. data/lib/macroape/collection.rb +29 -0
  28. data/lib/macroape/motif_with_thresholds.rb +18 -0
  29. data/lib/macroape/pwm_compare.rb +39 -44
  30. data/lib/macroape/pwm_compare_aligned.rb +139 -130
  31. data/lib/macroape/{counting.rb → pwm_counting.rb} +175 -121
  32. data/lib/macroape/support/inverf.rb +13 -0
  33. data/lib/macroape/support/partial_sums.rb +17 -0
  34. data/lib/macroape/version.rb +4 -4
  35. data/macroape.gemspec +19 -19
  36. data/spec/count_distribution_spec.rb +112 -109
  37. data/spec/inverf_spec.rb +23 -0
  38. data/spec/partial_sums_spec.rb +28 -0
  39. data/spec/spec_helper.rb +11 -11
  40. data/test/align_motifs_test.rb +42 -43
  41. data/test/data/AHR_si.pwm +10 -10
  42. data/test/data/KLF3_f1.pcm +16 -16
  43. data/test/data/KLF3_f1.pwm +16 -16
  44. data/test/data/KLF4_f2.pcm +11 -11
  45. data/test/data/KLF4_f2.pwm +11 -11
  46. data/test/data/KLF4_f2_scan_results_all.txt +2 -2
  47. data/test/data/KLF4_f2_scan_results_default_cutoff.txt +1 -1
  48. data/test/data/KLF4_f2_scan_results_precise_mode.txt +2 -2
  49. data/test/data/SP1_f1.pcm +12 -12
  50. data/test/data/SP1_f1.pwm +12 -12
  51. data/test/data/SP1_f1_revcomp.pcm +12 -12
  52. data/test/data/SP1_f1_revcomp.pwm +12 -12
  53. data/test/data/medium_motif.pwm +8 -8
  54. data/test/data/short_motif.pwm +7 -7
  55. data/test/data/test_collection.yaml +231 -214
  56. data/test/data/test_collection/GABPA_f1.pwm +14 -14
  57. data/test/data/test_collection/KLF4_f2.pwm +10 -10
  58. data/test/data/test_collection/SP1_f1.pwm +12 -12
  59. data/test/data/test_collection_pcm/GABPA_f1.pcm +14 -14
  60. data/test/data/test_collection_pcm/KLF4_f2.pcm +11 -11
  61. data/test/data/test_collection_pcm/SP1_f1.pcm +12 -12
  62. data/test/data/test_collection_single_file.txt +38 -38
  63. data/test/data/test_collection_single_file_pcm.txt +37 -37
  64. data/test/data/test_collection_weak.yaml +231 -214
  65. data/test/eval_alignment_test.rb +90 -111
  66. data/test/eval_similarity_test.rb +105 -123
  67. data/test/find_pvalue_test.rb +34 -39
  68. data/test/find_threshold_test.rb +87 -91
  69. data/test/preprocess_collection_test.rb +56 -65
  70. data/test/scan_collection_test.rb +42 -48
  71. data/test/test_helper.rb +159 -160
  72. metadata +14 -10
  73. data/test/data/collection_pcm_without_thresholds.yaml +0 -188
  74. data/test/data/collection_without_thresholds.yaml +0 -188
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: fe425b463f7ed42c64027a15d68ee85a38334058
4
- data.tar.gz: f9789653cf1708f00eb9255a24c4c009c352f3a6
3
+ metadata.gz: c48f7766729e453bb4da35c7fa4c9792cc481f69
4
+ data.tar.gz: da9e81f90fa159d7f64f9fa49ca4d045fc53eef6
5
5
  SHA512:
6
- metadata.gz: a09caa08fe447ef672ad91793151f0d1a2fef245141e86f5f54e467d881e32863a0e76502e7345297d702c9115d1b721e365a1c6e314e5cd5b55920e0f04a585
7
- data.tar.gz: fcdf7a5724d1a8ad1d589a5d1d9ec892d780df3d05234af09e6ee546a486a95f4c833317ec2392fae65c55dd406b30d7e71e0eba9c09833edd86c4d489817679
6
+ metadata.gz: cae11debfcc446886bf04bfb8f8f30a4035becedf7720606a532ec46d4007f3437a574c7294835247030f7f2dd4f374a7396adc5709b4f0845612b6e1e7ae5a2
7
+ data.tar.gz: dc2b3811fa99aa6a6a06cb57fd7ded315db4cdece6c17729093aa03d21f6bbd248ed7e5234085e581a8131693e0589cfa2bc5a90fc5452875dad9fb95d89f298
data/.gitignore CHANGED
@@ -1,18 +1,18 @@
1
- *.gem
2
- *.rbc
3
- .bundle
4
- .config
5
- .yardoc
6
- Gemfile.lock
7
- InstalledFiles
8
- _yardoc
9
- coverage
10
- doc/
11
- lib/bundler/man
12
- pkg
13
- rdoc
14
- spec/reports
15
- test/tmp
16
- test/version_tmp
17
- tmp
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
18
  benchmark/*.log
data/Gemfile CHANGED
@@ -1,4 +1,4 @@
1
- source 'https://rubygems.org'
2
-
3
- # Specify your gem's dependencies in macroape.gemspec
4
- gemspec
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in macroape.gemspec
4
+ gemspec
data/LICENSE CHANGED
@@ -1,22 +1,22 @@
1
- Copyright (c) 2011-2012 Ilya Vorontsov, Ivan Kulakovskiy, Vsevolod Makeev
2
-
3
- MIT License
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining
6
- a copy of this software and associated documentation files (the
7
- "Software"), to deal in the Software without restriction, including
8
- without limitation the rights to use, copy, modify, merge, publish,
9
- distribute, sublicense, and/or sell copies of the Software, and to
10
- permit persons to whom the Software is furnished to do so, subject to
11
- the following conditions:
12
-
13
- The above copyright notice and this permission notice shall be
14
- included in all copies or substantial portions of the Software.
15
-
16
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
1
+ Copyright (c) 2011-2014 Ilya Vorontsov, Ivan Kulakovskiy, Vsevolod Makeev
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md CHANGED
@@ -1,70 +1,70 @@
1
- # Macroape
2
-
3
- Macroape is abbreviation for MAtrix CompaRisOn by Approximate P-value Estimation. It's a bioinformatic tool for evaluating similarity measure between a pair of Position Weight Matrices. Used approach and application described in manual at https://docs.google.com/document/pub?id=1_jsxhMNzMzy4d2d_byAd3n6Szg5gEcqG_Sf7w9tEqWw
4
-
5
- ## Installation
6
-
7
- Add this line to your application's Gemfile:
8
-
9
- gem 'macroape'
10
-
11
- And then execute:
12
-
13
- $ bundle
14
-
15
- Or install it yourself as:
16
-
17
- $ gem install macroape
18
-
19
- ## Usage
20
- For more information read manual at https://docs.google.com/document/pub?id=1_jsxhMNzMzy4d2d_byAd3n6Szg5gEcqG_Sf7w9tEqWw
21
-
22
- ## Basic usage as a command-line tool
23
- MacroAPE have 7 command line tools:
24
-
25
- ### Tools for calculating thresholds and pvalues:
26
- * find_threshold \<PWM file\> [\<pvalue(by default: 0.0005)\>...]
27
- * find_pvalue \<PWM file\> \<threshold\>...
28
-
29
- ### Tools for evaluating Jaccard similarity measure in the best alignment and in certain alignment:
30
- * eval_similarity \<first PWM file\> \<second PWM file\>
31
- * eval_alignment \<first PWM file\> \<second PWM file\> \<shift of second matrix\> \<orientation of second matrix(direct|revcomp)\>
32
-
33
- ### Tools for looking through collection for the motifs most similar to a query motif
34
- * preprocess_collection \<folder with motif files\> \<collection output file\>
35
- * scan_collection \<query PWM file\> \<collection file\>
36
-
37
- ### Tool for finding mutual alignment of several motifs relative to first(leader) motif. It's designed to use with sequence_logo to draw logos of clusters
38
- * align_motifs \<pwm_leader\> \<pwm_2\> \<pwm_3\> ...
39
-
40
- Also you can use -h option to print help for a tool in console.
41
- There are lots of different command line options. Most useful option is -d <discretization=1|10|100|1000>. You can vary precision/speed rate by specifing a discretization. For more information look through a manual.
42
- Some of tools also can process PCMs in addition to PWMs.
43
-
44
- ## Basic usage in your code
45
- require 'macroape'
46
- background = [1,1,1,1]
47
- discretization = 10
48
- pwm_first = Bioinform::PWM.new(File.read('first_pwm.pat')).background(background).discrete(discretization)
49
- pwm_second = Bioinform::PWM.new(File.read('first_pwm.pat')).background(background).discrete(discretization)
50
-
51
- cmp = Macroape::PWMCompare.new(pwm_first, pwm_second)
52
- first_threshold = pwm_first.threshold(pvalue)
53
- second_threshold = pwm_second.threshold(pvalue)
54
-
55
- similarity_info = cmp.jaccard(first_threshold, second_threshold)
56
- puts "Jaccard similarity: #{similarity_info[:similarity]}"
57
-
58
- For more details look a source code of utilities in lib/macroape/cli/ folder
59
-
60
- ## Contributing
61
-
62
- 1. Fork it
63
- 2. Create your feature branch (`git checkout -b my-new-feature`)
64
- 3. Commit your changes (`git commit -am 'Added some feature'`)
65
- 4. Push to the branch (`git push origin my-new-feature`)
66
- 5. Create new Pull Request
67
-
68
- If you're developing both macroape and bioinform - it may be useful to know that test_helper and spec_helper expands require path in such a way that if you have two "cousin" folders: macroape and bioinform then macroape specs will require bioinform from development folder not from gem. It can save you lots of time not to rebuild-reinstall bioinform gem each time it get some changes
69
-
70
- Copyright (c) 2011-2012 Ilya Vorontsov, Ivan Kulakovskiy, Vsevolod Makeev
1
+ # Macroape
2
+
3
+ Macroape is an abbreviation for MAtrix CompaRisOn by Approximate P-value Estimation. It's a bioinformatics tool for evaluating a similarity measure between a pair of Position Weight Matrices. The approach and its applications are described in the manual at https://docs.google.com/document/pub?id=1_jsxhMNzMzy4d2d_byAd3n6Szg5gEcqG_Sf7w9tEqWw
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'macroape'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install macroape
18
+
19
+ ## Usage
20
+ For more information, read the manual at https://docs.google.com/document/pub?id=1_jsxhMNzMzy4d2d_byAd3n6Szg5gEcqG_Sf7w9tEqWw
21
+
22
+ ## Basic usage as a command-line tool
23
+ MacroAPE has 7 command-line tools:
24
+
25
+ ### Tools for calculating thresholds and pvalues:
26
+ * find_threshold \<PWM file\> [\<pvalue(by default: 0.0005)\>...]
27
+ * find_pvalue \<PWM file\> \<threshold\>...
28
+
29
+ ### Tools for evaluating Jaccard similarity measure in the best alignment and in certain alignment:
30
+ * eval_similarity \<first PWM file\> \<second PWM file\>
31
+ * eval_alignment \<first PWM file\> \<second PWM file\> \<shift of second matrix\> \<orientation of second matrix(direct|revcomp)\>
32
+
33
+ ### Tools for looking through collection for the motifs most similar to a query motif
34
+ * preprocess_collection \<folder with motif files\> \<collection output file\>
35
+ * scan_collection \<query PWM file\> \<collection file\>
36
+
37
+ ### Tool for finding mutual alignment of several motifs relative to first(leader) motif. It's designed to use with sequence_logo to draw logos of clusters
38
+ * align_motifs \<pwm_leader\> \<pwm_2\> \<pwm_3\> ...
39
+
40
+ Also you can use -h option to print help for a tool in console.
41
+ There are lots of different command-line options. The most useful option is -d <discretization=1|10|100|1000>. You can trade precision for speed by specifying a discretization. For more information, look through the manual.
42
+ Some of the tools can also process PCMs in addition to PWMs.
43
+
44
+ ## Basic usage in your code
45
+ require 'macroape'
46
+ background = [1,1,1,1]
47
+ discretization = 10
48
+ pwm_first = Bioinform::PWM.new(File.read('first_pwm.pat')).background(background).discrete(discretization)
49
+ pwm_second = Bioinform::PWM.new(File.read('second_pwm.pat')).background(background).discrete(discretization)
50
+
51
+ cmp = Macroape::PWMCompare.new(pwm_first, pwm_second)
52
+ first_threshold = pwm_first.threshold(pvalue)
53
+ second_threshold = pwm_second.threshold(pvalue)
54
+
55
+ similarity_info = cmp.jaccard(first_threshold, second_threshold)
56
+ puts "Jaccard similarity: #{similarity_info[:similarity]}"
57
+
58
+ For more details, look at the source code of the utilities in the lib/macroape/cli/ folder
59
+
60
+ ## Contributing
61
+
62
+ 1. Fork it
63
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
64
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
65
+ 4. Push to the branch (`git push origin my-new-feature`)
66
+ 5. Create new Pull Request
67
+
68
+ If you're developing both macroape and bioinform, it may be useful to know that test_helper and spec_helper expand the require path in such a way that if you have two "cousin" folders, macroape and bioinform, then macroape specs will require bioinform from the development folder, not from the gem. This can save you a lot of time, because you don't have to rebuild and reinstall the bioinform gem each time it changes
69
+
70
+ Copyright (c) 2011-2014 Ilya Vorontsov, Ivan Kulakovskiy, Vsevolod Makeev
@@ -1,50 +1,50 @@
1
- #!/usr/bin/env rake
2
- require "bundler/gem_tasks"
3
- require 'rspec/core/rake_task'
4
- require 'rake/testtask'
5
-
6
- namespace :spec do
7
- Rake::TestTask.new do |t|
8
- t.libs << "test"
9
- t.test_files = FileList['test/*_test.rb']
10
- t.verbose = true
11
- end
12
- RSpec::Core::RakeTask.new
13
- end
14
-
15
- desc 'Test all functionality of gem executables'
16
- task :spec => ['spec:test', 'spec:spec']
17
-
18
- namespace :benchmark do
19
- task :run do
20
- require 'open3'
21
- time = Time.now.strftime("%d-%m-%Y, %H:%M:%S sec")
22
- File.open('benchmark/benchmark.log','a') do |f|
23
- f.puts "=========================================================\n#{time}\n"
24
- Dir.glob('benchmark/*_benchmark.rb') do |benchmark_filename|
25
- Open3.popen3("ruby -I ./benchmark #{benchmark_filename}") do |inp, out, err, wait_thr|
26
- benchmark_name = File.basename(benchmark_filename)
27
- out_str = out.read
28
- err_str = err.read
29
-
30
- benchmark_infos = "-------------------\n#{benchmark_name}:\n#{out_str}\n"
31
- benchmark_infos_to_file = benchmark_infos
32
- puts benchmark_infos
33
-
34
- if err_str && !err_str.empty?
35
- STDERR.puts(err_str)
36
- benchmark_infos_to_file = benchmark_infos + "\n!!!\nError:\n#{err_str}\n"
37
- end
38
-
39
- # add info about git commit (if everything is commited, otherwise to commit one should use special option -c)
40
- f.puts benchmark_infos_to_file
41
- end
42
- end
43
- end
44
- end
45
- task :show do
46
- puts File.read('benchmark/benchmark.log')
47
- end
48
- end
49
-
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+ require 'rspec/core/rake_task'
4
+ require 'rake/testtask'
5
+
6
+ namespace :spec do
7
+ Rake::TestTask.new do |t|
8
+ t.libs << "test"
9
+ t.test_files = FileList['test/*_test.rb']
10
+ t.verbose = true
11
+ end
12
+ RSpec::Core::RakeTask.new
13
+ end
14
+
15
+ desc 'Test all functionality of gem executables'
16
+ task :spec => ['spec:test', 'spec:spec']
17
+
18
+ namespace :benchmark do
19
+ task :run do
20
+ require 'open3'
21
+ time = Time.now.strftime("%d-%m-%Y, %H:%M:%S sec")
22
+ File.open('benchmark/benchmark.log','a') do |f|
23
+ f.puts "=========================================================\n#{time}\n"
24
+ Dir.glob('benchmark/*_benchmark.rb') do |benchmark_filename|
25
+ Open3.popen3("ruby -I ./benchmark #{benchmark_filename}") do |inp, out, err, wait_thr|
26
+ benchmark_name = File.basename(benchmark_filename)
27
+ out_str = out.read
28
+ err_str = err.read
29
+
30
+ benchmark_infos = "-------------------\n#{benchmark_name}:\n#{out_str}\n"
31
+ benchmark_infos_to_file = benchmark_infos
32
+ puts benchmark_infos
33
+
34
+ if err_str && !err_str.empty?
35
+ STDERR.puts(err_str)
36
+ benchmark_infos_to_file = benchmark_infos + "\n!!!\nError:\n#{err_str}\n"
37
+ end
38
+
39
+ # add info about git commit (if everything is commited, otherwise to commit one should use special option -c)
40
+ f.puts benchmark_infos_to_file
41
+ end
42
+ end
43
+ end
44
+ end
45
+ task :show do
46
+ puts File.read('benchmark/benchmark.log')
47
+ end
48
+ end
49
+
50
50
  task :benchmark => 'benchmark:run'
data/TODO.txt CHANGED
@@ -1,47 +1,47 @@
1
- Why Helpers.find_threshold_output('SP1_f1.pwm 0.8 -d 10') on java works only for pvalues less or equal than 0.5, but 0.55 or 0.8 breaks the program (it doesn't stop). My hypothesis is that gauss threshold estimation fails
2
-
3
-
4
- Make --same-strand mode which disallows revcomp transformation of motif - so it'll be possible to compare RNA-PWMs.
5
-
6
- Fix align_motifs so that keys can go after arguments (use OptionParser in other words)
7
-
8
- ToDo:
9
- 6)
10
- # TODO: FIX: this test fails due to floating point precision error: estimated threshold is -19.0418 but '-19.0418'.to_f * 10000 = -190417.99999999997
11
- # A workaround exists: we can use fractions, i.e. ('-19.0418'.to_r * 10000).to_f = -190418.0 but it obscures code and being used uncarefully can involve huge slowdown.
12
- # I think, it'd be used only at input to workaround discretization issue
13
- #
14
- # def test_process_large_pvalue_floating_point_error
15
- # pvalue, threshold, real_pvalue = nil, nil, nil
16
- # assert_nothing_raised {
17
- # pvalue, threshold, real_pvalue = Helpers.find_threshold_output('KLF4_f2.pwm -p 0.8').strip.split("\t")
18
- # }
19
- # assert_equal '0.8', pvalue
20
- # assert_equal Helpers.obtain_pvalue_by_threshold("KLF4_f2.pwm #{threshold}"), real_pvalue
21
- # end
22
- 7)thresholds and thresholds_weak should return a collection (Array or Hash) when block not given
23
- merge this two methods into one parametrized method
24
- 8)(TODO: for theoretically consistency, while making small inconsistences to old calculations)
25
- When we work with strong threshold, we round matrix up(in order to overrate threshold comparing to real thus taking underrated pvalue) and take upper bound of discrete-thresholds fork.
26
- When we are estimating lower bound of threshold (weak threshold) we take lower bound of fork of discrete thresholds. But we should ALSO (not done yet) take matrix discreted down! This'd allow us give exact answer on a question in which range real threshold should lay with given P-value, now we correctly estimate only lower bound of threshold(upper bound of P-value)
27
- 9) (may be) Option to specify predefined query motif threshold in scan_collection
28
- 10) Fix Readme!
29
-
30
- Specs and tests:
31
- create spec on use of MaxHashSize, MaxHashSizeDouble
32
- create spec for testing case when {real_pvalue == 0, threshold == best_score + 1}
33
- create test for nonuniform word-wise background([1,1,1,1]) and for different backgrounds
34
-
35
- Ideas to increase perfomance:
36
- - Add shifting matrix elements to zero after discreeting - in such case worst suffix is zero at all positions (??! it can significantly obscure code because thresholds will be changed too, and I can't tell what is better: slight perfomance optimization or conciseness of code)
37
- - (?) Make rearrangment of rows by DIC decreasing in aligned pair of matrices before counting
38
- - Create JAVA extension for alignment_intersection methods in order to increase perfomance
39
- - Possibly algorithm shouldn't use hash but had two iterations: at first it determines possible hash scores for every length(if worst suffix is always zero, its flat space of scores at all pwm prefix lengths) of each pwm separately. And after that we can work with arrays which use such scores as indices via additional substructure
40
-
41
- Usability issues:
42
- make preprocess_collection be able to add information to existing collection of motifs. Make able to give collection a name from command line
43
-
44
- remove .stdin placeholder. Use tty? method instead
45
-
46
- use OptionParser or docopt
1
+ Why Helpers.find_threshold_output('SP1_f1.pwm 0.8 -d 10') on java works only for pvalues less or equal than 0.5, but 0.55 or 0.8 breaks the program (it doesn't stop). My hypothesis is that gauss threshold estimation fails
2
+
3
+
4
+ Make --same-strand mode which disallows revcomp transformation of motif - so it'll be possible to compare RNA-PWMs.
5
+
6
+ Fix align_motifs so that keys can go after arguments (use OptionParser in other words)
7
+
8
+ ToDo:
9
+ 6)
10
+ # TODO: FIX: this test fails due to floating point precision error: estimated threshold is -19.0418 but '-19.0418'.to_f * 10000 = -190417.99999999997
11
+ # A workaround exists: we can use fractions, i.e. ('-19.0418'.to_r * 10000).to_f = -190418.0 but it obscures code and being used uncarefully can involve huge slowdown.
12
+ # I think, it'd be used only at input to workaround discretization issue
13
+ #
14
+ # def test_process_large_pvalue_floating_point_error
15
+ # pvalue, threshold, real_pvalue = nil, nil, nil
16
+ # assert_nothing_raised {
17
+ # pvalue, threshold, real_pvalue = Helpers.find_threshold_output('KLF4_f2.pwm -p 0.8').strip.split("\t")
18
+ # }
19
+ # assert_equal '0.8', pvalue
20
+ # assert_equal Helpers.obtain_pvalue_by_threshold("KLF4_f2.pwm #{threshold}"), real_pvalue
21
+ # end
22
+ 7)thresholds and thresholds_weak should return a collection (Array or Hash) when block not given
23
+ merge these two methods into one parametrized method
24
+ 8)(TODO: for theoretically consistency, while making small inconsistences to old calculations)
25
+ When we work with strong threshold, we round matrix up(in order to overrate threshold comparing to real thus taking underrated pvalue) and take upper bound of discrete-thresholds fork.
26
+ When we are estimating lower bound of threshold (weak threshold) we take lower bound of fork of discrete thresholds. But we should ALSO (not done yet) take matrix discreted down! This'd allow us give exact answer on a question in which range real threshold should lay with given P-value, now we correctly estimate only lower bound of threshold(upper bound of P-value)
27
+ 9) (may be) Option to specify predefined query motif threshold in scan_collection
28
+ 10) Fix Readme!
29
+
30
+ Specs and tests:
31
+ create spec on use of MaxHashSize, MaxHashSizeDouble
32
+ create spec for testing case when {real_pvalue == 0, threshold == best_score + 1}
33
+ create test for nonuniform word-wise background([1,1,1,1]) and for different backgrounds
34
+
35
+ Ideas to increase performance:
36
+ - Add shifting matrix elements to zero after discretization - in such case worst suffix is zero at all positions (??! it can significantly obscure code because thresholds will be changed too, and I can't tell what is better: slight performance optimization or conciseness of code)
37
+ - (?) Make rearrangement of rows by DIC decreasing in aligned pair of matrices before counting
38
+ - Create JAVA extension for alignment_intersection methods in order to increase performance
39
+ - Possibly algorithm shouldn't use hash but had two iterations: at first it determines possible hash scores for every length(if worst suffix is always zero, its flat space of scores at all pwm prefix lengths) of each pwm separately. And after that we can work with arrays which use such scores as indices via additional substructure
40
+
41
+ Usability issues:
42
+ make preprocess_collection be able to add information to existing collection of motifs. Make able to give collection a name from command line
43
+
44
+ remove .stdin placeholder. Use tty? method instead
45
+
46
+ use OptionParser or docopt
47
47
  make options more uniform so that some of them are reusable (and the question: can I apply two option parsers consecutively?)
@@ -1,5 +1,5 @@
1
- $bioinform_folder = File.expand_path(File.join(File.dirname(__FILE__), '..', '..', 'bioinform', 'lib'))
2
- $LOAD_PATH.unshift $bioinform_folder
3
-
4
- require 'benchmark'
1
+ $bioinform_folder = File.expand_path(File.join(File.dirname(__FILE__), '..', '..', 'bioinform', 'lib'))
2
+ $LOAD_PATH.unshift $bioinform_folder
3
+
4
+ require 'benchmark'
5
5
  require_relative '../lib/macroape'