macroape 3.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. data/.gitignore +18 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE +22 -0
  4. data/README.md +61 -0
  5. data/Rakefile +7 -0
  6. data/bin/eval_alignment +3 -0
  7. data/bin/eval_similarity +3 -0
  8. data/bin/find_pvalue +3 -0
  9. data/bin/find_threshold +3 -0
  10. data/bin/preprocess_collection +3 -0
  11. data/bin/scan_collection +3 -0
  12. data/lib/macroape/aligned_pair_intersection.rb +136 -0
  13. data/lib/macroape/aligned_pair_metrics.rb +24 -0
  14. data/lib/macroape/aligned_pair_transformations.rb +23 -0
  15. data/lib/macroape/collection.rb +15 -0
  16. data/lib/macroape/count_by_threshold.rb +34 -0
  17. data/lib/macroape/exec/eval_alignment.rb +141 -0
  18. data/lib/macroape/exec/eval_similarity.rb +107 -0
  19. data/lib/macroape/exec/find_pvalue.rb +80 -0
  20. data/lib/macroape/exec/find_threshold.rb +76 -0
  21. data/lib/macroape/exec/preprocess_collection.rb +94 -0
  22. data/lib/macroape/exec/scan_collection.rb +124 -0
  23. data/lib/macroape/extract_pwm.rb +32 -0
  24. data/lib/macroape/gauss_estimation.rb +30 -0
  25. data/lib/macroape/matrix_information.rb +29 -0
  26. data/lib/macroape/matrix_on_background.rb +16 -0
  27. data/lib/macroape/matrix_transformations.rb +29 -0
  28. data/lib/macroape/pair_metrics.rb +9 -0
  29. data/lib/macroape/pair_transformations.rb +28 -0
  30. data/lib/macroape/pwm_compare.rb +10 -0
  31. data/lib/macroape/pwm_compare_aligned.rb +13 -0
  32. data/lib/macroape/single_matrix.rb +45 -0
  33. data/lib/macroape/support.rb +34 -0
  34. data/lib/macroape/threshold_by_pvalue.rb +68 -0
  35. data/lib/macroape/version.rb +3 -0
  36. data/lib/macroape.rb +26 -0
  37. data/macroape.gemspec +17 -0
  38. data/test/data/AHR_si.pat +10 -0
  39. data/test/data/KLF4_f2.pat +11 -0
  40. data/test/data/KLF4_f2_scan_results_all.txt +4 -0
  41. data/test/data/KLF4_f2_scan_results_default_cutoff.txt +3 -0
  42. data/test/data/KLF4_f2_scan_results_precise_mode.txt +4 -0
  43. data/test/data/SP1_f1.pat +12 -0
  44. data/test/data/SP1_f1_revcomp.pat +12 -0
  45. data/test/data/test_collection/GABPA_f1.pat +14 -0
  46. data/test/data/test_collection/KLF4_f2.pat +11 -0
  47. data/test/data/test_collection/SP1_f1.pat +12 -0
  48. data/test/data/test_collection.yaml +186 -0
  49. data/test/macroape_test.rb +125 -0
  50. metadata +116 -0
data/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ TODO.txt
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in macroape.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2011-2012 Ilya Vorontsov, Ivan Kulakovskiy, Vsevolod Makeev
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,61 @@
1
+ # Macroape
2
+
3
+ Macroape is an abbreviation for MAtrix CompaRisOn by Approximate P-value Estimation. It is a bioinformatics tool for evaluating a similarity measure between a pair of Position Weight Matrices. The approach and its applications are described in the manual at https://docs.google.com/document/pub?id=1_jsxhMNzMzy4d2d_byAd3n6Szg5gEcqG_Sf7w9tEqWw
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'macroape'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install macroape
18
+
19
+ ## Usage
20
+ For more information, read the manual at https://docs.google.com/document/pub?id=1_jsxhMNzMzy4d2d_byAd3n6Szg5gEcqG_Sf7w9tEqWw (not the latest version, but a comprehensive description of the approach).
21
+
22
+ ## Basic usage as a command-line tool
23
+ MacroAPE has 6 command-line tools:
24
+
25
+ ### Tools for calculating thresholds and pvalues:
26
+ * find_threshold \<PWM file\> [-p \<pvalue\> (by default: 0.0005)]
27
+ * find_pvalue \<PWM file\> \<threshold\>
28
+
29
+ ### Tools for evaluating the Jaccard similarity measure in the best alignment and in a given alignment:
30
+ * eval_similarity \<first PWM file\> \<second PWM file\>
31
+ * eval_alignment \<first PWM file\> \<second PWM file\> \<shift of second matrix\> \<orientation of second matrix(direct|revcomp)\>
32
+
33
+ ### Tools for searching a collection for the motifs most similar to a query motif
34
+ * preprocess_collection \<folder with motif files\> [-o \<collection output file\>]
35
+ * scan_collection \<query PWM file\> \<collection file\>
36
+
37
+ You can also use the -h option to print help for a tool in the console.
38
+ There are many command-line options. The most useful one is -d \<discretization=1|10|100|1000\>: by specifying a discretization you can trade precision for speed. For more information, see the manual.
39
+
40
+ ## Basic usage in your code
41
+ require 'macroape'
42
+ background = [1,1,1,1]
43
+ discretization = 10
44
+ first_pwm_matrix = [[1,2,3,4], [1,2,3,4], [4,1,2,3,], [5,3,2,4], [4,1,2,3], [7,8,9,11]]
45
+ pwm_first = PWM::SingleMatrix.new(first_pwm_matrix).with_background(background).discrete(discretization)
46
+ pwm_second = PWM::SingleMatrix.load_pat('another_pwm.pat').with_background(background).discrete(discretization)
47
+ cmp = PWMCompare::PWMCompare.new(pwm_first, pwm_second)
48
+ first_threshold = pwm_first.threshold(pvalue)
49
+ second_threshold = pwm_second.threshold(pvalue)
50
+ similarity_info = cmp.jaccard(first_threshold, second_threshold)
51
+ puts "Jaccard similarity: #{similarity_info[:similarity]}"
52
+
53
+ ## Contributing
54
+
55
+ 1. Fork it
56
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
57
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
58
+ 4. Push to the branch (`git push origin my-new-feature`)
59
+ 5. Create new Pull Request
60
+
61
+ Copyright (c) 2011-2012 Ilya Vorontsov, Ivan Kulakovskiy, Vsevolod Makeev
data/Rakefile ADDED
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+
4
+ desc 'Test all functionality of gem executables'
5
+ task :test do
6
+ system("ruby test/macroape_test.rb")
7
+ end
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'macroape/exec/eval_alignment.rb'
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'macroape/exec/eval_similarity.rb'
data/bin/find_pvalue ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'macroape/exec/find_pvalue.rb'
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'macroape/exec/find_threshold.rb'
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'macroape/exec/preprocess_collection.rb'
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'macroape/exec/scan_collection.rb'
@@ -0,0 +1,136 @@
module PWMCompare
  # Counts the words that pass the thresholds of BOTH matrices of an aligned pair.
  #
  # The including object must respond to `first`, `second` (the two matrices)
  # and `length` (the number of columns that are walked). Each matrix is asked
  # for `matrix`, `probabilities`, `sum_of_probabilities`, `worst_suffix` and
  # `best_suffix`.
  #
  # NOTE(review): `worst_suffix[i]` / `best_suffix[i]` are presumably the lowest /
  # highest score still obtainable from columns i..end - confirm against their
  # definition, which is not part of this module.
  #
  # All three counters walk the columns left to right and keep a table
  #   score_on_first_pwm => { score_on_second_pwm => accumulated count }
  # for the prefixes that are still undecided. A prefix is counted at once
  # (weighted by the number/probability of all possible endings) as soon as
  # both scores reach `threshold - worst_suffix`, and it is dropped as soon as
  # either score falls below `threshold - best_suffix`.
  #
  # When a constant `MaxHashSize` is visible from this module, each counter
  # raises a RuntimeError ('Hash overflow in ...') once the table is considered
  # too large; without the constant no limit is applied.
  #
  # NOTE(review): the size estimate iterates the outer Hash, which yields
  # [key, inner_hash] pairs, so every outer key contributes exactly 2. It is
  # therefore 2 * (number of distinct first-PWM scores), not the number of
  # stored score pairs. Kept as is, because existing limits (the executables
  # default to 1000) are presumably tuned against this estimate - confirm
  # before switching to a count of the real entries.
  module AlignedPairIntersection

    # Dispatches to the cheapest counter that fits the backgrounds of the pair.
    #
    # Returns a two-element array: the count for the background of the first
    # matrix and the count for the background of the second matrix.
    def counts_for_two_matrices(threshold_first, threshold_second)
      if first.probabilities != second.probabilities
        counts_for_two_matrices_with_different_probabilities(threshold_first, threshold_second)
      elsif first.probabilities == [1,1,1,1]
        common_words_for_two_matrices(threshold_first, threshold_second)
      else
        counts_for_two_matrices_with_same_probabilities(threshold_first, threshold_second)
      end
    end

    # Counter for matrices with different backgrounds: every table cell holds a
    # pair of counts, one weighted by each background.
    #
    # Returns [count_on_first_probabilities, count_on_second_probabilities].
    def counts_for_two_matrices_with_different_probabilities(threshold_first, threshold_second)
      # score_on_first_pwm => { score_on_second_pwm => [count_on_first_probabilities, count_on_second_probabilities] }
      scores = { 0 => { 0 => [1, 1] } }
      result_first = 0.0
      result_second = 0.0
      length.times do |column|
        columns_left = length - column - 1
        ending_weight_first = first.sum_of_probabilities ** columns_left
        ending_weight_second = second.sum_of_probabilities ** columns_left
        already_enough_first = threshold_first - first.worst_suffix[column + 1]
        already_enough_second = threshold_second - second.worst_suffix[column + 1]
        least_sufficient_first = threshold_first - first.best_suffix[column + 1]
        least_sufficient_second = threshold_second - second.best_suffix[column + 1]

        new_scores = Hash.new { |table, key| table[key] = Hash.new { |row, key2| row[key2] = [0, 0] } }
        scores.each do |score_first, second_scores|
          second_scores.each do |score_second, count|
            4.times do |letter|
              new_score_first = score_first + first.matrix[column][letter]
              if new_score_first >= already_enough_first
                new_score_second = score_second + second.matrix[column][letter]
                if new_score_second >= already_enough_second
                  # Both thresholds are passed whatever follows: count all endings now.
                  result_first += count[0] * first.probabilities[letter] * ending_weight_first
                  result_second += count[1] * second.probabilities[letter] * ending_weight_second
                elsif new_score_second >= least_sufficient_second
                  new_scores[new_score_first][new_score_second][0] += count[0] * first.probabilities[letter]
                  new_scores[new_score_first][new_score_second][1] += count[1] * second.probabilities[letter]
                end
              elsif new_score_first >= least_sufficient_first
                new_score_second = score_second + second.matrix[column][letter]
                if new_score_second >= least_sufficient_second
                  new_scores[new_score_first][new_score_second][0] += count[0] * first.probabilities[letter]
                  new_scores[new_score_first][new_score_second][1] += count[1] * second.probabilities[letter]
                end
              end
            end
          end
        end
        if defined?(MaxHashSize) && new_scores.inject(0) { |sum, pair| sum + pair.size } > MaxHashSize
          raise 'Hash overflow in PWMCompare::AlignedPairIntersection#counts_for_two_matrices_with_different_probabilities'
        end
        scores = new_scores
      end
      [result_first, result_second]
    end

    # Counter for matrices sharing one (non-uniform) background: a single
    # weighted count per table cell is enough.
    #
    # Returns [count, count] (the same value twice).
    def counts_for_two_matrices_with_same_probabilities(threshold_first, threshold_second)
      # score_on_first_pwm => { score_on_second_pwm => count weighted by the shared probabilities }
      scores = { 0 => { 0 => 1 } }
      result = 0.0
      probabilities = first.probabilities
      length.times do |column|
        ending_weight = first.sum_of_probabilities ** (length - column - 1)
        already_enough_first = threshold_first - first.worst_suffix[column + 1]
        already_enough_second = threshold_second - second.worst_suffix[column + 1]
        least_sufficient_first = threshold_first - first.best_suffix[column + 1]
        least_sufficient_second = threshold_second - second.best_suffix[column + 1]

        new_scores = Hash.new { |table, key| table[key] = Hash.new { |row, key2| row[key2] = 0 } }
        scores.each do |score_first, second_scores|
          second_scores.each do |score_second, count|
            4.times do |letter|
              new_score_first = score_first + first.matrix[column][letter]
              if new_score_first >= already_enough_first
                new_score_second = score_second + second.matrix[column][letter]
                if new_score_second >= already_enough_second
                  result += count * probabilities[letter] * ending_weight
                elsif new_score_second >= least_sufficient_second
                  new_scores[new_score_first][new_score_second] += count * probabilities[letter]
                end
              elsif new_score_first >= least_sufficient_first
                new_score_second = score_second + second.matrix[column][letter]
                if new_score_second >= least_sufficient_second
                  new_scores[new_score_first][new_score_second] += count * probabilities[letter]
                end
              end
            end
          end
        end
        if defined?(MaxHashSize) && new_scores.inject(0) { |sum, pair| sum + pair.size } > MaxHashSize
          raise 'Hash overflow in PWMCompare::AlignedPairIntersection#counts_for_two_matrices_with_same_probabilities'
        end
        scores = new_scores
      end
      [result, result]
    end

    # Counter for the [1,1,1,1] background: plain integer word counts, every
    # undecided column contributes a factor of 4 possible letters.
    #
    # Returns [count, count] (the same value twice).
    def common_words_for_two_matrices(threshold_first, threshold_second)
      # score_on_first_pwm => { score_on_second_pwm => number of words }
      scores = { 0 => { 0 => 1 } }
      result = 0
      length.times do |column|
        ending_weight = 4 ** (length - column - 1)
        already_enough_first = threshold_first - first.worst_suffix[column + 1]
        already_enough_second = threshold_second - second.worst_suffix[column + 1]
        least_sufficient_first = threshold_first - first.best_suffix[column + 1]
        least_sufficient_second = threshold_second - second.best_suffix[column + 1]

        new_scores = Hash.new { |table, key| table[key] = Hash.new { |row, key2| row[key2] = 0 } }
        scores.each do |score_first, second_scores|
          second_scores.each do |score_second, count|
            4.times do |letter|
              new_score_first = score_first + first.matrix[column][letter]
              if new_score_first >= already_enough_first
                new_score_second = score_second + second.matrix[column][letter]
                if new_score_second >= already_enough_second
                  result += count * ending_weight
                elsif new_score_second >= least_sufficient_second
                  new_scores[new_score_first][new_score_second] += count
                end
              elsif new_score_first >= least_sufficient_first
                new_score_second = score_second + second.matrix[column][letter]
                if new_score_second >= least_sufficient_second
                  new_scores[new_score_first][new_score_second] += count
                end
              end
            end
          end
        end

        if defined?(MaxHashSize) && new_scores.inject(0) { |sum, pair| sum + pair.size } > MaxHashSize
          raise 'Hash overflow in PWMCompare::AlignedPairIntersection#common_words_for_two_matrices'
        end
        scores = new_scores
      end
      [result, result]
    end

  end
end
@@ -0,0 +1,24 @@
module PWMCompare
  # Similarity metrics for an aligned pair of matrices. The including object
  # must respond to `first`, `second` and `counts_for_two_matrices`.
  module AlignedPairMetrics
    # Computes the Jaccard coefficient between the sets of words passing
    # `first_threshold` on the first matrix and `second_threshold` on the second.
    #
    # Returns a Hash with keys :similarity, :tanimoto (1.0 - similarity),
    # :recognized_by_both, :recognized_by_first and :recognized_by_second.
    # When either matrix recognizes nothing, :similarity and :tanimoto are -1
    # and :recognized_by_both is 0.
    def jaccard(first_threshold, second_threshold)
      recognized_by_first = first.counts_by_thresholds(first_threshold).first
      recognized_by_second = second.counts_by_thresholds(second_threshold).first

      if recognized_by_first == 0 || recognized_by_second == 0
        return {
          similarity: -1,
          tanimoto: -1,
          recognized_by_both: 0,
          recognized_by_first: recognized_by_first,
          recognized_by_second: recognized_by_second
        }
      end

      common = counts_for_two_matrices(first_threshold, second_threshold)
      # The two counts (one per background) are merged by their geometric mean.
      recognized_by_both = Math.sqrt(common[0] * common[1])
      union = recognized_by_first + recognized_by_second - recognized_by_both
      similarity = recognized_by_both.to_f / union

      {
        similarity: similarity,
        tanimoto: 1.0 - similarity,
        recognized_by_both: recognized_by_both,
        recognized_by_first: recognized_by_first,
        recognized_by_second: recognized_by_second
      }
    end
  end
end
@@ -0,0 +1,23 @@
module PWMCompare
  # Column reorderings of an aligned pair. The including object must respond
  # to `first` and `second`; every method returns a new PWMCompareAligned.
  module AlignedPairTransformations

    #def discrete(rate)
    #  PWMCompareAligned.new(first.discrete(rate), second.discrete(rate))
    #end

    # Reorders the columns of both matrices simultaneously. The block receives
    # a [first_column, second_column] pair and returns its sort key.
    #
    # NOTE(review): `SinglePWM(...)` and the `pwm` reader are not defined in any
    # code visible next to this module (other code reads `matrix`); verify that
    # both exist before relying on the sorting helpers.
    def sort_pair_of_matrices_by(&block)
      first_columns, second_columns = first.pwm.zip(second.pwm).sort_by(&block).transpose
      PWMCompareAligned.new(SinglePWM(first_columns, first.probabilities),
                            SinglePWM(second_columns, second.probabilities))
    end

    # Columns ordered by decreasing maximum of the first matrix's column.
    # sort_pair_of_matrices_by already builds the PWMCompareAligned, so its
    # result is returned directly instead of being splatted into another
    # PWMCompareAligned.new call.
    def sort_decreasing_max
      sort_pair_of_matrices_by { |column_pair| -column_pair[0].max }
    end

    # Columns ordered by increasing minimum of the first matrix's column.
    def sort_increasing_min
      sort_pair_of_matrices_by { |column_pair| column_pair[0].min }
    end

    # Applies the same column permutation to both matrices.
    def permute_columns(permutation_index)
      PWMCompareAligned.new(first.permute(permutation_index), second.permute(permutation_index))
    end

  end
end
@@ -0,0 +1,15 @@
module PWM
  # A set of matrices stored together with the settings they were collected
  # with (two discretization levels, a background and a list of P-values).
  # Matrices and their infos are kept in two Hashes keyed by `pwm.name`.
  class Collection
    attr_reader :rough_discretization, :precise_discretization, :background, :pvalues, :pwms, :infos

    def initialize(rough_discretization, precise_discretization, background, pvalues)
      @rough_discretization = rough_discretization
      @precise_discretization = precise_discretization
      @background = background
      @pvalues = pvalues
      @pwms = {}
      @infos = {}
    end

    # Registers `pwm` and its `info` under `pwm.name`; an entry that already
    # uses this name is replaced. Returns `info`.
    def add_pwm(pwm, info)
      pwms.store(pwm.name, pwm)
      infos.store(pwm.name, info)
    end
  end
end
@@ -0,0 +1,34 @@
module PWM
  # Word counts / P-values for given score thresholds. The including object
  # must provide `calculate_count_distribution_after_threshold` (a collection
  # of score/count pairs) and `number_of_words`.
  module CountByThreshold
    # For every threshold returns, as a Float and in the order the thresholds
    # were given, the total count of all scores that are not below it.
    #
    # The distribution is requested once, for the lowest threshold, and walked
    # from the highest score down while the thresholds are served from the
    # highest to the lowest, so the running total is shared between them.
    def counts_by_thresholds(*thresholds)
      distribution = calculate_count_distribution_after_threshold(thresholds.min)
      descending_scores = distribution.sort.reverse

      totals = Array.new(thresholds.size)
      running_total = 0
      position = 0
      thresholds.each_index.sort_by { |index| thresholds[index] }.reverse_each do |index|
        until position >= descending_scores.size || descending_scores[position][0] < thresholds[index]
          running_total += descending_scores[position][1]
          position += 1
        end
        totals[index] = running_total.to_f
      end
      totals
    end

    # Share of all words whose score is not below `threshold`.
    def pvalue_by_threshold(threshold)
      counts_by_thresholds(threshold).first / number_of_words
    end
  end
end
@@ -0,0 +1,141 @@
# Command-line tool: evaluates the similarity of two matrices in one given
# alignment (shift and orientation of the second matrix are taken from ARGV).
help_string = %q{
Command-line format:
ruby eval_alignment.rb <1st matrix pat-file> <2nd matrix pat-file> <shift> <orientation(direct/revcomp)> [options]
type <1st matrix pat-file> <2nd matrix pat-file> | ruby eval_alignment.rb .stdin .stdin <shift> <orientation(direct/revcomp)> [options]
or in linux
cat <1st matrix pat-file> <2nd matrix pat-file> | ruby eval_alignment.rb .stdin .stdin <shift> <orientation(direct/revcomp)> [options]

Options:
[-p <P-value>]
[-d <discretization level>]
[-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]

Output format:
<jaccard similarity coefficient>
<number of words recognized by both 1st and 2nd matrices | probability to draw a word recognized by both 1st and 2nd matrices> <length of the given alignment>
<aligned 1st matrix>
<aligned 2nd matrix>
<shift> <orientation>

Examples:
ruby eval_alignment.rb motifs/KLF4_f2.pat motifs/SP1_f1.pat -1 direct -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
or on windows
type motifs/SP1.pat | ruby eval_alignment.rb motifs/KLF4.pat .stdin 0 revcomp -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
or in linux
cat motifs/KLF4.pat motifs/SP1.pat | ruby eval_alignment.rb .stdin .stdin 3 direct -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
}

require 'macroape'

if ARGV.empty? || ARGV.include?('-h') || ARGV.include?('-help') || ARGV.include?('--help') || ARGV.include?('--h')
  STDERR.puts help_string
  exit
end

# Defaults; overridden by the options parsed below.
pvalue = 0.0005
discretization = 10

first_background = [1,1,1,1]
second_background = [1,1,1,1]

begin
  # Positional arguments: two input sources, shift and orientation.
  first_file = ARGV.shift
  second_file = ARGV.shift

  shift = ARGV.shift
  orientation = ARGV.shift

  raise "You'd specify two input sources (each is filename or .stdin)" unless first_file && second_file
  raise 'You\'d specify shift' unless shift
  raise 'You\'d specify orientation' unless orientation

  shift = shift.to_i
  orientation = orientation.to_sym

  case orientation
  when :direct
    reverse = false
  when :revcomp
    reverse = true
  else
    raise 'Unknown orientation(direct/revcomp)'
  end

  # Options. Arguments that match none of the flags are skipped silently.
  until ARGV.empty?
    case ARGV.shift
    when '-p'
      pvalue = ARGV.shift.to_f
    when '-d'
      discretization = ARGV.shift.to_f
    when '-m'
      PWM::MaxHashSize = ARGV.shift.to_f
    when '-md'
      PWMCompare::MaxHashSize = ARGV.shift.to_f
    when '-b'
      second_background = first_background = ARGV.shift(4).map(&:to_f)
    when '-b1'
      first_background = ARGV.shift(4).map(&:to_f)
    when '-b2'
      second_background = ARGV.shift(4).map(&:to_f)
    end
  end
  # The check is "background equals its own reverse", i.e. p(A)=p(T) and p(C)=p(G).
  raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background == first_background.reverse
  raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless second_background == second_background.reverse

  # Default limits for the score tables unless -m / -md set them.
  PWM::MaxHashSize = 1000000 unless defined? PWM::MaxHashSize
  PWMCompare::MaxHashSize = 1000 unless defined? PWMCompare::MaxHashSize

  uses_stdin = first_file == '.stdin' || second_file == '.stdin'

  if uses_stdin
    # The whole standard input is copied into a pipe, from which extract_pwm
    # takes the matrices one after another.
    # NOTE(review): all input is written before anything is read; input larger
    # than the OS pipe buffer would presumably block here - confirm.
    r_stream, w_stream = IO.pipe
    STDIN.readlines.each { |line| w_stream.write(line) }
    w_stream.close
  end

  if first_file == '.stdin'
    r_stream, w_stream, extracted_pwm = extract_pwm(r_stream, w_stream)
    pwm_first = PWM::SingleMatrix.load_from_line_array(extracted_pwm).with_background(first_background).discrete(discretization)
  else
    raise "Error! File #{first_file} don't exist" unless File.exist?(first_file)
    pwm_first = PWM::SingleMatrix.load_pat(first_file).with_background(first_background).discrete(discretization)
  end

  if second_file == '.stdin'
    r_stream, w_stream, extracted_pwm = extract_pwm(r_stream, w_stream)
    pwm_second = PWM::SingleMatrix.load_from_line_array(extracted_pwm).with_background(second_background).discrete(discretization)
  else
    raise "Error! File #{second_file} don't exist" unless File.exist?(second_file)
    pwm_second = PWM::SingleMatrix.load_pat(second_file).with_background(second_background).discrete(discretization)
  end

  r_stream.close if uses_stdin

  pwm_second = pwm_second.reverse_complement if reverse

  # Textual picture of the alignment: '.' marks a position not covered by the
  # matrix, '>' a direct and '<' a reverse-complement matrix position.
  first_pwm_alignment = '.' * [-shift, 0].max + '>' * pwm_first.length
  second_pwm_alignment = '.' * [shift, 0].max + (orientation == :direct ? '>' : '<') * pwm_second.length
  overlap = [pwm_first.length + [-shift,0].max, pwm_second.length + [shift,0].max].min - shift.abs
  alignment_length = [first_pwm_alignment.length, second_pwm_alignment.length].max
  # Pad the shorter picture on the right so that both have the same length.
  (first_pwm_alignment.length...alignment_length).each { |i| first_pwm_alignment[i] = '.' }
  (second_pwm_alignment.length...alignment_length).each { |i| second_pwm_alignment[i] = '.' }

  cmp = PWMCompare::PWMCompareAligned.new(pwm_first.left_augment([-shift,0].max),
                                          pwm_second.left_augment([shift,0].max))

  first_threshold = pwm_first.threshold(pvalue)
  second_threshold = pwm_second.threshold(pvalue)

  info = cmp.jaccard(first_threshold, second_threshold).merge(
    text: "#{first_pwm_alignment}\n#{second_pwm_alignment}",
    shift: shift,
    orientation: orientation,
    overlap: overlap,
    alignment_length: alignment_length)

  puts "#{info[:similarity]}\n#{info[:recognized_by_both]}\t#{info[:alignment_length]}\n#{info[:text]}\n#{info[:shift]}\t#{info[:orientation]}"

rescue => err
  STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
  # Report the failure to the calling shell as well, not only on STDERR.
  exit 1
end
@@ -0,0 +1,107 @@
# Command-line tool: evaluates the similarity of two matrices; the alignment
# details printed at the end are taken from the result of PWMCompare#jaccard.
help_string = %q{
Command-line format:
ruby eval_similarity.rb <1st matrix pat-file> <2nd matrix pat-file> [options]
or on windows
type <1st matrix pat-file> <2nd matrix pat-file> | ruby eval_similarity.rb .stdin .stdin [options]
or in linux
cat <1st matrix pat-file> <2nd matrix pat-file> | ruby eval_similarity.rb .stdin .stdin [options]

Options:
[-p <P-value>]
[-d <discretization level>]
[-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]

Output has format:
<jaccard similarity coefficient>
<number of words recognized by both 1st and 2nd matrices | probability to draw a word recognized by both 1st and 2nd matrices> <length of the optimal alignment>
<optimal alignment, the 1st matrix>
<optimal alignment, the 2nd matrix>
<shift> <orientation>

Examples:
ruby eval_similarity.rb motifs/KLF4.pat motifs/SP1.pat -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
or on windows
type motifs/SP1.pat | ruby eval_similarity.rb motifs/KLF4.pat .stdin -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
or in linux
cat motifs/KLF4.pat motifs/SP1.pat | ruby eval_similarity.rb .stdin .stdin -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
}

require 'macroape'

if ARGV.empty? || ARGV.include?('-h') || ARGV.include?('-help') || ARGV.include?('--help') || ARGV.include?('--h')
  STDERR.puts help_string
  exit
end

# Defaults; overridden by the options parsed below.
pvalue = 0.0005
discretization = 10

first_background = [1,1,1,1]
second_background = [1,1,1,1]

begin
  # Positional arguments: the two input sources.
  first_file = ARGV.shift
  second_file = ARGV.shift
  raise "You'd specify two input sources (each is filename or .stdin)" unless first_file && second_file

  # Options. Arguments that match none of the flags are skipped silently.
  until ARGV.empty?
    case ARGV.shift
    when '-p'
      pvalue = ARGV.shift.to_f
    when '-d'
      discretization = ARGV.shift.to_f
    when '-m'
      PWM::MaxHashSize = ARGV.shift.to_f
    when '-md'
      PWMCompare::MaxHashSize = ARGV.shift.to_f
    when '-b'
      second_background = first_background = ARGV.shift(4).map(&:to_f)
    when '-b1'
      first_background = ARGV.shift(4).map(&:to_f)
    when '-b2'
      second_background = ARGV.shift(4).map(&:to_f)
    end
  end
  # The check is "background equals its own reverse", i.e. p(A)=p(T) and p(C)=p(G).
  raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background == first_background.reverse
  raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless second_background == second_background.reverse

  # Default limits for the score tables unless -m / -md set them.
  PWM::MaxHashSize = 1000000 unless defined? PWM::MaxHashSize
  PWMCompare::MaxHashSize = 1000 unless defined? PWMCompare::MaxHashSize

  uses_stdin = first_file == '.stdin' || second_file == '.stdin'

  if uses_stdin
    # The whole standard input is copied into a pipe, from which extract_pwm
    # takes the matrices one after another.
    # NOTE(review): all input is written before anything is read; input larger
    # than the OS pipe buffer would presumably block here - confirm.
    r_stream, w_stream = IO.pipe
    STDIN.readlines.each { |line| w_stream.write(line) }
    w_stream.close
  end

  if first_file == '.stdin'
    r_stream, w_stream, extracted_pwm = extract_pwm(r_stream, w_stream)
    pwm_first = PWM::SingleMatrix.load_from_line_array(extracted_pwm).with_background(first_background).discrete(discretization)
  else
    raise "Error! File #{first_file} don't exist" unless File.exist?(first_file)
    pwm_first = PWM::SingleMatrix.load_pat(first_file).with_background(first_background).discrete(discretization)
  end

  if second_file == '.stdin'
    r_stream, w_stream, extracted_pwm = extract_pwm(r_stream, w_stream)
    pwm_second = PWM::SingleMatrix.load_from_line_array(extracted_pwm).with_background(second_background).discrete(discretization)
  else
    raise "Error! File #{second_file} don't exist" unless File.exist?(second_file)
    pwm_second = PWM::SingleMatrix.load_pat(second_file).with_background(second_background).discrete(discretization)
  end

  r_stream.close if uses_stdin

  cmp = PWMCompare::PWMCompare.new(pwm_first, pwm_second)

  first_threshold = pwm_first.threshold(pvalue)
  second_threshold = pwm_second.threshold(pvalue)

  info = cmp.jaccard(first_threshold, second_threshold)

  puts "#{info[:similarity]}\n#{info[:recognized_by_both]}\t#{info[:alignment_length]}\n#{info[:text]}\n#{info[:shift]}\t#{info[:orientation]}"

rescue => err
  STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
  # Report the failure to the calling shell as well, not only on STDERR.
  exit 1
end