macroape 3.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. data/.gitignore +18 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE +22 -0
  4. data/README.md +61 -0
  5. data/Rakefile +7 -0
  6. data/bin/eval_alignment +3 -0
  7. data/bin/eval_similarity +3 -0
  8. data/bin/find_pvalue +3 -0
  9. data/bin/find_threshold +3 -0
  10. data/bin/preprocess_collection +3 -0
  11. data/bin/scan_collection +3 -0
  12. data/lib/macroape/aligned_pair_intersection.rb +136 -0
  13. data/lib/macroape/aligned_pair_metrics.rb +24 -0
  14. data/lib/macroape/aligned_pair_transformations.rb +23 -0
  15. data/lib/macroape/collection.rb +15 -0
  16. data/lib/macroape/count_by_threshold.rb +34 -0
  17. data/lib/macroape/exec/eval_alignment.rb +141 -0
  18. data/lib/macroape/exec/eval_similarity.rb +107 -0
  19. data/lib/macroape/exec/find_pvalue.rb +80 -0
  20. data/lib/macroape/exec/find_threshold.rb +76 -0
  21. data/lib/macroape/exec/preprocess_collection.rb +94 -0
  22. data/lib/macroape/exec/scan_collection.rb +124 -0
  23. data/lib/macroape/extract_pwm.rb +32 -0
  24. data/lib/macroape/gauss_estimation.rb +30 -0
  25. data/lib/macroape/matrix_information.rb +29 -0
  26. data/lib/macroape/matrix_on_background.rb +16 -0
  27. data/lib/macroape/matrix_transformations.rb +29 -0
  28. data/lib/macroape/pair_metrics.rb +9 -0
  29. data/lib/macroape/pair_transformations.rb +28 -0
  30. data/lib/macroape/pwm_compare.rb +10 -0
  31. data/lib/macroape/pwm_compare_aligned.rb +13 -0
  32. data/lib/macroape/single_matrix.rb +45 -0
  33. data/lib/macroape/support.rb +34 -0
  34. data/lib/macroape/threshold_by_pvalue.rb +68 -0
  35. data/lib/macroape/version.rb +3 -0
  36. data/lib/macroape.rb +26 -0
  37. data/macroape.gemspec +17 -0
  38. data/test/data/AHR_si.pat +10 -0
  39. data/test/data/KLF4_f2.pat +11 -0
  40. data/test/data/KLF4_f2_scan_results_all.txt +4 -0
  41. data/test/data/KLF4_f2_scan_results_default_cutoff.txt +3 -0
  42. data/test/data/KLF4_f2_scan_results_precise_mode.txt +4 -0
  43. data/test/data/SP1_f1.pat +12 -0
  44. data/test/data/SP1_f1_revcomp.pat +12 -0
  45. data/test/data/test_collection/GABPA_f1.pat +14 -0
  46. data/test/data/test_collection/KLF4_f2.pat +11 -0
  47. data/test/data/test_collection/SP1_f1.pat +12 -0
  48. data/test/data/test_collection.yaml +186 -0
  49. data/test/macroape_test.rb +125 -0
  50. metadata +116 -0
@@ -0,0 +1,80 @@
# Command-line script: loads a PWM, applies a background, and for every given
# threshold prints "threshold<TAB>count<TAB>pvalue", where count comes from
# counts_by_thresholds on the discretized matrix and pvalue is
# count / pwm.number_of_words.
help_string = %q{
Command-line format:
ruby find_pvalue.rb <pat-file> <threshold list> [options]
or in linux
cat <pat-file> | ruby find_pvalue.rb .stdin <threshold> [options]
or on windows
type <pat-file> | ruby find_pvalue.rb .stdin <threshold> [options]

Options:
[-d <discretization level>]
[-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]

Output format:
threshold_1 count_1 pvalue_1
threshold_2 count_2 pvalue_2
threshold_3 count_3 pvalue_3
The results are printed out in the same order as in the given threshold list.

Examples:
ruby find_pvalue.rb motifs/KLF4.pat 7.32 -d 1000 -b 0.2 0.3 0.2 0.3
or on windows
type motifs/KLF4.pat | ruby find_pvalue.rb .stdin 7.32 4.31 5.42
or in linux
cat motifs/KLF4.pat | ruby find_pvalue.rb .stdin 7.32 4.31 5.42
}

require 'macroape'

if ARGV.empty? || %w[-h -help --help --h].any? { |flag| ARGV.include?(flag) }
  STDERR.puts help_string
  exit
end

discretization = 10000
background = [1, 1, 1, 1]
thresholds = []

# True when Float() accepts the argument; nil (ARGV exhausted) is not numeric.
numeric = lambda do |arg|
  begin
    Float(arg)
    true
  rescue ArgumentError, TypeError
    false
  end
end

begin
  filename = ARGV.shift

  # Everything numeric right after the file name is a threshold; the first
  # non-numeric argument starts the options.
  thresholds << Float(ARGV.shift) while numeric.call(ARGV.first)

  raise "No input. You should specify input source: filename or .stdin" unless filename
  raise 'You should specify at least one threshold' if thresholds.empty?

  # Float() instead of to_f: a missing or malformed option value is reported
  # as an error instead of silently becoming 0.0.
  until ARGV.empty?
    case ARGV.shift
    when '-b'
      background = ARGV.shift(4).map { |value| Float(value) }
    when '-d'
      discretization = Float(ARGV.shift)
    when '-m'
      PWM::MaxHashSize = Float(ARGV.shift)
    end
  end
  PWM::MaxHashSize = 1000000 unless defined? PWM::MaxHashSize

  if filename == '.stdin'
    pwm = PWM::SingleMatrix.load_from_stdin(STDIN)
  else
    raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
    pwm = PWM::SingleMatrix.load_pat(filename)
  end
  pwm = pwm.with_background(background)

  # Thresholds are scaled by the same factor as the discretized matrix.
  scaled_thresholds = thresholds.map { |threshold| threshold * discretization }
  counts = pwm.discrete(discretization).counts_by_thresholds(*scaled_thresholds)
  pvalues = counts.map { |count| count.to_f / pwm.number_of_words }
  pvalues.zip(thresholds, counts).each do |pvalue, threshold, count|
    puts "#{threshold}\t#{count}\t#{pvalue}"
  end
rescue => err
  STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
  exit 1 # report the failure to the calling shell
end
@@ -0,0 +1,76 @@
# Command-line script: loads a PWM, applies a background, and for every
# requested P-value prints "pvalue<TAB>threshold<TAB>real_pvalue" as yielded
# by thresholds() on the discretized matrix (threshold is divided back by the
# discretization level before printing).
help_string = %q{
Command-line format:
ruby find_threshold.rb <pat-file> [options]
or in linux
cat <pat-file> | ruby find_threshold.rb .stdin [options]
or on windows
type <pat-file> | ruby find_threshold.rb .stdin [options]

Options:
[-p <list of P-values>]
[-d <discretization level>]
[-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]

Output format:
requested_pvalue_1 threshold_1 achieved_pvalue_1
requested_pvalue_2 threshold_2 achieved_pvalue_2


Example:
ruby find_threshold.rb motifs/KLF4.pat -p 0.001 0.0001 0.0005 -d 1000 -b 0.4 0.3 0.2 0.1
}

require 'macroape'

if ARGV.empty? || %w[-h -help --help --h].any? { |flag| ARGV.include?(flag) }
  STDERR.puts help_string
  exit
end

background = [1, 1, 1, 1]
default_pvalues = [0.0005]
discretization = 10000

# True when Float() accepts the argument; nil (ARGV exhausted) is not numeric.
numeric = lambda do |arg|
  begin
    Float(arg)
    true
  rescue ArgumentError, TypeError
    false
  end
end

begin
  filename = ARGV.shift
  raise "No input. You should specify input source: filename or .stdin" unless filename

  pvalues = []
  # Float() instead of to_f: a missing or malformed option value is reported
  # as an error instead of silently becoming 0.0.
  until ARGV.empty?
    case ARGV.shift
    when '-b'
      background = ARGV.shift(4).map { |value| Float(value) }
    when '-m'
      PWM::MaxHashSize = Float(ARGV.shift)
    when '-p'
      # consume every numeric argument following -p
      pvalues << Float(ARGV.shift) while numeric.call(ARGV.first)
    when '-d'
      discretization = Float(ARGV.shift)
    end
  end
  pvalues = default_pvalues if pvalues.empty?

  PWM::MaxHashSize = 1000000 unless defined? PWM::MaxHashSize

  if filename == '.stdin'
    pwm = PWM::SingleMatrix.load_from_stdin(STDIN)
  else
    raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
    pwm = PWM::SingleMatrix.load_pat(filename)
  end

  pwm = pwm.with_background(background)

  pwm.discrete(discretization).thresholds(*pvalues) do |pvalue, threshold, real_pvalue|
    puts "#{pvalue}\t#{threshold / discretization}\t#{real_pvalue}"
  end
rescue => err
  STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
  exit 1 # report the failure to the calling shell
end
@@ -0,0 +1,94 @@
# Command-line script: for every file in a folder, loads it as a PWM, runs
# find_threshold.rb on it at the rough and the precise discretization level,
# collects the reported thresholds per P-value, and writes the resulting
# PWM::Collection to a YAML file.
help_string = %q{
Command-line format:
ruby preprocess_collection.rb <folder with PWMs> [options]

Options:
[-p <list of P-values>]
[-d <rough discretization> <precise discretization>]
[-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]
[-o <output file>]
[--silent] - don't show current progress information during scan (by default this information's written into stderr)

The tool stores preprocessed PWM collection to the specified YAML-file.

Example:
ruby preprocess_collection.rb ./motifs -p 0.001 0.0005 0.0001 -d 1 10 -b 0.2 0.3 0.2 0.3 -o collection.yaml
}

require 'macroape'

if ARGV.empty? || %w[-h -help --help --h].any? { |flag| ARGV.include?(flag) }
  STDERR.puts help_string
  exit
end

default_pvalues = [0.0005]
background = [1, 1, 1, 1]
rough_discretization = 1
precise_discretization = 10
output_file = 'collection.yaml'

# True when Float() accepts the argument; nil (ARGV exhausted) is not numeric.
numeric = lambda do |arg|
  begin
    Float(arg)
    true
  rescue ArgumentError, TypeError
    false
  end
end

begin
  folder = ARGV.shift
  raise "No input. You should specify folder with pat-files" unless folder
  raise "Error! Folder #{folder} doesn't exist" unless Dir.exist?(folder)

  pvalues = []
  silent = false
  # Float() instead of to_f: a missing or malformed option value is reported
  # as an error instead of silently becoming 0.0.
  until ARGV.empty?
    case ARGV.shift
    when '-b'
      background = ARGV.shift(4).map { |value| Float(value) }
      raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless background == background.reverse
    when '-p'
      # consume every numeric argument following -p
      pvalues << Float(ARGV.shift) while numeric.call(ARGV.first)
    when '-d'
      # the smaller of the two levels is always used as the rough one
      rough_discretization, precise_discretization = ARGV.shift(2).map { |value| Float(value) }.sort
    when '-o'
      output_file = ARGV.shift
    when '-m'
      PWM::MaxHashSize = Float(ARGV.shift)
    when '-md'
      PWMCompare::MaxHashSize = Float(ARGV.shift)
    when '--silent'
      silent = true
    end
  end
  pvalues = default_pvalues if pvalues.empty?

  PWM::MaxHashSize = 1000000 unless defined? PWM::MaxHashSize
  PWMCompare::MaxHashSize = 1000 unless defined? PWMCompare::MaxHashSize

  collection = PWM::Collection.new(rough_discretization, precise_discretization, background, pvalues)

  find_threshold_script = File.join(File.dirname(__FILE__), 'find_threshold.rb')
  discretizations = { rough: rough_discretization, precise: precise_discretization }

  Dir.glob(File.join(folder, '*')) do |filename|
    STDERR.puts filename unless silent
    pwm = PWM::SingleMatrix.load_pat(filename)
    info = { rough: {}, precise: {} }

    discretizations.each do |mode, discretization|
      # The command is passed as an argument array, so no shell is involved:
      # file names containing spaces or shell metacharacters are handed to the
      # child process verbatim.
      command = ['ruby', find_threshold_script, filename,
                 '-p', *pvalues.map(&:to_s),
                 '-b', *background.map(&:to_s),
                 '-d', discretization.to_s]
      output = IO.popen(command) { |io| io.read }
      # each output line is "pvalue<TAB>threshold<TAB>real_pvalue"
      output.split("\n").each do |line|
        pvalue, threshold = line.split.map(&:to_f)
        info[mode][pvalue] = threshold
      end
    end

    collection.add_pwm(pwm, info)
  end

  File.open(output_file, 'w') do |f|
    f.puts(collection.to_yaml)
  end
rescue => err
  STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
  exit 1 # report the failure to the calling shell
end
@@ -0,0 +1,124 @@
# Command-line script: compares a query PWM against every PWM of a
# preprocessed collection (YAML file) and prints the alignments whose
# similarity reaches the cutoff, most similar first.
help_string = %q{
Command-line format:
ruby scan_collection.rb <pat-file> <collection> [options]
or in linux
cat <pat-file> | ruby scan_collection.rb .stdin <collection> [options]
or on windows
type <pat-file> | ruby scan_collection.rb .stdin <collection> [options]

Options:
[-p <P-value>]
[-c <similarity cutoff (minimal similarity to be included in output)> ] or [--all], '-c 0.05' by default
[--precise [<level, minimal similarity to check on a more precise discretization level on the second pass>]], off by default, '--precise 0.05' if level is not set
[--silent] - don't show current progress information during scan (by default this information's written into stderr)

Output format:
<name> <similarity jaccard index> <shift> <overlap> <orientation> * [in case that result calculated on the second pass(in precise mode)]
Attention! Name can contain whitespace characters.
Attention! The shift and orientation are reported for the collection matrix relative to the query matrix.

Example:
ruby scan_collection.rb motifs/KLF4.pat collection.yaml -p 0.005
or in linux
cat motifs/KLF4.pat | ruby scan_collection.rb .stdin collection.yaml -p 0.005 --precise 0.03
}

require 'macroape'

if ARGV.empty? || %w[-h -help --help --h].any? { |flag| ARGV.include?(flag) }
  STDERR.puts help_string
  exit
end

# True when Float() accepts the argument; nil (ARGV exhausted) is not numeric.
numeric = lambda do |arg|
  begin
    Float(arg)
    true
  rescue ArgumentError, TypeError
    false
  end
end

begin
  filename = ARGV.shift
  collection_file = ARGV.shift
  raise "No input. You should specify input source for pat: filename or .stdin" unless filename
  raise "No input. You should specify input file with collection" unless collection_file
  raise "Collection file #{collection_file} doesn't exist" unless File.exist?(collection_file)

  pvalue = 0.0005
  cutoff = 0.05 # minimal similarity to output
  # NOTE(review): YAML.load_file instantiates the objects described in the
  # file; only collections from a trusted source should be scanned.
  collection = YAML.load_file(collection_file)
  background_query = collection.background

  silent = false
  precision_mode = :rough
  minimal_similarity = 0.05 # level used by --precise when none is given
  # Float() instead of to_f: a missing or malformed option value is reported
  # as an error instead of silently becoming 0.0.
  until ARGV.empty?
    case ARGV.shift
    when '-bq'
      background_query = ARGV.shift(4).map { |value| Float(value) }
      raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless background_query == background_query.reverse
    when '-p'
      pvalue = Float(ARGV.shift)
    when '-m'
      PWM::MaxHashSize = Float(ARGV.shift)
    when '-md'
      PWMCompare::MaxHashSize = Float(ARGV.shift)
    when '-c'
      cutoff = Float(ARGV.shift)
    when '--all'
      cutoff = 0.0
    when '--silent'
      silent = true
    when '--precise'
      precision_mode = :precise
      # the level is optional: take it only if the next argument is a number
      minimal_similarity = Float(ARGV.shift) if numeric.call(ARGV.first)
    end
  end
  PWM::MaxHashSize = 1000000 unless defined? PWM::MaxHashSize
  PWMCompare::MaxHashSize = 1000 unless defined? PWMCompare::MaxHashSize

  raise "Thresholds for pvalue #{pvalue} aren't presented in collection (#{collection.pvalues.join(', ')}). Use one of listed pvalues or recalculate the collection with needed pvalue" unless collection.pvalues.include? pvalue

  if filename == '.stdin'
    query_pwm = PWM::SingleMatrix.load_from_stdin(STDIN)
  else
    raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
    query_pwm = PWM::SingleMatrix.load_pat(filename)
  end

  query_pwm_rough = query_pwm.with_background(background_query).discrete(collection.rough_discretization)
  query_pwm_precise = query_pwm.with_background(background_query).discrete(collection.precise_discretization)

  threshold = query_pwm_rough.threshold(pvalue)
  threshold_precise = query_pwm_precise.threshold(pvalue)

  similarities = {}
  precision_file_mode = {}
  unnamed_index = 0

  collection.pwms.each_key do |key|
    pwm = collection.pwms[key]
    pwm_info = collection.infos[key]
    STDERR.puts pwm.name unless silent

    # first pass: rough discretization
    rough_pwm = pwm.with_background(collection.background).discrete(collection.rough_discretization)
    cmp = PWMCompare::PWMCompare.new(query_pwm_rough, rough_pwm)
    info = cmp.jaccard(threshold, pwm_info[:rough][pvalue] * collection.rough_discretization)
    name = pwm.name || "Unnamed #{unnamed_index += 1}"
    precision_file_mode[name] = :rough

    # second pass: recompute on the precise level for similar-enough matrices
    if precision_mode == :precise && info[:similarity] >= minimal_similarity
      precise_pwm = pwm.with_background(collection.background).discrete(collection.precise_discretization)
      cmp = PWMCompare::PWMCompare.new(query_pwm_precise, precise_pwm)
      info = cmp.jaccard(threshold_precise, pwm_info[:precise][pvalue] * collection.precise_discretization)
      precision_file_mode[name] = :precise
    end
    similarities[name] = info
  end

  puts "#pwm\tsimilarity\tshift\toverlap\torientation"
  ranked = similarities.sort_by { |_name, info| info[:similarity] }.reverse
  ranked.each do |name, info|
    next unless info[:similarity] >= cutoff
    precision_text = (precision_file_mode[name] == :precise) ? "\t*" : ""
    puts "#{name}\t#{info[:similarity]}\t#{info[:shift]}\t#{info[:overlap]}\t#{info[:orientation]}#{precision_text}"
  end
rescue => err
  STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
  exit 1 # report the failure to the calling shell
end
@@ -0,0 +1,32 @@
# Reads everything from r_stream, cuts the first PWM off the front and pushes
# the remaining lines into a fresh pipe.
#
# r_stream, w_stream - supposed to be the two ends of a pipe.
#
# The first non-blank line is taken as is (typically the PWM name or its first
# row). Following lines are added while they consist of at least 4 tokens that
# all parse as floats; the first line that does not ends the PWM and stays in
# the remaining data.
#
# Returns [new_r_stream, new_w_stream, extracted_pwm] where extracted_pwm is an
# Array of lines and the new pipe holds the unread remainder (its write end is
# already closed; r_stream is closed as well).
# Returns [r_stream, w_stream, nil] when the input is empty or consists only of
# blank lines.
def extract_pwm(r_stream, w_stream)
  lines = r_stream.readlines
  return [r_stream, w_stream, nil] if lines.empty?

  # Skip leading blank lines. The nil check has to come before strip:
  # when only blank lines are left, lines.shift returns nil.
  header = lines.shift
  while header.chomp == ''
    header = lines.shift
    return [r_stream, w_stream, nil] unless header
    header = header.strip
  end
  extracted_pwm = [header]

  r_stream.close

  numeric = lambda do |token|
    begin
      Float(token)
      true
    rescue ArgumentError
      false
    end
  end

  until lines.empty?
    tokens = lines.first.split
    # too few numbers or a non-numeric token: empty line or name of the next pwm
    break unless tokens.size >= 4 && tokens.all?(&numeric)
    extracted_pwm << lines.shift
  end

  new_r_stream, new_w_stream = IO.pipe
  lines.each { |one_line| new_w_stream.write(one_line) }
  new_w_stream.close

  [new_r_stream, new_w_stream, extracted_pwm]
end
@@ -0,0 +1,30 @@
module PWM
  # Normal-distribution estimate of a score threshold for a given P-value.
  #
  # The including class has to provide +matrix+ (an Array of columns, each
  # holding 4 scores), +probabilities+ (4 background weights) and
  # +sum_of_probabilities+.
  module GaussEstimation
    # Sum over all columns of the background-weighted mean score of the column.
    def score_mean
      bckgr = probabilities.map { |v| v.to_f / sum_of_probabilities }
      matrix.inject(0.0) do |mean, col|
        mean + 4.times.inject(0.0) { |sum, letter| sum + col[letter] * bckgr[letter] }
      end
    end

    # Sum over all columns of (weighted mean of squared scores minus squared
    # weighted mean), i.e. the per-column variances added up.
    def score_variance
      bckgr = probabilities.map { |v| v.to_f / sum_of_probabilities }
      matrix.inject(0.0) do |variance, col|
        mean_of_squares = 4.times.inject(0.0) { |sum, letter| sum + col[letter]**2 * bckgr[letter] }
        column_mean = 4.times.inject(0.0) { |sum, letter| sum + col[letter] * bckgr[letter] }
        variance + mean_of_squares - column_mean**2
      end
    end

    # Threshold estimate: score_mean plus n_ standard deviations, where
    # n_ = inverf2(1 - 2 * pvalue) * sqrt(2).
    def threshold_gauss_estimation(pvalue)
      sigma = Math.sqrt(score_variance)
      n_ = inverf2(1 - 2 * pvalue) * Math.sqrt(2)
      score_mean + n_ * sigma
    end

    # Closed-form expression in x built from logarithms and square roots;
    # odd in x (the sign of x is restored at the end).
    # NOTE(review): presumably an approximation of the inverse error function,
    # as the name suggests - confirm the formula's source.
    def inverf2(x)
      sign = x < 0 ? -1 : 1
      x = x.abs
      a = 8 / (3 * Math::PI) * (Math::PI - 3) / (4 - Math::PI)
      log_term = Math.log(1 - x * x)
      part0 = (2 / (Math::PI * a) + log_term / 2)**2
      part = -2 / (Math::PI * a) - log_term / 2 + Math.sqrt(-1 / a * log_term + part0)
      sign * Math.sqrt(part)
    end
  end
end
@@ -0,0 +1,29 @@
module PWM
  # Memoized summary values of a score matrix. The including class has to
  # provide +matrix+ (an Array of columns). Call refresh_infos after the
  # matrix has been replaced, otherwise stale cached values are returned.
  module MatrixInformation
    # Number of columns.
    def length
      @length ||= matrix.length
    end

    # Sum of the column maxima.
    def best_score
      @best_score ||= matrix.inject(0) { |sum, col| sum + col.max }
    end

    # Sum of the column minima.
    def worst_score
      @worst_score ||= matrix.inject(0) { |sum, col| sum + col.min }
    end

    # Array of length + 1 elements: element i is the sum of column maxima over
    # columns i...length (the last element is 0).
    def best_suffix
      return @best_suffix if @best_suffix
      @best_suffix = Array.new(length + 1, 0)
      # filled right to left: each entry builds on the one after it
      (length - 1).downto(0) do |pos|
        @best_suffix[pos] = matrix[pos].max + @best_suffix[pos + 1]
      end
      @best_suffix
    end

    # Same as best_suffix, but built from the column minima.
    def worst_suffix
      return @worst_suffix if @worst_suffix
      @worst_suffix = Array.new(length + 1, 0)
      (length - 1).downto(0) do |pos|
        @worst_suffix[pos] = matrix[pos].min + @worst_suffix[pos + 1]
      end
      @worst_suffix
    end

    # Drops all cached values; returns self so the call can be chained.
    def refresh_infos
      @length = @best_score = @worst_score = @best_suffix = @worst_suffix = nil
      self
    end
  end
end
@@ -0,0 +1,16 @@
module PWM
  # A SingleMatrix combined with 4 background weights (+probabilities+).
  class MatrixOnBackground < SingleMatrix
    attr_reader :probabilities

    # matrix     - passed on to SingleMatrix#initialize
    # background - stored as +probabilities+
    def initialize(matrix, background)
      super(matrix)
      @probabilities = background
    end

    # Sum of the background weights (memoized).
    def sum_of_probabilities
      @sum_of_probabilities ||= probabilities.inject(0.0, &:+)
    end

    # sum_of_probabilities raised to the power of the matrix length.
    def number_of_words
      sum_of_probabilities ** length
    end

    include GaussEstimation, ThresholdByPvalue, CountByThreshold
  end
end
@@ -0,0 +1,29 @@
module PWM
  # Non-destructive matrix transformations. Every method returns a modified
  # copy of the receiver (see clone_and_transform) with its cached infos reset;
  # the receiver itself is left untouched.
  #
  # The including class has to provide +matrix+, +length+ and +refresh_infos+.
  module MatrixTransformations
    # Reverses both the column order and the order of scores in each column.
    def reverse_complement
      clone_and_transform(matrix.reverse.map(&:reverse)).refresh_infos
    end

    # Prepends n all-zero columns.
    def left_augment(n)
      clone_and_transform(zero_columns(n) + matrix).refresh_infos
    end

    # Appends n all-zero columns.
    def right_augment(n)
      clone_and_transform(matrix + zero_columns(n)).refresh_infos
    end

    # Makes the worst score == 0 by shifting the scores of each column.
    def shift_to_zero
      shifted = matrix.map do |col|
        lowest = col.min
        col.map { |letter| letter - lowest }
      end
      clone_and_transform(shifted).refresh_infos
    end

    # Multiplies every score by rate and rounds it up to an integer.
    def discrete(rate)
      clone_and_transform(matrix.map { |col| col.map { |letter| (letter * rate).ceil } }).refresh_infos
    end

    # Returns two copies: the first length_of_first_part columns and the rest.
    def split(length_of_first_part)
      head = clone_and_transform(matrix.first(length_of_first_part)).refresh_infos
      tail = clone_and_transform(matrix.last(length - length_of_first_part)).refresh_infos
      [head, tail]
    end

    # Reorders columns: column i of the result is matrix[permutation_index[i]].
    def permute_columns(permutation_index)
      clone_and_transform(permutation_index.map { |col| matrix[col] }).refresh_infos
    end

    # Shallow copy of the receiver whose @matrix is replaced by new_matrix.
    def clone_and_transform(new_matrix)
      self.dup.instance_eval { @matrix = new_matrix; self }
    end

    private

    # n distinct zero columns. Array#* would repeat one and the same column
    # object n times, so changing one padding column in place would change
    # all of them.
    def zero_columns(n)
      Array.new(n) { [0.0] * 4 }
    end
  end
end
@@ -0,0 +1,9 @@
module PWMCompare
  # Metrics over all alignments of a pair. The including class has to provide
  # +map_each_align+, yielding an aligned pair and a Hash describing the
  # alignment.
  module PairMetrics
    # Evaluates jaccard(threshold_first, threshold_second) on every alignment,
    # merges the alignment description into each result Hash and returns the
    # one with the highest :similarity.
    def jaccard(threshold_first, threshold_second)
      results = map_each_align do |align, alignment_info|
        align.jaccard(threshold_first, threshold_second).merge(alignment_info)
      end
      results.max_by { |result| result[:similarity] }
    end
  end
end
@@ -0,0 +1,28 @@
module PWMCompare
  # Enumerates all relative placements of two matrices. The including class
  # has to provide +first+ and +second+, both responding to +length+,
  # +left_augment+ and (for +second+) +reverse_complement+.
  module PairTransformations
    include Enumerable

    # For every shift from -second.length to first.length and both
    # orientations (:direct, :revcomp) yields two values:
    #   * a PWMCompareAligned built from both matrices, each left-padded so
    #     that the second one starts +shift+ columns after the first one
    #     (before it when shift is negative);
    #   * a Hash with :text (two-line picture of the alignment: '.' padding,
    #     '>' direct columns, '<' reverse-complement columns), :shift,
    #     :orientation, :overlap and :alignment_length.
    def each
      second_rc = second.reverse_complement
      (-second.length..first.length).to_a.product([:direct, :revcomp]) do |shift, orientation|
        first_padding = [-shift, 0].max
        second_padding = [shift, 0].max
        arrow = (orientation == :direct) ? '>' : '<'

        first_pwm_alignment = '.' * first_padding + '>' * first.length
        second_pwm_alignment = '.' * second_padding + arrow * second.length
        overlap = [first.length + first_padding, second.length + second_padding].min - shift.abs
        alignment_length = [first_pwm_alignment.length, second_pwm_alignment.length].max
        # pad the shorter picture with dots on the right
        first_pwm_alignment = first_pwm_alignment.ljust(alignment_length, '.')
        second_pwm_alignment = second_pwm_alignment.ljust(alignment_length, '.')

        oriented_second = (orientation == :direct) ? second : second_rc
        aligned = PWMCompareAligned.new(first.left_augment(first_padding),
                                        oriented_second.left_augment(second_padding))
        yield(aligned,
              { text: "#{first_pwm_alignment}\n#{second_pwm_alignment}",
                shift: shift,
                orientation: orientation,
                overlap: overlap,
                alignment_length: alignment_length })
      end
    end

    alias :each_align :each
    alias :map_each_align :map
  end
end
@@ -0,0 +1,10 @@
module PWMCompare
  # A pair of matrices to be compared over all their alignments
  # (enumeration comes from PairTransformations, metrics from PairMetrics).
  class PWMCompare
    attr_reader :first, :second

    # first, second - the two matrices; stored as given.
    def initialize(first, second)
      @first = first
      @second = second
    end

    include PairTransformations, PairMetrics
  end
end
@@ -0,0 +1,13 @@
module PWMCompare
  # A pair of matrices brought to a common length.
  class PWMCompareAligned
    attr_reader :first, :second, :length

    # Stores copies of both matrices, each extended via right_augment by the
    # number of columns it lacks to reach the length of the longer one.
    def initialize(first, second)
      @length = [first.length, second.length].max
      @first = first.right_augment(@length - first.length)
      @second = second.right_augment(@length - second.length)
    end

    include AlignedPairTransformations, AlignedPairMetrics, AlignedPairIntersection
  end
end
@@ -0,0 +1,45 @@
module PWM
  # A score matrix stored as an Array of columns (4 scores each) with an
  # optional name.
  class SingleMatrix
    attr_reader :matrix
    attr_accessor :name

    def initialize(matrix)
      @matrix = matrix
    end

    include MatrixTransformations, MatrixInformation

    # Builds a matrix from text lines.
    #
    # lines - Array of strings. Blank lines are ignored. If the first
    #         remaining line is not purely numeric it is taken as the name
    #         (an optional leading "> " is dropped) and overrides +name+.
    #         The numeric lines hold either 4 numbers per line (one line per
    #         column) or 4 lines which are then transposed.
    # name  - name used when the lines carry no header.
    #
    # Raises RuntimeError when no matrix with 4 scores per column can be built,
    # including the case of empty or header-only input.
    def self.build_matrix(lines, name = nil)
      error_message = "PWM::SingleMatrix.build_matrix can't create matrix using this input"
      rows = lines.reject { |line| line.strip.empty? }
      raise error_message if rows.empty?

      pwm_name = name
      begin
        rows.first.split.each { |x| Float(x) }
      rescue ArgumentError
        # first line is not numeric, so it is the header
        pwm_name = rows.shift.chomp.match(/(?:>\s)?(.*)$/)[1]
        raise error_message if rows.empty?
      end

      values = rows.map { |str| str.split.map(&:to_f) }
      values = values.transpose unless rows.first.split.length == 4
      pwm = SingleMatrix.new(values)
      raise error_message unless pwm.matrix.all? { |l| l.length == 4 }
      pwm.name = pwm_name
      pwm
    end

    # Builds a matrix from all lines of input_stream.
    def self.load_from_stdin(input_stream, name = nil)
      build_matrix(input_stream.readlines, name)
    end

    # Builds a matrix from an Array of lines.
    def self.load_from_line_array(lines, name = nil)
      build_matrix(lines, name)
    end

    # Builds a matrix from a file; the default name is the file's base name
    # without extension.
    def self.load_pat(filename)
      build_matrix(File.open(filename, 'r') { |f| f.readlines }, File.basename_wo_ext(filename))
    end

    # Returns a MatrixOnBackground copy of this matrix (made with depth_dup)
    # whose probabilities are set to background.
    def with_background(background)
      type_cast(MatrixOnBackground) { @probabilities = background }.depth_dup
    end
  end
end
@@ -0,0 +1,34 @@
module Kernel
  # Creates an instance of new_class without calling its initialize, copies
  # all instance variables of the receiver into it (the values are shared,
  # not duplicated) and then, if a block is given, evaluates the block in the
  # context of the new object. Returns the new object.
  def type_cast(new_class, &block)
    new_obj = new_class.allocate
    instance_variables.each do |varname|
      new_obj.instance_variable_set(varname, instance_variable_get(varname))
    end
    new_obj.instance_eval(&block) if block_given?
    new_obj
  end

  # Recursive duplicate: duplicates the receiver, the elements of Arrays, the
  # values of Hashes and the values of all instance variables.
  # Objects that cannot be duplicated are returned as they are.
  # Cyclic structures are not supported.
  def depth_dup
    begin
      new_obj = dup
    rescue StandardError
      return self
    end

    # Collections keep their content in elements, not in instance variables;
    # without this step nested arrays (e.g. matrix rows) would stay shared
    # between the original and the copy.
    case new_obj
    when Array
      new_obj.map! { |element| element.depth_dup }
    when Hash
      new_obj.each_key { |key| new_obj[key] = new_obj[key].depth_dup }
    end

    new_obj.instance_variables.each do |varname|
      begin
        new_obj.instance_variable_set(varname, new_obj.instance_variable_get(varname).depth_dup)
      rescue StandardError
        # best effort: a variable that cannot be copied keeps the shared value
      end
    end
    new_obj
  end
end
27
+
# Returns filename (directory part included) with the extension reported by
# File.extname cut off; a name without extension is returned unchanged.
def File.filename_wo_ext(filename)
  filename[0..-(1 + File.extname(filename).length)]
end
31
+
# Returns the last path component of filename with the extension reported by
# File.extname cut off; a name without extension is returned unchanged.
def File.basename_wo_ext(filename)
  File.basename(filename)[0..-(1 + File.extname(filename).length)]
end