macroape 3.2.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (50) hide show
  1. data/.gitignore +18 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE +22 -0
  4. data/README.md +61 -0
  5. data/Rakefile +7 -0
  6. data/bin/eval_alignment +3 -0
  7. data/bin/eval_similarity +3 -0
  8. data/bin/find_pvalue +3 -0
  9. data/bin/find_threshold +3 -0
  10. data/bin/preprocess_collection +3 -0
  11. data/bin/scan_collection +3 -0
  12. data/lib/macroape/aligned_pair_intersection.rb +136 -0
  13. data/lib/macroape/aligned_pair_metrics.rb +24 -0
  14. data/lib/macroape/aligned_pair_transformations.rb +23 -0
  15. data/lib/macroape/collection.rb +15 -0
  16. data/lib/macroape/count_by_threshold.rb +34 -0
  17. data/lib/macroape/exec/eval_alignment.rb +141 -0
  18. data/lib/macroape/exec/eval_similarity.rb +107 -0
  19. data/lib/macroape/exec/find_pvalue.rb +80 -0
  20. data/lib/macroape/exec/find_threshold.rb +76 -0
  21. data/lib/macroape/exec/preprocess_collection.rb +94 -0
  22. data/lib/macroape/exec/scan_collection.rb +124 -0
  23. data/lib/macroape/extract_pwm.rb +32 -0
  24. data/lib/macroape/gauss_estimation.rb +30 -0
  25. data/lib/macroape/matrix_information.rb +29 -0
  26. data/lib/macroape/matrix_on_background.rb +16 -0
  27. data/lib/macroape/matrix_transformations.rb +29 -0
  28. data/lib/macroape/pair_metrics.rb +9 -0
  29. data/lib/macroape/pair_transformations.rb +28 -0
  30. data/lib/macroape/pwm_compare.rb +10 -0
  31. data/lib/macroape/pwm_compare_aligned.rb +13 -0
  32. data/lib/macroape/single_matrix.rb +45 -0
  33. data/lib/macroape/support.rb +34 -0
  34. data/lib/macroape/threshold_by_pvalue.rb +68 -0
  35. data/lib/macroape/version.rb +3 -0
  36. data/lib/macroape.rb +26 -0
  37. data/macroape.gemspec +17 -0
  38. data/test/data/AHR_si.pat +10 -0
  39. data/test/data/KLF4_f2.pat +11 -0
  40. data/test/data/KLF4_f2_scan_results_all.txt +4 -0
  41. data/test/data/KLF4_f2_scan_results_default_cutoff.txt +3 -0
  42. data/test/data/KLF4_f2_scan_results_precise_mode.txt +4 -0
  43. data/test/data/SP1_f1.pat +12 -0
  44. data/test/data/SP1_f1_revcomp.pat +12 -0
  45. data/test/data/test_collection/GABPA_f1.pat +14 -0
  46. data/test/data/test_collection/KLF4_f2.pat +11 -0
  47. data/test/data/test_collection/SP1_f1.pat +12 -0
  48. data/test/data/test_collection.yaml +186 -0
  49. data/test/macroape_test.rb +125 -0
  50. metadata +116 -0
@@ -0,0 +1,80 @@
1
# Command-line tool. Loads a PWM from a pat-file (or STDIN), applies the
# background and discretization, and prints one tab-separated line per
# requested threshold: the threshold, the count returned by
# counts_by_thresholds, and that count divided by pwm.number_of_words.
help_string = %q{
Command-line format:
ruby find_pvalue.rb <pat-file> <threshold list> [options]
or in linux
cat <pat-file> | ruby find_pvalue.rb .stdin <threshold> [options]
or on windows
type <pat-file> | ruby find_pvalue.rb .stdin <threshold> [options]

Options:
[-d <discretization level>]
[-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]

Output format:
threshold_1 count_1 pvalue_1
threshold_2 count_2 pvalue_2
threshold_3 count_3 pvalue_3
The results are printed out in the same order as in the given threshold list.

Examples:
ruby find_pvalue.rb motifs/KLF4.pat 7.32 -d 1000 -b 0.2 0.3 0.2 0.3
or on windows
type motifs/KLF4.pat | ruby find_pvalue.rb .stdin 7.32 4.31 5.42
or in linux
cat motifs/KLF4.pat | ruby find_pvalue.rb .stdin 7.32 4.31 5.42
}

require 'macroape'

help_flags = ['-h', '-help', '--help', '--h']
if ARGV.empty? || help_flags.any? { |flag| ARGV.include?(flag) }
  STDERR.puts help_string
  exit
end

discretization = 10000
background = [1, 1, 1, 1]
thresholds = []

# True when Float() accepts the argument; nil (no more arguments) and
# non-numeric strings such as option flags are rejected.
numeric_argument = lambda do |argument|
  begin
    Float(argument)
    true
  rescue
    false
  end
end

begin
  filename = ARGV.shift

  # Consume the leading run of numeric arguments as the threshold list.
  thresholds << ARGV.shift.to_f while numeric_argument.call(ARGV.first)

  raise "No input. You'd specify input source: filename or .stdin" unless filename
  raise 'You should specify at least one threshold' if thresholds.empty?

  # Remaining arguments are options; unrecognized tokens are skipped.
  until ARGV.empty?
    option = ARGV.shift
    if option == '-b'
      background = ARGV.shift(4).map(&:to_f)
    elsif option == '-d'
      discretization = ARGV.shift.to_f
    elsif option == '-m'
      PWM::MaxHashSize = ARGV.shift.to_f
    end
  end
  PWM::MaxHashSize = 1000000 unless defined? PWM::MaxHashSize

  if filename == '.stdin'
    pwm = PWM::SingleMatrix.load_from_stdin(STDIN)
  else
    raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
    pwm = PWM::SingleMatrix.load_pat(filename)
  end
  pwm = pwm.with_background(background)

  # Thresholds are scaled by the same factor as the discretized matrix.
  scaled_thresholds = thresholds.map { |threshold| threshold * discretization }
  counts = pwm.discrete(discretization).counts_by_thresholds(*scaled_thresholds)
  pvalues = counts.map { |count| count.to_f / pwm.number_of_words }
  pvalues.zip(thresholds, counts) do |pvalue, threshold, count|
    puts "#{threshold}\t#{count}\t#{pvalue}"
  end
rescue => err
  STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
end
@@ -0,0 +1,76 @@
1
# Command-line tool. Loads a PWM from a pat-file (or STDIN), applies the
# background and discretization, and for every requested P-value prints a
# tab-separated line: requested P-value, threshold (scaled back by the
# discretization level) and the P-value reported alongside that threshold.
help_string = %q{
Command-line format:
ruby find_threshold.rb <pat-file> [options]
or in linux
cat <pat-file> | ruby find_threshold.rb .stdin [options]
or on windows
type <pat-file> | ruby find_threshold.rb .stdin [options]

Options:
[-p <list of P-values>]
[-d <discretization level>]
[-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]

Output format:
requested_pvalue_1 threshold_1 achieved_pvalue_1
requested_pvalue_2 threshold_2 achieved_pvalue_2


Example:
ruby find_threshold.rb motifs/KLF4.pat -p 0.001 0.0001 0.0005 -d 1000 -b 0.4 0.3 0.2 0.1
}

require 'macroape'

if ARGV.empty? or ARGV.include? '-h' or ARGV.include? '-help' or ARGV.include? '--help' or ARGV.include? '--h'
  STDERR.puts help_string
  exit
end

background = [1, 1, 1, 1]
default_pvalues = [0.0005]
discretization = 10000

# True when Float() accepts the argument; nil (no more arguments) and
# non-numeric strings such as option flags are rejected.
numeric_argument = lambda do |argument|
  begin
    Float(argument)
    true
  rescue ArgumentError, TypeError
    false
  end
end

begin
  filename = ARGV.shift
  raise "No input. You'd specify input source: filename or .stdin" unless filename

  pvalues = []
  # Unrecognized tokens are skipped.
  until ARGV.empty?
    case ARGV.shift
    when '-b'
      background = ARGV.shift(4).map(&:to_f)
    when '-m'
      PWM::MaxHashSize = ARGV.shift.to_f
    when '-p'
      # Consume the run of numeric arguments following -p.
      pvalues << ARGV.shift.to_f while numeric_argument.call(ARGV.first)
    when '-d'
      discretization = ARGV.shift.to_f
    end
  end
  pvalues = default_pvalues if pvalues.empty?

  PWM::MaxHashSize = 1000000 unless defined? PWM::MaxHashSize

  if filename == '.stdin'
    pwm = PWM::SingleMatrix.load_from_stdin(STDIN)
  else
    raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
    pwm = PWM::SingleMatrix.load_pat(filename)
  end

  pwm = pwm.with_background(background)

  pwm.discrete(discretization).thresholds(*pvalues) do |pvalue, threshold, real_pvalue|
    # The default discretization is an Integer; coerce to Float so the
    # threshold is never truncated by integer division.
    puts "#{pvalue}\t#{threshold / discretization.to_f}\t#{real_pvalue}"
  end
rescue => err
  STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
end
@@ -0,0 +1,94 @@
1
# Command-line tool. For every file in the given folder, loads it as a PWM,
# runs find_threshold.rb twice (rough and precise discretization) to obtain
# thresholds for the requested P-values, and stores the resulting
# PWM::Collection as YAML in the output file.
help_string = %q{
Command-line format:
ruby preprocess_collection.rb <folder with PWMs> [options]

Options:
[-p <list of P-values>]
[-d <rough discretization> <precise discretization>]
[-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]
[-o <output file>]
[--silent] - don't show current progress information during scan (by default this information's written into stderr)

The tool stores preprocessed PWM collection to the specified YAML-file.

Example:
ruby preprocess_collection.rb ./motifs -p 0.001 0.0005 0.0001 -d 1 10 -b 0.2 0.3 0.2 0.3 -o collection.yaml
}

require 'macroape'

if ARGV.empty? or ARGV.include? '-h' or ARGV.include? '-help' or ARGV.include? '--help' or ARGV.include? '--h'
  STDERR.puts help_string
  exit
end

default_pvalues = [0.0005]
background = [1, 1, 1, 1]
rough_discretization = 1
precise_discretization = 10
output_file = 'collection.yaml'

# True when Float() accepts the argument; nil (no more arguments) and
# non-numeric strings such as option flags are rejected.
numeric_argument = lambda do |argument|
  begin
    Float(argument)
    true
  rescue ArgumentError, TypeError
    false
  end
end

begin
  folder = ARGV.shift
  raise "No input. You'd specify folder with pat-files" unless folder
  raise "Error! Folder #{folder} doesn't exist" unless Dir.exist?(folder)

  pvalues = []
  silent = false
  # Unrecognized tokens are skipped.
  until ARGV.empty?
    case ARGV.shift
    when '-b'
      background = ARGV.shift(4).map(&:to_f)
      raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless background == background.reverse
    when '-p'
      # Consume the run of numeric arguments following -p.
      pvalues << ARGV.shift.to_f while numeric_argument.call(ARGV.first)
    when '-d'
      # Sorting makes the smaller level the rough one, whatever the input order.
      rough_discretization, precise_discretization = ARGV.shift(2).map(&:to_f).sort
    when '-o'
      output_file = ARGV.shift
    when '-m'
      PWM::MaxHashSize = ARGV.shift.to_f
    when '-md'
      PWMCompare::MaxHashSize = ARGV.shift.to_f
    when '--silent'
      silent = true
    end
  end
  pvalues = default_pvalues if pvalues.empty?

  PWM::MaxHashSize = 1000000 unless defined? PWM::MaxHashSize
  PWMCompare::MaxHashSize = 1000 unless defined? PWMCompare::MaxHashSize

  collection = PWM::Collection.new(rough_discretization, precise_discretization, background, pvalues)

  find_threshold_script = File.join(File.dirname(__FILE__), 'find_threshold.rb')

  # Runs find_threshold.rb on one pat-file and returns {requested pvalue => threshold}.
  # The command is passed as an argv array, so no shell is involved and file
  # names containing spaces or shell metacharacters are handled safely.
  thresholds_for = lambda do |pat_file, discretization|
    command = ['ruby', find_threshold_script, pat_file,
               '-p', *pvalues.map(&:to_s),
               '-b', *background.map(&:to_s),
               '-d', discretization.to_s]
    output = IO.popen(command, &:read)
    output.split("\n").each_with_object({}) do |line, thresholds|
      pvalue, threshold, _real_pvalue = line.split.map(&:to_f)
      thresholds[pvalue] = threshold
    end
  end

  Dir.glob(File.join(folder, '*')) do |filename|
    # Sub-directories and other non-regular entries cannot be pat-files.
    next unless File.file?(filename)
    STDERR.puts filename unless silent
    pwm = PWM::SingleMatrix.load_pat(filename)
    info = {
      rough: thresholds_for.call(filename, rough_discretization),
      precise: thresholds_for.call(filename, precise_discretization)
    }
    collection.add_pwm(pwm, info)
  end

  File.open(output_file, 'w') do |f|
    f.puts(collection.to_yaml)
  end
rescue => err
  STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
end
@@ -0,0 +1,124 @@
1
# Command-line tool. Compares a query PWM against every PWM of a preprocessed
# collection (YAML) and prints, sorted by decreasing similarity, the best
# alignment found for each collection matrix whose similarity reaches the cutoff.
help_string = %q{
Command-line format:
ruby scan_collection.rb <pat-file> <collection> [options]
or in linux
cat <pat-file> | ruby scan_collection.rb .stdin <collection> [options]
or on windows
type <pat-file> | ruby scan_collection.rb .stdin <collection> [options]

Options:
[-p <P-value>]
[-c <similarity cutoff (minimal similarity to be included in output)> ] or [--all], '-c 0.05' by default
[--precise [<level, minimal similarity to check on a more precise discretization level on the second pass>]], off by default, '--precise 0.05' if level is not set
[--silent] - don't show current progress information during scan (by default this information's written into stderr)

Output format:
<name> <similarity jaccard index> <shift> <overlap> <orientation> * [in case that result calculated on the second pass(in precise mode)]
Attention! Name can contain whitespace characters.
Attention! The shift and orientation are reported for the collection matrix relative to the query matrix.

Example:
ruby scan_collection.rb motifs/KLF4.pat collection.yaml -p 0.005
or in linux
cat motifs/KLF4.pat | ruby scan_collection.rb .stdin collection.yaml -p 0.005 --precise 0.03
}

require 'macroape'

if ARGV.empty? or ARGV.include? '-h' or ARGV.include? '-help' or ARGV.include? '--help' or ARGV.include? '--h'
  STDERR.puts help_string
  exit
end

# Second-pass level used when --precise is given without an explicit value.
default_minimal_similarity = 0.05

begin
  filename = ARGV.shift
  collection_file = ARGV.shift
  raise "No input. You'd specify input source for pat: filename or .stdin" unless filename
  raise "No input. You'd specify input file with collection" unless collection_file
  raise "Collection file #{collection_file} doesn't exist" unless File.exist?(collection_file)

  pvalue = 0.0005
  cutoff = 0.05 # minimal similarity to output
  # NOTE(review): the collection file is deserialized into arbitrary objects;
  # only load collection files from trusted sources.
  collection = YAML.load_file(collection_file)
  background_query = collection.background

  silent = false
  precision_mode = :rough
  minimal_similarity = default_minimal_similarity
  # Unrecognized tokens are skipped.
  until ARGV.empty?
    case ARGV.shift
    when '-bq'
      background_query = ARGV.shift(4).map(&:to_f)
      raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless background_query == background_query.reverse
    when '-p'
      pvalue = ARGV.shift.to_f
    when '-m'
      PWM::MaxHashSize = ARGV.shift.to_f
    when '-md'
      PWMCompare::MaxHashSize = ARGV.shift.to_f
    when '-c'
      cutoff = ARGV.shift.to_f
    when '--all'
      cutoff = 0.0
    when '--silent'
      silent = true
    when '--precise'
      precision_mode = :precise
      # The level is optional: take the next argument only if it is numeric.
      begin
        Float(ARGV.first)
        minimal_similarity = ARGV.shift.to_f
      rescue ArgumentError, TypeError
        minimal_similarity = default_minimal_similarity
      end
    end
  end
  PWM::MaxHashSize = 1000000 unless defined? PWM::MaxHashSize
  PWMCompare::MaxHashSize = 1000 unless defined? PWMCompare::MaxHashSize

  raise "Thresholds for pvalue #{pvalue} aren't presented in collection (#{collection.pvalues.join(', ')}). Use one of listed pvalues or recalculate the collection with needed pvalue" unless collection.pvalues.include? pvalue

  if filename == '.stdin'
    query_pwm = PWM::SingleMatrix.load_from_stdin(STDIN)
  else
    raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
    query_pwm = PWM::SingleMatrix.load_pat(filename)
  end

  query_pwm_rough = query_pwm.with_background(background_query).discrete(collection.rough_discretization)
  query_pwm_precise = query_pwm.with_background(background_query).discrete(collection.precise_discretization)

  threshold = query_pwm_rough.threshold(pvalue)
  threshold_precise = query_pwm_precise.threshold(pvalue)

  similarities = {}
  precision_file_mode = {}
  unnamed_index = 0

  collection.pwms.each_key do |key|
    pwm = collection.pwms[key]
    pwm_info = collection.infos[key]
    STDERR.puts pwm.name unless silent

    # First pass: rough discretization. Stored thresholds are multiplied by
    # the discretization level before being passed to the comparison.
    cmp = PWMCompare::PWMCompare.new(query_pwm_rough, pwm.with_background(collection.background).discrete(collection.rough_discretization))
    info = cmp.jaccard(threshold, pwm_info[:rough][pvalue] * collection.rough_discretization)
    display_name = pwm.name || "Unnamed #{unnamed_index += 1}"
    precision_file_mode[display_name] = :rough

    # Second pass: recompute on the precise discretization when the rough
    # similarity is high enough.
    if precision_mode == :precise && info[:similarity] >= minimal_similarity
      cmp = PWMCompare::PWMCompare.new(query_pwm_precise, pwm.with_background(collection.background).discrete(collection.precise_discretization))
      info = cmp.jaccard(threshold_precise, pwm_info[:precise][pvalue] * collection.precise_discretization)
      precision_file_mode[display_name] = :precise
    end
    similarities[display_name] = info
  end

  puts "#pwm\tsimilarity\tshift\toverlap\torientation"
  similarities.sort_by do |_name, info|
    info[:similarity]
  end.reverse.each do |name, info|
    precision_text = (precision_file_mode[name] == :precise) ? "\t*" : ""
    puts "#{name}\t#{info[:similarity]}\t#{info[:shift]}\t#{info[:overlap]}\t#{info[:orientation]}#{precision_text}" if info[:similarity] >= cutoff
  end

rescue => err
  STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
end
@@ -0,0 +1,32 @@
1
# r_stream, w_stream - supposed to be the two ends of a pipe. All data is read
# from r_stream, the first PWM is extracted, and the remaining lines are written
# into a freshly created pipe.
#
# Returns [read_end, write_end, extracted_pwm]:
# * when a PWM was found: the ends of the new pipe (the write end is already
#   closed, the read end yields the unconsumed lines; the given r_stream has
#   been closed) and extracted_pwm as an array of lines - the first line
#   followed by the matrix rows;
# * when the input is empty or contains only blank lines: the given streams
#   and nil.
#
# A matrix row is a line of at least 4 whitespace-separated tokens that are all
# accepted by Float(); the first line that is not a row ends the PWM and is left
# in the remaining data.
def extract_pwm(r_stream, w_stream)
  lines = r_stream.readlines
  return [r_stream, w_stream, nil] if lines.empty?

  # Skip leading blank lines. The nil check must come before strip, otherwise
  # blank-only input raises NoMethodError instead of reporting "no PWM".
  header = lines.shift
  while header.chomp == ''
    header = lines.shift
    return [r_stream, w_stream, nil] unless header
    header = header.strip
  end
  extracted_pwm = [header]

  r_stream.close

  matrix_row = lambda do |line|
    tokens = line.split
    tokens.size >= 4 && tokens.all? do |token|
      begin
        Float(token)
        true
      rescue ArgumentError
        false
      end
    end
  end
  extracted_pwm << lines.shift while !lines.empty? && matrix_row.call(lines.first)

  new_r_stream, new_w_stream = IO.pipe
  lines.each { |one_line| new_w_stream.write(one_line) }
  new_w_stream.close

  [new_r_stream, new_w_stream, extracted_pwm]
end
@@ -0,0 +1,30 @@
1
module PWM
  # Estimates score statistics and thresholds by treating the score as
  # normally distributed. The including object is expected to provide
  # +matrix+ (rows indexable by 0..3), +probabilities+ and
  # +sum_of_probabilities+.
  module GaussEstimation
    # Sum over matrix rows of the background-weighted mean of the row.
    def score_mean
      weights = probabilities.map { |probability| probability.to_f / sum_of_probabilities }
      matrix.inject(0.0) do |total, column|
        column_mean = (0...4).inject(0.0) { |acc, letter| acc + column[letter] * weights[letter] }
        total + column_mean
      end
    end

    # Sum over matrix rows of (weighted mean of squares - squared weighted mean).
    def score_variance
      weights = probabilities.map { |probability| probability.to_f / sum_of_probabilities }
      matrix.inject(0.0) do |total, column|
        mean_of_squares = (0...4).inject(0.0) { |acc, letter| acc + column[letter]**2 * weights[letter] }
        mean = (0...4).inject(0.0) { |acc, letter| acc + column[letter] * weights[letter] }
        total + mean_of_squares - mean**2
      end
    end

    # Threshold estimate for the given pvalue: score_mean plus a multiple of
    # the standard deviation, the multiple being sqrt(2) * inverf2(1 - 2 * pvalue).
    def threshold_gauss_estimation(pvalue)
      deviations = inverf2(1 - 2 * pvalue) * Math.sqrt(2)
      score_mean + deviations * Math.sqrt(score_variance)
    end

    # Closed-form approximation of the inverse error function; odd in x.
    def inverf2(x)
      sign = x < 0 ? -1 : 1
      x = x.abs
      a = 8 / (3 * Math::PI) * (Math::PI - 3) / (4 - Math::PI)
      log_term = Math.log(1 - x * x)
      squared_part = (2 / (Math::PI * a) + log_term / 2)**2
      radicand = -2 / (Math::PI * a) - log_term / 2 + Math.sqrt(-1 / a * log_term + squared_part)
      sign * Math.sqrt(radicand)
    end
  end

end
@@ -0,0 +1,29 @@
1
module PWM
  # Memoized summary values derived from +matrix+ (an array of rows provided
  # by the including object). Call refresh_infos after the matrix changes.
  module MatrixInformation
    # Number of rows in the matrix.
    def length
      @length ||= matrix.length
    end

    # Sum of the per-row maxima.
    def best_score
      @best_score ||= matrix.map(&:max).inject(0) { |total, value| total + value }
    end

    # Sum of the per-row minima.
    def worst_score
      @worst_score ||= matrix.map(&:min).inject(0) { |total, value| total + value }
    end

    # Array of length + 1 entries: entry i is the sum of row maxima for rows
    # i..length-1; the last entry is 0.
    def best_suffix
      return @best_suffix if @best_suffix
      suffix = Array.new(length + 1, 0)
      (length - 1).downto(0) { |pos| suffix[pos] = matrix[pos].max + suffix[pos + 1] }
      @best_suffix = suffix
    end

    # Same as best_suffix, but built from row minima.
    def worst_suffix
      return @worst_suffix if @worst_suffix
      suffix = Array.new(length + 1, 0)
      (length - 1).downto(0) { |pos| suffix[pos] = matrix[pos].min + suffix[pos + 1] }
      @worst_suffix = suffix
    end

    # Drops every memoized value; returns self so calls can be chained.
    def refresh_infos
      @length = nil
      @best_score = nil
      @worst_score = nil
      @best_suffix = nil
      @worst_suffix = nil
      self
    end
  end
end
@@ -0,0 +1,16 @@
1
module PWM
  # A SingleMatrix that also carries background values for the four letters.
  class MatrixOnBackground < SingleMatrix
    attr_reader :probabilities

    # matrix is handed to SingleMatrix#initialize; background is stored as-is.
    def initialize(matrix, background)
      super(matrix)
      @probabilities = background
    end

    # Memoized sum of the background values (always a Float).
    def sum_of_probabilities
      @sum_of_probabilities ||= probabilities.inject(0.0) { |total, probability| total + probability }
    end

    # sum_of_probabilities raised to the power of the matrix length.
    def number_of_words
      sum_of_probabilities**length
    end

    include GaussEstimation, ThresholdByPvalue, CountByThreshold
  end
end
@@ -0,0 +1,29 @@
1
module PWM
  # Non-destructive matrix transformations. Every method returns a duplicate of
  # the receiver with a new matrix and with memoized infos reset; the receiver
  # itself is left untouched. The including object must provide +matrix+,
  # +length+ and +refresh_infos+.
  module MatrixTransformations
    # Rows in reverse order, each row reversed.
    def reverse_complement
      clone_and_transform(matrix.reverse.map(&:reverse)).refresh_infos
    end

    # Prepends n all-zero rows. Each padding row is a distinct Array, so an
    # in-place change of one row can never leak into the others.
    def left_augment(n)
      clone_and_transform(Array.new(n) { [0.0] * 4 } + matrix).refresh_infos
    end

    # Appends n all-zero rows (distinct Array objects, see left_augment).
    def right_augment(n)
      clone_and_transform(matrix + Array.new(n) { [0.0] * 4 }).refresh_infos
    end

    # Makes the worst score equal to 0 by subtracting each row's minimum from
    # every element of that row.
    def shift_to_zero
      shifted = matrix.map do |col|
        col_min = col.min
        col.map { |letter| letter - col_min }
      end
      clone_and_transform(shifted).refresh_infos
    end

    # Multiplies every element by rate and rounds it up to an integer.
    def discrete(rate)
      clone_and_transform(matrix.map { |col| col.map { |letter| (letter * rate).ceil } }).refresh_infos
    end

    # Returns two objects: the first length_of_first_part rows and the rest.
    def split(length_of_first_part)
      [
        clone_and_transform(matrix.first(length_of_first_part)).refresh_infos,
        clone_and_transform(matrix.last(length - length_of_first_part)).refresh_infos
      ]
    end

    # New matrix whose i-th row is matrix[permutation_index[i]].
    def permute_columns(permutation_index)
      clone_and_transform(permutation_index.map { |col| matrix[col] }).refresh_infos
    end

    # Shallow duplicate of the receiver with @matrix replaced by new_matrix.
    def clone_and_transform(new_matrix)
      copy = dup
      copy.instance_variable_set(:@matrix, new_matrix)
      copy
    end
  end
end
@@ -0,0 +1,9 @@
1
module PWMCompare
  # Metrics computed over all alignments of a pair. The including object must
  # provide +map_each_align+, yielding an aligned pair and its alignment info.
  module PairMetrics
    # Evaluates jaccard for every alignment, merges the alignment info into
    # each result and returns the result with the highest :similarity.
    def jaccard(threshold_first, threshold_second)
      results = map_each_align do |aligned_pair, alignment_info|
        similarity_info = aligned_pair.jaccard(threshold_first, threshold_second)
        similarity_info.merge(alignment_info)
      end
      results.max_by { |result| result[:similarity] }
    end
  end
end
@@ -0,0 +1,28 @@
1
module PWMCompare
  # Enumerates all relative placements of +second+ against +first+ (both must
  # be provided by the including object).
  module PairTransformations

    # For every shift from -second.length to first.length and for both
    # orientations (:direct, :revcomp) yields two values: a PWMCompareAligned
    # built from the left-padded matrices, and a hash describing the alignment
    # (:text, :shift, :orientation, :overlap, :alignment_length).
    def each
      second_rc = second.reverse_complement
      shifts = (-second.length..first.length).to_a
      shifts.product([:direct, :revcomp]) do |shift, orientation|
        # A negative shift pads the first matrix, a positive one the second.
        first_padding = [-shift, 0].max
        second_padding = [shift, 0].max
        direct = (orientation == :direct)

        first_row = '.' * first_padding + '>' * first.length
        second_row = '.' * second_padding + (direct ? '>' : '<') * second.length
        overlap = [first.length + first_padding, second.length + second_padding].min - shift.abs
        alignment_length = [first_row.length, second_row.length].max
        # Right-pad the shorter picture with dots so both rows have equal width.
        first_row = first_row.ljust(alignment_length, '.')
        second_row = second_row.ljust(alignment_length, '.')

        oriented_second = direct ? second : second_rc
        aligned = PWMCompareAligned.new(first.left_augment(first_padding),
                                        oriented_second.left_augment(second_padding))
        yield(aligned,
              text: "#{first_row}\n#{second_row}",
              shift: shift,
              orientation: orientation,
              overlap: overlap,
              alignment_length: alignment_length
        )
      end
    end
    include Enumerable
    alias :each_align :each
    alias :map_each_align :map
  end
end
@@ -0,0 +1,10 @@
1
module PWMCompare
  # A pair of matrices to be compared; the comparison logic itself comes from
  # the PairTransformations and PairMetrics mixins.
  class PWMCompare
    attr_reader :first, :second

    # Stores both arguments unchanged.
    def initialize(first, second)
      @first, @second = first, second
    end

    include PairTransformations, PairMetrics
  end
end
@@ -0,0 +1,13 @@
1
module PWMCompare
  # A pair of matrices brought to a common length; behaviour comes from the
  # AlignedPair* mixins.
  class PWMCompareAligned
    attr_reader :first, :second, :length

    # Right-pads the shorter argument (via right_augment) so that both stored
    # matrices have the length of the longer one.
    def initialize(first, second)
      common_length = [first.length, second.length].max
      @length = common_length
      @first = first.right_augment(common_length - first.length)
      @second = second.right_augment(common_length - second.length)
    end

    include AlignedPairTransformations, AlignedPairMetrics, AlignedPairIntersection

  end
end
@@ -0,0 +1,45 @@
1
module PWM
  # A position weight matrix stored as an array of 4-element rows, with an
  # optional name.
  class SingleMatrix
    attr_reader :matrix
    attr_accessor :name

    def initialize(matrix)
      @matrix = matrix
    end
    include MatrixTransformations, MatrixInformation

    # Builds a matrix from text lines.
    #
    # * Blank lines are ignored.
    # * If the first line does not consist solely of numbers it is taken as
    #   the name (an optional leading "> " is dropped) and overrides +name+.
    # * If the first data line has exactly 4 numbers the lines are used as
    #   rows; otherwise the parsed table is transposed.
    #   NOTE(review): a 4x4 input is therefore never transposed.
    #
    # Raises RuntimeError when there are no data lines or when the resulting
    # rows are not all 4 elements long; transposing lines of unequal length
    # raises IndexError.
    def self.build_matrix(lines, name = nil)
      rows = lines.reject { |line| line.strip.empty? }
      raise "PWM::SingleMatrix.build_matrix can't create matrix using this input" if rows.empty?

      pwm_name = name
      begin
        rows.first.split.each { |x| Float(x) }
      rescue ArgumentError
        # The first line is not numeric, so it is a header carrying the name.
        pwm_name = rows.shift.chomp.match(/(?:>\s)?(.*)$/)[1]
      end
      raise "PWM::SingleMatrix.build_matrix can't create matrix using this input" if rows.empty?

      values = rows.map { |str| str.split.map(&:to_f) }
      values = values.transpose unless rows.first.split.length == 4
      pwm = SingleMatrix.new(values)
      raise "PWM::SingleMatrix.build_matrix can't create matrix using this input" unless pwm.matrix.all? { |l| l.length == 4 }
      pwm.name = pwm_name
      pwm
    end

    # Reads all lines from input_stream and builds a matrix from them.
    def self.load_from_stdin(input_stream, name = nil)
      build_matrix(input_stream.readlines, name)
    end

    # Builds a matrix from an array of lines.
    def self.load_from_line_array(lines, name = nil)
      build_matrix(lines, name)
    end

    # Reads the file; the default name is the file's basename without extension.
    def self.load_pat(filename)
      build_matrix(File.readlines(filename), File.basename_wo_ext(filename))
    end

    # Returns a MatrixOnBackground carrying this object's instance variables
    # plus the given background, passed through depth_dup.
    def with_background(background)
      type_cast(MatrixOnBackground) { @probabilities = background }.depth_dup
    end
  end
end
@@ -0,0 +1,34 @@
1
module Kernel
  # Allocates an instance of new_class without calling its initialize, copies
  # every instance variable of the receiver into it (same object references),
  # optionally evaluates the block in the new object's context, and returns
  # the new object.
  def type_cast(new_class, &block)
    casted = new_class.allocate
    instance_variables.each do |ivar|
      casted.instance_variable_set(ivar, instance_variable_get(ivar))
    end
    casted.instance_eval(&block) if block
    casted
  end

  # Duplicates the receiver and, recursively, the values of its instance
  # variables. Returns the receiver itself when it cannot be dup'ed.
  # Only instance variables are followed: elements of an Array or values of a
  # Hash are not copied individually.
  def depth_dup
    copy = dup
  rescue
    self
  else
    copy.instance_variables.each do |ivar|
      begin
        nested_copy = copy.instance_variable_get(ivar).depth_dup
        copy.instance_variable_set(ivar, nested_copy)
      rescue
        # Best effort: a variable that cannot be copied keeps its shared value.
      end
    end
    copy
  end

end
27
+
28
# Returns filename (directory part included) with the extension reported by
# File.extname cut off the end.
def File.filename_wo_ext(filename)
  extension_length = File.extname(filename).length
  filename[0, filename.length - extension_length]
end
31
+
32
# Returns the last path component of filename with the extension reported by
# File.extname cut off the end.
def File.basename_wo_ext(filename)
  base = File.basename(filename)
  base[0, base.length - File.extname(filename).length]
end