sequence_logo 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,71 @@
1
+ #!/usr/bin/ruby
2
+
3
+ module Ytilib
4
+ require "rexml/document"
5
+ include REXML
6
+
7
+ class Bismark < Document
8
+
9
# Builds a small-BiSMark document.
#
# source  - nil to start a fresh skeleton document; otherwise anything the
#           parent Document constructor accepts (e.g. XML text), or the
#           path of a file that holds the XML.
# add_dtd - when true and source is nil, a DOCTYPE declaration pointing at
#           smallbismark.dtd follows the XML declaration.
def initialize(source = nil, add_dtd = false)
  if source.nil?
    dtd = add_dtd ? "<!DOCTYPE smallbismark SYSTEM 'smallbismark.dtd'>#{$/}" : ""
    super("<?xml version='1.0' encoding='UTF-8'?>#{$/}#{dtd}")
    add_element("smallbismark")
    # xmlns breaks XPath for a REXML library under Linux, strange, indeed
    # self.add_element("smallbismark", {"xmlns" => "http://bioinform.imb.ac.ru/smallBiSMark/smallbismark.dtd"})
    root.add_element("comment", {"name" => "WARNING"}).add_text("This is a draft version of small-BiSMark. Specification is the subject to change!")
  else
    super(source)
    # No root element after parsing means source was not XML text, so it is
    # treated as a file path. File.read is used instead of IO.read because
    # IO.read runs a subprocess for strings that start with "|".
    super(File.read(source)) if root.nil?
  end
end
20
+
21
# Serialises the document: strips layout-only text nodes (beautify), writes
# it with indentation 1 through the transitive formatter and returns the
# text without trailing whitespace.
def getXML
  beautify
  buffer = ""
  write(buffer, 1, true)
  buffer.rstrip!
  buffer
end
alias get_xml getXML
28
+
29
# Reads the matrix element found at xpath into the object returned by
# PM.new_pm (sized by the element's "length" attribute).
#
# Each <pm-column> child supplies a 1-based "position" attribute and <a>,
# <c>, <g>, <t> children holding the four values. Values of a "PCM" element
# are converted to integers; any other element keeps floats.
def get_pm(xpath)
  pwmnode = elements[xpath]
  pm = PM.new_pm(pwmnode.attribute("length").value.to_i)
  integer_counts = pwmnode.name == "PCM"
  pwmnode.elements.each("pm-column") { |column|
    position = column.attribute("position").value.to_i - 1
    weights = ["a", "c", "g", "t"].collect { |tag| column.elements[tag].get_text.value.strip.to_f }
    # The conversion result has to be kept: a bare `collect` returns a new
    # array and leaves `weights` as floats.
    weights = weights.collect { |w| w.to_i } if integer_counts
    pm['A'][position], pm['C'][position], pm['G'][position], pm['T'][position] = weights
  }
  pm
end
44
+
45
+ private
46
# Names of elements that hold child elements only. Only the keys are
# consulted (via has_key?); the values are placeholders.
CONTAIN_NO_TEXT = {
  "segment" => :vasya_shmyak,
  "group" => :vasya_shmyak,
  "smallbismark" => :vasya_shmyak,
  "motif" => :vasya_shmyak,
  "PWM" => :vasya_shmyak,
  "PCM" => :vasya_shmyak,
  "PPM" => :vasya_shmyak,
  "source" => :vasya_shmyak,
  "factor" => :vasya_shmyak,
  "pm-column" => :vasya_shmyak,
  "word-list" => :vasya_shmyak}

# Recursively removes Text children from the document itself and from every
# node whose name is listed in CONTAIN_NO_TEXT, so that a later write can
# lay the markup out afresh. Text inside other elements is left alone.
def beautify(node = self)
  top = node == self
  if top || (node.respond_to?(:delete_if) && CONTAIN_NO_TEXT.has_key?(node.name))
    node.delete_if { |child| child.is_a?(Text) }
  end
  node.each { |child| beautify(child) } if top || node.respond_to?(:each)
end
68
+
69
+ end
70
+
71
+ end
@@ -0,0 +1,75 @@
1
+ require 'rexml/formatters/pretty'
2
+
3
+ module REXML
4
+ module Formatters
5
+ # The Transitive formatter writes an XML document that parses to an
6
+ # identical document as the source document. This means that no extra
7
+ # whitespace nodes are inserted, and whitespace within text nodes is
8
+ # preserved. Within these constraints, the document is pretty-printed,
9
+ # with whitespace inserted into the metadata to introduce formatting.
10
+ #
11
+ # Note that this is only useful if the original XML is not already
12
+ # formatted. Since this formatter does not alter whitespace nodes, the
13
+ # results of formatting already formatted XML will be odd.
14
class Transitive < Default
  # indentation - number of spaces added per nesting level.
  def initialize( indentation=2 )
    @indentation = indentation
    @level = 0
  end

  protected

  # Writes node as "<name attrs>children</name>" (or "<name attrs/>" when it
  # has no children). The opening tag always starts on a new line indented
  # by the current level; the closing tag gets its own indented line unless
  # every child is a text node.
  def write_element( node, output )
    tag = node.expanded_name
    output << "\n" << ' ' * @level
    output << "<#{tag}"

    unless node.attributes.empty?
      node.attributes.each_attribute do |attr|
        output << " "
        attr.write( output )
      end
    end

    children = node.children
    if children.empty?
      output << "/>"
    else
      output << ">"
      @level += @indentation
      only_text = true
      children.each do |child|
        only_text &&= child.is_a?(REXML::Text)
        write( child, output )
      end
      @level -= @indentation
      output << "\n" << ' ' * @level unless only_text
      output << "</#{tag}>"
    end
  end

  # Text nodes are emitted verbatim so no whitespace is added or lost.
  def write_text( node, output )
    output << node.to_s
  end
end
56
+ end
57
+
58
+ class Document
59
# Writes the document to output.
#
# indent  - -1 selects the Default formatter; any value above -1 selects a
#           pretty-printing formatter with that indentation.
# trans   - with indent > -1, chooses the Transitive formatter instead of
#           Pretty.
# ie_hack - forwarded to the Default and Pretty formatters.
#
# Output is wrapped in an encoding-aware Output unless the declared
# encoding is UTF-8 or it is wrapped already.
def write( output=$stdout, indent=-1, trans=false, ie_hack=false )
  encoding = xml_decl.encoding
  if encoding != "UTF-8" && !output.kind_of?(Output)
    output = Output.new( output, encoding )
  end

  formatter =
    if indent <= -1
      REXML::Formatters::Default.new( ie_hack )
    elsif trans
      REXML::Formatters::Transitive.new( indent )
    else
      REXML::Formatters::Pretty.new( indent, ie_hack )
    end
  formatter.write( self, output )
end
74
+ end
75
+ end
@@ -0,0 +1,108 @@
1
+ #!/usr/bin/ruby
2
+
3
class Float
  # Natural logarithm of self! using the Stieltjes continued-fraction
  # approximation from
  # http://www.luschny.de/math/factorial/approx/SimpleCases.html
  # Returns 0.0 for every value <= 1.
  def log_fact
    return 0.0 if self <= 1

    coefficients = [1.0/12, 1.0/30, 53.0/210, 195.0/371, 22999.0/22737,
                    29944523.0/19733142, 109535241009.0/48264275462]
    z_big = self + 1
    # Evaluate the continued fraction a0/(z + a1/(z + ... a6/z)) from the
    # innermost term outwards.
    fraction = coefficients.reverse.inject(0.0) { |tail, a| a / (z_big + tail) }
    0.5 * Math.log(2 * Math::PI) + (z_big - 0.5) * Math.log(z_big) - z_big + fraction
  end
end
18
+
19
class Integer
  # Natural logarithm of self!, delegated to Float#log_fact.
  def log_fact
    as_float = to_f
    as_float.log_fact
  end
end
24
+
25
+ # Naive version
26
+ =begin
27
+ class Integer
28
+ @@fact_hash = {}
29
+ def log_fact
30
+ return 0.0 if self == 0
31
+ return nil if self < 0
32
+ if self <= 170
33
+ @@fact_hash[self] = Math.log( lambda { |k| return k if self.times { |i| k *= i.next } }.call(1) )
34
+ else
35
+ return self.to_f.log_fact
36
+ end unless @@fact_hash.has_key?(self)
37
+ return @@fact_hash[self]
38
+ end
39
+ end
40
+ =end
41
+
42
+ module Ytilib
43
class PM
  # Value of the per-column formula (see icd_of_counts) for the column at
  # position, or an array with one value per column when position is nil.
  def infocod(position = nil)
    position ? infocod_private(position) : (0...@size).collect { |i| infocod_private(i) }
  end
  alias icd infocod

  # Formula value for a column whose words are split evenly between two
  # letters. With floor = true the per-letter count is rounded down.
  def icd2of4(floor = false)
    share = @words_count / 2.0
    share = share.floor if floor
    # 0 is equal to @words_count % 2, because 0! = 1!
    icd_of_counts([share, share, 0, 0])
  end

  # Formula value for an even split between three letters. With
  # floor = true the remainder of the division goes to the fourth letter.
  def icd3of4(floor = false)
    share = @words_count / 3.0
    share = share.floor if floor
    remainder = floor ? @words_count % 3 : 0
    icd_of_counts([share, share, share, remainder])
  end

  # Same value as icd3of4 without flooring.
  def icdThc
    icd3of4
  end

  # Formula value for a column split 2:2:1:1 between the four letters.
  def icdTlc
    sixth = @words_count / 6.0
    icd_of_counts([2 * sixth, 2 * sixth, sixth, sixth])
  end

  # Formula value for an even split between all four letters.
  def icd4of4(floor = false)
    share = @words_count / 4.0
    share = share.floor if floor
    icd_of_counts([share, share, share, share])
  end

  protected

  # Formula value for the actual counts stored at position.
  def infocod_private(position)
    icd_of_counts(['A','C','G','T'].collect { |letter| @matrix[letter][position] })
  end

  # (sum of log(k_i!) over counts - log(@words_count!)) / @words_count
  def icd_of_counts(counts)
    log_factorials = counts.inject(0.0) { |sum, k| sum + k.log_fact }
    (log_factorials - @words_count.log_fact) / @words_count
  end
end
85
+
86
class PPM
  # Scales every probability by the words count and wraps the result in a
  # PM. A words_count argument, when given, is also stored on the receiver.
  def to_pcm(words_count = nil)
    @words_count = words_count if words_count
    checkerr("words count is not specified") { !@words_count }
    counts = PM.new_matrix(@size)
    (0...size).each do |i|
      ['A', 'C', 'G', 'T'].each { |l| counts[l][i] = @matrix[l][i] * @words_count }
    end
    PM.new(size, counts)
  end
  # NOTE(review): this makes to_pcm an alias of an existing get_pcm, which
  # replaces the to_pcm defined just above (and fails if get_pcm is not
  # defined yet). `alias get_pcm to_pcm` may have been intended - confirm
  # before changing, since it alters which implementation callers get.
  alias to_pcm get_pcm

  # Converts to counts first and delegates to the result's infocod.
  def infocod(position = nil)
    to_pcm.infocod(position)
  end

  # Same as infocod.
  def icd(position = nil)
    to_pcm.infocod(position)
  end
end
108
+ end
@@ -0,0 +1,92 @@
1
# A DNA word written in the IUPAC nucleotide alphabet. The class is a String
# whose equality means "letter sets are compatible at every position".
class IUPAC < String
  # Sorted set of plain letters => IUPAC letter.
  CODE = {"A" => "A", "C" => "C", "G" => "G", "T" => "T",
          "AG" => "R", "CT" => "Y", "GT" => "K", "AC" => "M",
          "CG" => "S", "AT" => "W", "CGT" => "B", "AGT" => "D", "ACT" => "H", "ACG" => "V", "ACGT" => "N"}
  # IUPAC letter => sorted set of plain letters.
  REVCODE = CODE.invert

  def dup
    IUPAC.new(self)
  end

  # words - an Array of equally long words (collapsed column by column into
  #         one IUPAC word), another IUPAC, or a String of IUPAC letters.
  #         Any other argument leaves the new word empty.
  # Invalid letters are reported through checkerr.
  def initialize(words)
    if words.is_a?(Array)
      iupac = (0...words[0].size).inject("") { |iup, i|
        cola = words.collect { |word| word[i,1] }.uniq.sort.join
        checkerr("bad letter set #{cola}") { !CODE.has_key?(cola) }
        iup + CODE[cola]
      }
      super(iupac)
    elsif words.is_a?(IUPAC)
      super(words)
    elsif words.is_a?(String)
      checkerr("word #{words} has strange characters") { words.tr('ACGTURYKMSWBDHVN', '').size > 0 }
      super(words)
    end
  end

  # True when both words have the same length and the letters at every
  # position are compatible according to IUPACOM. Non-strings and letters
  # missing from IUPACOM compare as unequal instead of raising.
  def ==(iupac)
    return false unless iupac.is_a?(String) && size == iupac.size
    (0...size).all? { |i| (IUPACOM[self[i,1]] || {}).has_key?(iupac[i,1]) }
  end

  # Position-wise union of the two letter sets; nil when lengths differ.
  def merge(iupac)
    return nil if size != iupac.size
    merged = (0...size).inject("") { |acc, i|
      letters = (REVCODE[self[i,1]].split(//) + REVCODE[iupac[i,1]].split(//)).uniq.sort.join
      acc << CODE[letters]
    }
    IUPAC.new(merged)
  end

  # Index of the first window compatible with iupac (note: 0 is a valid,
  # truthy result), or false when there is none or iupac is not an IUPAC.
  def include?(iupac)
    # The type check comes first so that nil and other objects are rejected
    # before `size` is called on them.
    return false unless iupac.is_a?(IUPAC) && size >= iupac.size
    (0..size - iupac.size).each { |i|
      return i if IUPAC.new(self[i, iupac.size]) == iupac
    }
    false
  end

  # Complemented copy. Wrapped in IUPAC.new because String#tr returns a
  # plain String for subclasses on Ruby >= 3.0; this keeps the result an
  # IUPAC on every Ruby version, like #reverse.
  def compl
    IUPAC.new(tr("ACGTRYKMSWBDHVN", "TGCAYRMKSWVHDBN"))
  end

  # Complements the word in place and returns self.
  def compl!
    tr!("ACGTRYKMSWBDHVN", "TGCAYRMKSWVHDBN")
    self
  end

  alias reverse_string reverse
  # Reversed copy, as an IUPAC.
  def reverse
    IUPAC.new(reverse_string)
  end

  alias comp! compl!
  alias complement! compl!
  alias comp compl
  alias complement compl

  private
  # Compatibility table: IUPACOM[x] lists every letter whose set shares at
  # least one plain letter with x. Only the keys matter.
  IUPACOM = { "A" => {"A" => :llib, "R" => :llib, "M" => :llib, "W" => :llib, "D" => :llib, "H" => :llib, "V" => :llib, "N" => :llib},
              "C" => {"C" => :llib, "Y" => :llib, "M" => :llib, "S" => :llib, "B" => :llib, "H" => :llib, "V" => :llib, "N" => :llib},
              "G" => {"G" => :llib, "R" => :llib, "K" => :llib, "S" => :llib, "B" => :llib, "D" => :llib, "V" => :llib, "N" => :llib},
              "T" => {"T" => :llib, "Y" => :llib, "K" => :llib, "W" => :llib, "B" => :llib, "D" => :llib, "H" => :llib, "N" => :llib}
            }
  IUPACOM["R"] = IUPACOM["G"].merge(IUPACOM["A"])
  IUPACOM["Y"] = IUPACOM["T"].merge(IUPACOM["C"])
  IUPACOM["K"] = IUPACOM["G"].merge(IUPACOM["T"])
  IUPACOM["M"] = IUPACOM["A"].merge(IUPACOM["C"])
  IUPACOM["S"] = IUPACOM["G"].merge(IUPACOM["C"])
  IUPACOM["W"] = IUPACOM["A"].merge(IUPACOM["T"])
  IUPACOM["B"] = IUPACOM["G"].merge(IUPACOM["T"].merge(IUPACOM["C"]))
  IUPACOM["D"] = IUPACOM["G"].merge(IUPACOM["A"].merge(IUPACOM["T"]))
  IUPACOM["H"] = IUPACOM["A"].merge(IUPACOM["C"].merge(IUPACOM["T"]))
  IUPACOM["V"] = IUPACOM["G"].merge(IUPACOM["C"].merge(IUPACOM["A"]))
  IUPACOM["N"] = IUPACOM["A"].merge(IUPACOM["C"].merge(IUPACOM["G"].merge(IUPACOM["T"])))

  # IUPACMERGE = CODE.merge({
  #   "AA" => "A", "CC" => "C", "GG" => "G", "TT" => "T",
  #
  # })

end
@@ -0,0 +1,562 @@
1
+ module Ytilib
2
+ class PM
3
+
4
+ attr_reader :matrix, :size
5
+ attr_accessor :words_count
6
+
7
+ alias length size
8
+
9
# Expected score of a random word: for every column, the matrix values
# weighted by the background probabilities bckgr, summed over all columns.
def score_mean(bckgr = Randoom::DEF_PROBS)
  (0...@size).inject(0.0) do |mean, i|
    mean + ['A','C','G','T'].inject(0.0) { |column, l| column + @matrix[l][i] * bckgr[l] }
  end
end
12
+
13
# Variance of the score of a random word: the sum over columns of
# E[x^2] - E[x]^2, with expectations taken under the background bckgr.
def score_variance(bckgr = Randoom::DEF_PROBS)
  letters = ['A','C','G','T']
  (0...@size).inject(0.0) do |variance, i|
    mean_of_squares = letters.inject(0.0) { |sum, l| sum + @matrix[l][i]**2 * bckgr[l] }
    mean = letters.inject(0.0) { |sum, l| sum + @matrix[l][i] * bckgr[l] }
    variance + (mean_of_squares - mean**2)
  end
end
19
+
20
# Upper-tail value (1 - erf2(z / sqrt(2))) / 2 for the standardised
# threshold z = (threshold - mean) / sqrt(variance). mean and variance
# default to score_mean and score_variance.
def p_value(threshold, mean = nil, variance = nil)
  mean ||= score_mean
  variance ||= score_variance
  z = (threshold - mean) / Math.sqrt(variance)
  (1 - Math.erf2(z / Math.sqrt(2))) / 2.0
end
26
+
27
# Word made of the highest-valued letter of every column; a column whose
# maximum is shared by several letters contributes "N".
def best_word
  (0...size).inject("") do |word, i|
    column = ['A', 'C', 'G', 'T'].collect { |l| [l, @matrix[l][i]] }
    top = column.collect { |_, value| value }.max
    winners = column.select { |_, value| value == top }
    word << (winners.size == 1 ? winners.first.first : "N")
  end
end
34
+
35
# IUPAC word in which every position encodes exactly the letters that
# share the column maximum.
def strict_consensus
  word = (0...size).inject("") do |acc, i|
    top = ['A', 'C', 'G', 'T'].collect { |l| @matrix[l][i] }.max
    tied = ['A', 'C', 'G', 'T'].select { |l| @matrix[l][i] == top }.join
    acc + IUPAC::CODE[tied]
  end
  IUPAC.new(word)
end
42
+
43
# Consensus as a plain String. Each position is encoded with the IUPAC
# letter for the letter set chosen by consensus_letter_sets. With
# beautiful = true, positions standing for more than two letters are
# written in lower case.
def consensus_string(beautiful = false)
  word = consensus_letter_sets.inject("") do |acc, lets|
    reslet = IUPAC::CODE[lets]
    reslet = reslet.downcase if beautiful && lets.size > 2
    acc + reslet
  end
  String.new(word)
end

# Same consensus as consensus_string(false), returned as an IUPAC.
def consensus
  IUPAC.new(consensus_letter_sets.inject("") { |acc, lets| acc + IUPAC::CODE[lets] })
end

# Shared core of consensus_string and consensus: for every column, the
# sorted string of letters kept for the consensus.
#
# The distinct column values are ranked from high to low and the column's
# infocod value decides how many of them are kept: above icd2of4 only the
# best, above icdThc the best two, above icdTlc the best three, otherwise
# all. Every letter whose value is among the kept ones is included.
def consensus_letter_sets
  checkerr("words count is undefined") { !@words_count }
  i2o4, thc, tlc = icd2of4, icdThc, icdTlc
  icd = infocod

  (0...size).collect do |i|
    scores = ['A', 'C', 'G', 'T'].collect { |l| @matrix[l][i] }.uniq.sort.reverse
    keep =
      if icd[i] > i2o4 then 1
      elsif icd[i] > thc then 2
      elsif icd[i] > tlc then 3
      else scores.size
      end
    kept = scores.first(keep)
    ['A', 'C', 'G', 'T'].select { |l| kept.include?(@matrix[l][i]) }.join
  end
end
private :consensus_letter_sets
91
+
92
# Index of the first window of s whose score reaches score_g, or nil when
# there is none (also when s is shorter than the matrix).
#
# use2strands - when true a window also counts if its reverse complement
#               reaches the threshold. The reverse complement is built and
#               scored only in that case.
def find_hit(s, score_g, use2strands = true)
  (0..(s.size - @size)).each do |i|
    best = score(s[i, @size])
    best = [best, score(s[i, @size].revcomp!)].max if use2strands
    return i if best >= score_g
  end
  nil
end
101
+
102
# Indices of all windows of s whose score reaches score_g (an empty array
# when s is shorter than the matrix).
#
# use2strands - when true a window also counts if its reverse complement
#               reaches the threshold. The reverse complement is built and
#               scored only in that case.
def find_hits(s, score_g, use2strands = true)
  (0..(s.size - @size)).select do |i|
    best = score(s[i, @size])
    best = [best, score(s[i, @size].revcomp!)].max if use2strands
    best >= score_g
  end
end
110
+
111
# All hits of the matrix in s as [score, sequence, reverse_strand?, index]
# entries, in window order (forward hit before reverse hit of the same
# window). Windows are scored in upper case; the stored sequence keeps the
# case it had in s.
#
# use2strands - when false only the forward strand is examined. (The flag
#               used to be ignored and reverse-strand hits were always
#               reported.)
def collect_hits(s, score_g, use2strands = true)
  result = []
  (0..(s.size - @size)).each do |i|
    seq = s[i, @size]
    score_p = score(seq.upcase)
    result << [score_p, seq, false, i] if score_p >= score_g
    next unless use2strands

    seq_rc = s[i, @size].revcomp!
    score_rc = score(seq_rc.upcase)
    result << [score_rc, seq_rc, true, i] if score_rc >= score_g
  end
  result
end
121
+
122
# Highest score over all windows of s. Reports "too short sequence" through
# checkerr when s is shorter than the matrix.
#
# use2strands - when true the reverse complement of every window competes
#               as well. It is built and scored only in that case.
def best_hit(s, use2strands = true)
  checkerr("too short sequence") { s.size < @size }
  (0..(s.size - @size)).inject(-Float::MAX) do |best, i|
    candidates = [best, score(s[i, @size])]
    candidates << score(s[i, @size].revcomp!) if use2strands
    candidates.max
  end
end
131
+
132
# True when the A, C, G and T rows of both matrices are eql? (strict:
# 1 and 1.0 differ).
def eql?(pm)
  ['A','C','G','T'].all? { |letter| @matrix[letter].eql?(pm.matrix[letter]) }
end
137
+
138
# Tolerant comparison: true when every A/C/G/T value differs from its
# counterpart in pm by less than 10**-11.
# The leading checkerr call is given a block that is always true, so the
# method reports "for what?" on every call.
def flexeql?(pm)
  checkerr("for what?") { true }
  ['A','C','G','T'].all? do |letter|
    (0...@size).all? do |position|
      (@matrix[letter][position] - pm.matrix[letter][position]).abs < 10**-11
    end
  end
end
148
+
149
# size        - number of columns.
# matrix      - Hash of letter => Array of column values; when nil a blank
#               matrix from PM.new_matrix(size) is used.
# words_count - number of words behind the matrix. When missing or not
#               positive it is taken from the rounded sum of column 0 of a
#               supplied matrix, and left nil if that sum rounds below 2.
#
# A size that disagrees with matrix['A'].size is reported via checkerr.
def initialize(size, matrix = nil, words_count = nil)
  # The message is built only when a matrix was supplied: interpolating
  # matrix['A'].size for a nil matrix used to raise before checkerr ran.
  if matrix != nil
    checkerr("matrix['A'].size != size, #{matrix['A'].size} != #{size}") { size != matrix['A'].size }
  end
  @size = size
  @matrix = matrix == nil ? PM.new_matrix(size) : matrix

  if words_count && words_count > 0
    @words_count = words_count
  else
    # A blank matrix (all nil) or a zero-width one has no column to sum.
    total = (matrix != nil && size > 0) ? col_sum(0) : nil
    @words_count = (total && total.round >= 2) ? total.round : nil
  end
end
160
+
161
# Sum of the values stored at column index for the letters in letset.
def col_sum(index = 0, letset = ['A','C','G','T'])
  letset.inject(0) { |total, letter| total + @matrix[letter][index] }
end
164
+
165
# Sum of the A, C, G and T values at column index of the given matrix hash.
def self.col_sum(matrix, index = 0)
  a, c, g, t = ['A', 'C', 'G', 'T'].collect { |letter| matrix[letter][index] }
  a + c + g + t
end
168
+
169
# Converts the stored values in place to
#   log((value + prob * pseudocount) / ((words_count + pseudocount) * prob))
# for every letter and column, and returns self.
#
# words_count - replaces @words_count when given and positive.
# probs       - background probability per letter.
# pseudocount - pseudocount spread over the letters by background.
def to_pwm!(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1)
  @words_count = words_count if words_count && words_count > 0

  @matrix.each do |letter, column|
    background = probs[letter]
    denominator = (@words_count + pseudocount) * background
    (0...@size).each do |pos|
      column[pos] = Math::log((column[pos] + (background * pseudocount)) / denominator)
    end
  end

  self
end
188
+
189
# Non-destructive variant of to_pwm!: converts a copy and returns it.
def get_pwm(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1)
  copy = self.dup
  copy.to_pwm!(words_count, probs, pseudocount)
end
alias to_pwm get_pwm
193
+
194
# New PPM whose values are the stored values divided by words_count
# (defaults to @words_count). A missing or non-positive words count is
# reported via checkerr. When the matrix has an 'N' row the result gets the
# full IUPAC row set.
def get_ppm(words_count = nil)
  words_count ||= @words_count
  checkerr("undefined words count") { !words_count || words_count <= 0 }

  ppm = @matrix['N'] ? PM.new_matrix_iupac(@size) : PM.new_matrix(@size)
  @matrix.each do |letter, column|
    (0...@size).each { |i| ppm[letter][i] = column[i].to_f / words_count }
  end
  PPM.new(@size, ppm, words_count)
end
alias to_ppm get_ppm
206
+
207
# Sum of the matrix values selected by the letters of word, column by
# column. A word of the wrong length, or with letters outside ACGT (or
# outside the IUPAC alphabet when the matrix has an 'N' row), is reported
# via checkerr.
def score(word)
  checkerr("word size != pwm.size") { @size != word.size }
  checkerr("word #{word} has strange characters") {
    alphabet = @matrix.has_key?('N') ? 'ACGTRYKMSWBDHVN' : 'ACGT'
    word.tr(alphabet, '').size > 0
  }
  (0...@size).inject(0) { |sum, i| sum + @matrix[word[i,1]][i] }
end
216
+
217
# Highest reachable score: the sum of the per-column maxima over A/C/G/T.
def best_score
  (0...size).inject(0) do |sum, i|
    sum + ['A', 'C', 'G', 'T'].collect { |l| @matrix[l][i] }.max
  end
end
222
+
223
# Lowest reachable score: the sum of the per-column minima over A/C/G/T.
def worst_score
  (0...size).inject(0) do |sum, i|
    sum + ['A', 'C', 'G', 'T'].collect { |l| @matrix[l][i] }.min
  end
end
228
+
229
# Independent copy: every row array is duplicated, so changing the copy
# never touches the original. The copy is built with self.class, so a
# subclass instance (e.g. a PPM) stays an instance of its own class
# instead of silently becoming a plain PM.
def dup
  copied = {}
  @matrix.each { |letter, column| copied[letter] = column.dup }
  self.class.new(@size, copied, @words_count)
end
234
+
235
# Builds a count matrix from equally long words. Letters are upper-cased;
# each A/C/G/T adds 1 to its cell and each N adds 0.25 to all four cells of
# the column. Other letters are reported via checkerr.
#
# iupacomp - when true the result is passed through iupacomp!.
def self.new_pcm(words, iupacomp = false)
  size = words[0].size
  counts = PM.new_matrix(size)
  counts.each_value { |column| column.fill(0) }

  words.each do |word|
    (0...size).each do |i|
      letter = word[i,1].upcase
      checkerr("unknown letter #{letter}") { !['A', 'C', 'G', 'T', 'N'].include?(letter) }
      if letter == 'N'
        ['A', 'C', 'G', 'T'].each { |l| counts[l][i] += 0.25 }
      else
        counts[letter][i] += 1
      end
    end
  end

  newpcm = PM.new(size, counts, words.size)
  newpcm.iupacomp! if iupacomp
  newpcm
end
254
+
255
# Builds a count matrix from words (PM.new_pcm) and converts it in place
# with to_pwm!, using that method's defaults.
def self.new_pwm(words)
  matrix = PM.new_pcm(words)
  matrix.to_pwm!
  matrix
end
260
+
261
# Loads a matrix from a text file in pat or pwm layout (one line per column
# with four numbers, or one line per letter). Lines containing anything
# that is not a number are skipped. Exactly four numeric lines are taken to
# be the letter-per-line layout and transposed.
#
# Returns a PPM when every column sum rounds to 1, otherwise a PM.
def self.load(filename)
  # File.read rather than IO.read: IO.read treats a name starting with "|"
  # as a command to run.
  input = File.read(filename)

  rows = []
  input.each_line do |line|
    begin
      rows << line.split.collect { |field| Float(field) }
    rescue
      next # header or comment line
    end
  end
  rows = rows.transpose if rows.size == 4

  matrix = PM.new_matrix(rows.size)
  rows.each_index do |i|
    ['A', 'C', 'G', 'T'].each_with_index { |l, j| matrix[l][i] = rows[i][j] }
  end

  ppm_mode = (0...rows.size).all? { |i| col_sum(matrix, i).round == 1 }
  ppm_mode ? PPM.new(rows.size, matrix) : PM.new(rows.size, matrix)
end
282
+
283
# Writes the matrix to filename in the format named by its extension (as
# returned by File.ext_wo_name):
#   "pwm" - one line per letter (A, C, G, T), values separated by spaces;
#   "pat" - a title line with the file name without extension, then one
#           line per column.
# "xml" and unknown extensions are reported via checkerr.
#
# The format is checked before the file is opened, so a rejected call no
# longer creates or truncates the target file.
def save(filename)
  format = File.ext_wo_name(filename)
  unless ["pwm", "pat"].include?(format)
    if format == "xml"
      checkerr("small-BiSMark is not supported at this moment")
    else
      checkerr("unknown motif file format specified")
    end
    return # nothing to write even if checkerr returns
  end

  File.open(filename, "w") do |out_f|
    if format == "pwm"
      ['A', 'C', 'G', 'T'].each do |letter|
        @matrix[letter].each { |e| out_f << "#{e} " }
        out_f << $/
      end
    else
      out_f.puts File.name_wo_ext(filename)
      (0...@size).each do |i|
        ['A', 'C', 'G', 'T'].each { |letter| out_f << "#{@matrix[letter][i]} " }
        out_f << $/
      end
    end
  end
end
308
+
309
# Adds the absolute value of the smallest matrix entry to every entry, in
# place, and returns self.
def positiv!
  shift = @matrix.values.collect { |column| column.min }.min.abs
  @matrix.each_value { |column| column.collect! { |value| value + shift } }
  self
end
314
+
315
# Turns the matrix into its reverse complement in place and returns self:
# complementary rows are exchanged and every row is reversed.
#
# Besides A/T and C/G, the IUPAC rows that are complements of each other
# (R/Y, K/M, B/V, D/H) are exchanged when both are present. Previously
# they were only reversed, which left them inconsistent with the new
# A/C/G/T rows. S, W and N are their own complements.
def revcomp!
  @matrix['A'], @matrix['T'] = @matrix['T'], @matrix['A']
  @matrix['C'], @matrix['G'] = @matrix['G'], @matrix['C']
  [['R', 'Y'], ['K', 'M'], ['B', 'V'], ['D', 'H']].each do |one, other|
    next unless @matrix.has_key?(one) && @matrix.has_key?(other)
    @matrix[one], @matrix[other] = @matrix[other], @matrix[one]
  end
  @matrix.each_value { |v| v.reverse! }
  self
end
321
+
322
# Appends the matrix to element b: a "PWM" element when the first A value
# is a Float, otherwise "PCM", carrying "length" and (when positive)
# "words-count" attributes, with one "pm-column" child per column holding
# <a>, <c>, <g>, <t> values as text. Positions are 1-based.
def to_bismark(b)
  tag = @matrix['A'][0].is_a?(Float) ? "PWM" : "PCM"
  attributes = {"length" => @size}
  attributes["words-count"] = @words_count if @words_count && @words_count > 0

  pe = b.add_element(tag, attributes)
  (0...@matrix['A'].size).each do |i|
    pm_c = pe.add_element("pm-column", {"position" => i+1})
    ['A', 'C', 'G', 'T'].each do |l|
      pm_c.add_element(l.downcase).add_text(@matrix[l][i].to_s)
    end
  end
end
334
+
335
# Builds a matrix object from the small-BiSMark element b.
#
# Values are read as floats for "PPM", "PWM" and "WPCM" elements and as
# integers otherwise; the "words-count" attribute, when present, is passed
# on as a float. A "PPM" element yields a PPM, everything else a PM.
#
# iupacomp - when true, PPM and PCM results are passed through iupacomp!;
#            for a PWM this is impossible and raises.
#
# The former `@words_count = col_sum(matrix)` in the PCM branch was
# dropped: it only set an unused instance variable on the class itself
# (and could fail on an empty matrix).
def self.from_bismark(b, iupacomp = false)
  checkerr("empty small-BiSMark file?") { !b }
  float_m = (b.name == "PPM" || b.name == "PWM" || b.name == "WPCM")
  words_count = b.attributes["words-count"] ? b.attributes["words-count"].to_f : nil

  matrix = {"A" => [], "C" => [], "G" => [], "T" => []}
  b.elements.each("pm-column") do |pmc|
    position = pmc.attributes["position"].to_i
    ['A', 'C', 'G', 'T'].each do |l|
      text = pmc.elements[l.downcase].get_text.to_s
      matrix[l][position-1] = float_m ? text.to_f : text.to_i
    end
  end

  case b.name
  when "PPM"
    newppm = PPM.new(matrix['A'].size, matrix, words_count)
    newppm.iupacomp! if iupacomp
    newppm
  when "PCM"
    newpcm = PM.new(matrix['A'].size, matrix, words_count)
    newpcm.iupacomp! if iupacomp
    newpcm
  else
    raise "cannot force IUPAC compatible PWM" if b.name == "PWM" && iupacomp
    PM.new(matrix['A'].size, matrix, words_count)
  end
end
362
+
363
+ IUPAC_LS = (IUPAC::CODE.keys - ['A','C','G','T']).collect { |iul| [IUPAC::CODE[iul], iul.split(//)] }
364
# Adds a row for every IUPAC letter listed in IUPAC_LS, holding for each
# column the average of the rows of the plain letters it stands for.
# Returns self.
#
# When @words_count is not set it becomes the largest column sum.
def iupacomp!
  @words_count = (0...@size).collect { |i| col_sum(i) }.max unless @words_count # for unbalanced matrices (Genomatix has some)
  # @words_count = @words_count.round < 2.0 ? nil : @words_count.round

  IUPAC_LS.each do |iupac_letter, letters|
    # to_f keeps the average exact for integer count matrices; plain
    # Integer division would truncate (e.g. (3 + 2) / 2 => 2).
    @matrix[iupac_letter] = (0...@size).collect { |i| col_sum(i, letters).to_f / letters.size }
  end

  self
end
374
+
375
# Mean score plus three standard deviations under the background bckgr
# (see score_mean and score_variance, which compute the same sums).
def m3sd(bckgr = Randoom::DEF_PROBS)
  sigma = Math.sqrt(score_variance(bckgr))
  score_mean(bckgr) + 3 * sigma
end
386
+
387
# Replaces an already set @words_count with the largest column sum. Does
# nothing (and returns nil) when no words count is set.
def fixwc
  @words_count = (0...@size).collect { |i| col_sum(i) }.max if @words_count
end
391
+
392
+ protected
393
# Blank matrix: one Array of size nil entries for each of A, C, G and T.
def self.new_matrix(size)
  matrix = {}
  ['A', 'C', 'G', 'T'].each { |letter| matrix[letter] = Array.new(size) }
  matrix
end
400
+
401
# Blank matrix with a row of size nil entries for each of the fifteen
# IUPAC letters, plain letters first.
def self.new_matrix_iupac(size)
  letters = ['A', 'C', 'G', 'T',
             'R', 'Y', 'K', 'M', 'S', 'W',
             'B', 'D', 'H', 'V', 'N']
  matrix = {}
  letters.each { |letter| matrix[letter] = Array.new(size) }
  matrix
end
420
+
421
+ end
422
+
423
+ class PPM < PM
424
+
425
+ #DEPRECATED, use iupacomp! instead
426
+ #def make_N_comp!
427
+ # @matrix['N'] = (0...size).collect { 0.25 }
428
+ # return self
429
+ #end
430
+
431
# size        - number of columns.
# matrix      - Hash of letter => Array of probabilities; a blank matrix
#               from PM.new_matrix(size) when nil.
# words_count - stored as given (no inference, unlike PM).
#
# A size that disagrees with matrix['A'].size is reported via checkerr.
def initialize(size, matrix = nil, words_count = nil)
  checkerr("matrix['A'].size != size") { !matrix.nil? && matrix['A'].size != size }
  @size = size
  @matrix = matrix.nil? ? PM.new_matrix(size) : matrix
  @words_count = words_count
end
437
+
438
# Adds a row for every IUPAC letter listed in IUPAC_LS, holding for each
# column the average of the rows of the plain letters it stands for.
# Sets @words_count to 4.0 when it is not set. Returns self.
def iupacomp!
  @words_count = 4.0 unless @words_count

  IUPAC_LS.each do |iupac_letter, letters|
    @matrix[iupac_letter] = (0...@size).collect { |i| col_sum(i, letters) / letters.size }
  end

  self
end
447
+
448
# Product of the probabilities selected by the letters of word, column by
# column. A word of the wrong length, or with letters outside ACGT (or
# outside the IUPAC alphabet when the matrix has an 'N' row), is reported
# via checkerr.
def score(word)
  checkerr("word size != ppm.size") { @size != word.size }
  checkerr("word #{word} has strange characters") {
    alphabet = @matrix.has_key?('N') ? 'ACGTRYKMSWBDHVN' : 'ACGT'
    word.tr(alphabet, '').size > 0
  }
  (0...@size).inject(1) { |product, i| product * @matrix[word[i,1]][i] }
end
457
+
458
# Highest reachable score: the product of the per-column maxima over
# A/C/G/T.
def best_score
  (0...size).inject(1) do |product, i|
    product * ['A', 'C', 'G', 'T'].collect { |l| @matrix[l][i] }.max
  end
end
463
+
464
# Lowest reachable score: the product of the per-column minima over
# A/C/G/T.
#
# The product starts from 1, like score and best_score. It used to start
# from 0, which made the result 0 for every matrix.
def worst_score
  (0...size).inject(1) do |product, i|
    product * ['A', 'C', 'G', 'T'].collect { |l| @matrix[l][i] }.min
  end
end
469
+
470
# Appends the matrix to element b as a "PPM" element carrying "length" and
# (when set) "words-count" attributes, with one "pm-column" child per
# column holding <a>, <c>, <g>, <t> values as text. Positions are 1-based.
def to_bismark(b)
  attributes = {"length" => @size}
  attributes["words-count"] = @words_count if @words_count

  pe = b.add_element("PPM", attributes)
  (0...@matrix['A'].size).each do |i|
    pm_c = pe.add_element("pm-column", {"position" => i+1})
    ['A', 'C', 'G', 'T'].each do |l|
      pm_c.add_element(l.downcase).add_text(@matrix[l][i].to_s)
    end
  end
end
481
+
482
# Extends probs in place with an entry for every IUPAC letter listed in
# IUPAC_LS: the average of the probabilities of the plain letters it
# stands for. Returns probs.
def self.probs2IUPAC!(probs)
  IUPAC_LS.each do |iupac_letter, letters|
    total = letters.inject(0) { |sum, l| sum + probs[l] }
    probs[iupac_letter] = total / letters.size
  end
  probs
end
488
+
489
# New PM with values
#   log((p * words_count + prob * pseudocount) /
#       ((words_count + pseudocount) * prob))
# for every letter and column; the receiver is not changed.
#
# words_count - defaults to @words_count when nil or 0; a missing value is
#               reported via checkerr.
# probs       - background probabilities; a copy is extended to the IUPAC
#               letters with PPM.probs2IUPAC!.
# pseudocount - pseudocount spread over the letters by background.
def get_pwm(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1.0)
  probs = PPM.probs2IUPAC!(probs.dup)

  words_count = @words_count if !words_count || words_count == 0
  checkerr("undefined words count") { !words_count }

  pwm = @matrix['N'] ? PM.new_matrix_iupac(@size) : PM.new_matrix(@size)
  total = words_count + pseudocount

  @matrix.each do |letter, column|
    background = probs[letter]
    (0...@size).each do |pos|
      pwm[letter][pos] = Math::log((column[pos] * words_count + (background * pseudocount)) / (total * background))
    end
  end

  PM.new(@size, pwm, words_count)
end
alias to_pwm get_pwm
511
+
512
# New PM with values log(p / prob) - a conversion without pseudocount. The
# receiver is not changed.
#
# The PM is created from a copy of the probabilities first and the copy is
# converted afterwards, so PM.new sees the unconverted values.
def get_pwm0pc(probs = Randoom::DEF_PROBS)
  converted = {}
  @matrix.each { |letter, column| converted[letter] = column.dup }
  newpm = PM.new(@size, converted, nil)

  converted.each do |letter, column|
    (0...@size).each { |pos| column[pos] = Math::log(@matrix[letter][pos] / probs[letter]) }
  end

  newpm
end
525
+
526
# In-place conversion is not available for a PPM; always raises.
#
# Accepts (and ignores) the arguments of PM#to_pwm! so that a call written
# against the parent's signature gets this explanation rather than an
# ArgumentError about the argument count.
def to_pwm!(*_args)
  raise "cannot force PPM class to PWM, use to_pwm instead"
end
529
+
530
# New PM holding every A/C/G/T probability multiplied by words_count
# (defaults to @words_count; a missing value is reported via checkerr),
# passed through iupacomp!.
def get_pcm(words_count = nil)
  words_count ||= @words_count
  checkerr("undefined words count") { !words_count }

  counts = PM.new_matrix(@size)
  ['A', 'C', 'G', 'T'].each do |l|
    (0...size).each { |i| counts[l][i] = @matrix[l][i] * words_count }
  end
  PM.new(size, counts, words_count).iupacomp!
end
alias to_pcm get_pcm
543
+
544
# Builds a PPM from an IUPAC word: at every position the letters the IUPAC
# letter stands for share the probability equally and the others get 0.0.
# The result is created with a words count of 4.0 and passed through
# iupacomp!.
def self.from_IUPAC(iupac)
  matrix = {"A" => [], "C" => [], "G" => [], "T" => []}

  (0...iupac.size).each do |i|
    letters = IUPAC::REVCODE[iupac[i]]
    share = 1.0/letters.size
    matrix.each { |base, column| column << (letters.include?(base) ? share : 0.0) }
  end

  newppm = PPM.new(iupac.size, matrix, 4.0)
  newppm.iupacomp!
  newppm
end
560
+
561
+ end
562
+ end