sequence_logo 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,71 @@
1
+ #!/usr/bin/ruby
2
+
3
+ module Ytilib
4
+ require "rexml/document"
5
+ include REXML
6
+
7
+ class Bismark < Document
8
+
9
+ def initialize(source = nil, add_dtd = false)
10
+ dtd = add_dtd ? "<!DOCTYPE smallbismark SYSTEM 'smallbismark.dtd'>#{$/}" : ""
11
+ source == nil ? super("<?xml version='1.0' encoding='UTF-8'?>#{$/}#{dtd}") : super(source)
12
+ super(IO.read(source)) if source != nil && root == nil
13
+ if source == nil
14
+ self.add_element("smallbismark")
15
+ # xmlns breaks XPath for a REXML library under Linux, strange, indeed
16
+ # self.add_element("smallbismark", {"xmlns" => "http://bioinform.imb.ac.ru/smallBiSMark/smallbismark.dtd"})
17
+ self.root.add_element("comment", {"name" => "WARNING"}).add_text("This is a draft version of small-BiSMark. Specification is the subject to change!")
18
+ end
19
+ end
20
+
21
+ def getXML
22
+ beautify
23
+ s = ""; write(s, 1, true)
24
+ s.rstrip!
25
+ return s
26
+ end
27
+ alias get_xml getXML
28
+
29
+ def get_pm(xpath)
30
+ pwmnode = self.elements[xpath]
31
+ pm = PM.new_pm(pwmnode.attribute("length").value.to_i)
32
+ toi = pwmnode.name == "PCM"
33
+ pwmnode.elements.each("pm-column") { |c|
34
+ position = c.attribute("position").value.to_i - 1
35
+ weights = [c.elements["a"].get_text.value.strip.to_f,
36
+ c.elements["c"].get_text.value.strip.to_f,
37
+ c.elements["g"].get_text.value.strip.to_f,
38
+ c.elements["t"].get_text.value.strip.to_f]
39
+ weights.collect { |w| w.to_i } if toi
40
+ pm['A'][position], pm['C'][position], pm['G'][position], pm['T'][position] = weights[0], weights[1], weights[2], weights[3]
41
+ }
42
+ return pm
43
+ end
44
+
45
+ private
46
+ CONTAIN_NO_TEXT = {
47
+ "segment" => :vasya_shmyak,
48
+ "group" => :vasya_shmyak,
49
+ "smallbismark" => :vasya_shmyak,
50
+ "motif" => :vasya_shmyak,
51
+ "PWM" => :vasya_shmyak,
52
+ "PCM" => :vasya_shmyak,
53
+ "PPM" => :vasya_shmyak,
54
+ "source" => :vasya_shmyak,
55
+ "factor" => :vasya_shmyak,
56
+ "pm-column" => :vasya_shmyak,
57
+ "word-list" => :vasya_shmyak}
58
+
59
+ def beautify(node = self)
60
+ if node == self
61
+ self.delete_if { |e| e.is_a?(Text) }
62
+ self.each { |e| beautify(e) }
63
+ else
64
+ node.delete_if { |e| e.is_a?(Text) } if node.respond_to?(:delete_if) && Bismark::CONTAIN_NO_TEXT.has_key?(node.name)
65
+ node.each { |e| beautify(e) } if node.respond_to?(:each)
66
+ end
67
+ end
68
+
69
+ end
70
+
71
+ end
@@ -0,0 +1,75 @@
1
+ require 'rexml/formatters/pretty'
2
+
3
+ module REXML
4
+ module Formatters
5
+ # The Transitive formatter writes an XML document that parses to an
6
+ # identical document as the source document. This means that no extra
7
+ # whitespace nodes are inserted, and whitespace within text nodes is
8
+ # preserved. Within these constraints, the document is pretty-printed,
9
+ # with whitespace inserted into the metadata to introduce formatting.
10
+ #
11
+ # Note that this is only useful if the original XML is not already
12
+ # formatted. Since this formatter does not alter whitespace nodes, the
13
+ # results of formatting already formatted XML will be odd.
14
class Transitive < REXML::Formatters::Default
  # indentation - number of spaces added per nesting level.
  def initialize( indentation=2 )
    @indentation = indentation
    @level = 0
  end

  protected

  # Writes +node+ on a fresh line indented by the current level. Children
  # are written one level deeper; the closing tag goes on its own line
  # unless every child is a text node.
  def write_element( node, output )
    tag = node.expanded_name
    output << "\n" << ' '*@level
    output << "<#{tag}"

    unless node.attributes.empty?
      node.attributes.each_attribute do |attr|
        output << " "
        attr.write( output )
      end
    end

    if node.children.empty?
      output << "/>"
      return
    end

    output << ">"
    @level += @indentation
    only_text = true
    node.children.each do |child|
      only_text &&= child.is_a?(REXML::Text)
      write( child, output )
    end
    @level -= @indentation

    closing_pad = only_text ? "" : "\n" + ' '*@level
    output << "#{closing_pad}" << "</#{tag}>"
  end

  # Text is emitted verbatim.
  def write_text( node, output )
    output << node.to_s()
  end
end
56
+ end
57
+
58
class Document
  # Writes the document to +output+.
  #
  # indent  - -1 selects the Default formatter; any larger value selects a
  #           pretty-printing formatter with that indentation.
  # trans   - with indent > -1, use the Transitive formatter instead of Pretty.
  # ie_hack - passed to the Pretty/Default formatter.
  #
  # Unless the declared encoding is UTF-8 (or +output+ already is an Output),
  # +output+ is wrapped in an encoding-converting Output.
  def write( output=$stdout, indent=-1, trans=false, ie_hack=false )
    unless xml_decl.encoding == "UTF-8" || output.kind_of?(Output)
      output = Output.new( output, xml_decl.encoding )
    end

    formatter =
      if indent > -1 && trans
        REXML::Formatters::Transitive.new( indent )
      elsif indent > -1
        REXML::Formatters::Pretty.new( indent, ie_hack )
      else
        REXML::Formatters::Default.new( ie_hack )
      end

    formatter.write( self, output )
  end
end
75
+ end
@@ -0,0 +1,108 @@
1
+ #!/usr/bin/ruby
2
+
3
class Float
  # Approximates ln(self!) with the Stieltjes continued fraction
  # (http://www.luschny.de/math/factorial/approx/SimpleCases.html).
  # Returns 0.0 for any value <= 1.
  def log_fact
    return 0.0 if self <= 1

    z = self + 1
    # Coefficients a6..a0, innermost level of the fraction first.
    coefficients = [
      109535241009.0/48264275462,
      29944523.0/19733142,
      22999.0/22737,
      195.0/371,
      53.0/210,
      1.0/30,
      1.0/12
    ]
    fraction = coefficients.inject(0.0) { |inner, a| a / (z + inner) }

    (1.0/2)*Math.log(2*Math::PI) + (z - 1.0/2)*Math.log(z) - z + fraction
  end
end
18
+
19
class Integer
  # ln(self!) computed by Float#log_fact on the float value of the receiver.
  def log_fact
    as_float = to_f
    as_float.log_fact
  end
end
24
+
25
+ # Naive version
26
+ =begin
27
+ class Integer
28
+ @@fact_hash = {}
29
+ def log_fact
30
+ return 0.0 if self == 0
31
+ return nil if self < 0
32
+ if self <= 170
33
+ @@fact_hash[self] = Math.log( lambda { |k| return k if self.times { |i| k *= i.next } }.call(1) )
34
+ else
35
+ return self.to_f.log_fact
36
+ end unless @@fact_hash.has_key?(self)
37
+ return @@fact_hash[self]
38
+ end
39
+ end
40
+ =end
41
+
42
+ module Ytilib
43
class PM
  # Per-column value (sum of log_fact of the four letter counts, minus
  # log_fact of @words_count, divided by @words_count).
  # With +position+ returns one column's value; without it, an Array with
  # one value per column.
  def infocod(position = nil)
    if position
      infocod_private(position)
    else
      (0...@size).collect { |i| infocod_private(i) }
    end
  end
  alias icd infocod

  # Reference value for a column whose words are split evenly between two
  # letters. With +floor+ the per-letter count is rounded down.
  def icd2of4(floor = false)
    half = @words_count / 2.0
    half = half.floor if floor
    # 0 is equal to @words_count % 2, because 0! = 1!
    icd_of_counts([half, half, 0, 0])
  end

  # Reference value for an even split between three letters. With +floor+
  # the remainder of the division is placed in the fourth cell.
  def icd3of4(floor = false)
    third = @words_count / 3.0
    third = third.floor if floor
    addon = floor ? @words_count % 3 : 0
    icd_of_counts([third, third, third, addon])
  end

  # Same as icd3of4 without flooring.
  def icdThc
    icd3of4
  end

  # Reference value for counts split 2:2:1:1 (in sixths of @words_count).
  def icdTlc
    sixth = @words_count / 6.0
    icd_of_counts([2*sixth, 2*sixth, sixth, sixth])
  end

  # Reference value for an even split between all four letters.
  def icd4of4(floor = false)
    quarter = @words_count / 4.0
    quarter = quarter.floor if floor
    icd_of_counts([quarter, quarter, quarter, quarter])
  end

  protected

  # Value for the column at +position+ of @matrix.
  def infocod_private(position)
    icd_of_counts(['A','C','G','T'].collect { |letter| @matrix[letter][position] })
  end

  # Shared formula: (sum of log_fact over +counts+ - log_fact(@words_count)) / @words_count.
  def icd_of_counts(counts)
    log_sum = counts.inject(0.0) { |acc, k_i| acc + k_i.log_fact }
    (log_sum - @words_count.log_fact) / @words_count
  end
end
85
+
86
class PPM
  # Converts probabilities to counts by multiplying every cell by
  # @words_count (which is replaced by +words_count+ when given).
  #
  # NOTE(review): the alias below immediately rebinds to_pcm to get_pcm, so
  # this definition appears to be dead once the file is loaded - confirm
  # which implementation is intended.
  def to_pcm(words_count = nil)
    @words_count = words_count if words_count
    checkerr("words count is not specified") { !@words_count }
    counts = PM.new_matrix(@size)
    ['A', 'C', 'G', 'T'].each do |l|
      (0...size).each { |i| counts[l][i] = @matrix[l][i] * @words_count }
    end
    PM.new(size, counts)
  end
  alias to_pcm get_pcm

  # Value(s) of the count matrix obtained through to_pcm.
  def infocod(position = nil)
    to_pcm.infocod(position)
  end

  # Same as #infocod.
  def icd(position = nil)
    to_pcm.infocod(position)
  end
end
108
+ end
@@ -0,0 +1,92 @@
1
# A nucleotide word written in IUPAC ambiguity codes, stored as a String.
class IUPAC < String
  # Sorted letter set => IUPAC code.
  CODE = {"A" => "A", "C" => "C", "G" => "G", "T" => "T",
          "AG" => "R", "CT" => "Y", "GT" => "K", "AC" => "M",
          "CG" => "S", "AT" => "W", "CGT" => "B", "AGT" => "D", "ACT" => "H", "ACG" => "V", "ACGT" => "N"}
  # IUPAC code => sorted letter set.
  REVCODE = CODE.invert

  # IUPACOM[x] has a key for every code that shares at least one letter
  # with x; only the keys matter.
  IUPACOM = { "A" => {"A" => :llib, "R" => :llib, "M" => :llib, "W" => :llib, "D" => :llib, "H" => :llib, "V" => :llib, "N" => :llib},
              "C" => {"C" => :llib, "Y" => :llib, "M" => :llib, "S" => :llib, "B" => :llib, "H" => :llib, "V" => :llib, "N" => :llib},
              "G" => {"G" => :llib, "R" => :llib, "K" => :llib, "S" => :llib, "B" => :llib, "D" => :llib, "V" => :llib, "N" => :llib},
              "T" => {"T" => :llib, "Y" => :llib, "K" => :llib, "W" => :llib, "B" => :llib, "D" => :llib, "H" => :llib, "N" => :llib}
            }
  IUPACOM["R"] = IUPACOM["G"].merge(IUPACOM["A"])
  IUPACOM["Y"] = IUPACOM["T"].merge(IUPACOM["C"])
  IUPACOM["K"] = IUPACOM["G"].merge(IUPACOM["T"])
  IUPACOM["M"] = IUPACOM["A"].merge(IUPACOM["C"])
  IUPACOM["S"] = IUPACOM["G"].merge(IUPACOM["C"])
  IUPACOM["W"] = IUPACOM["A"].merge(IUPACOM["T"])
  IUPACOM["B"] = IUPACOM["G"].merge(IUPACOM["T"].merge(IUPACOM["C"]))
  IUPACOM["D"] = IUPACOM["G"].merge(IUPACOM["A"].merge(IUPACOM["T"]))
  IUPACOM["H"] = IUPACOM["A"].merge(IUPACOM["C"].merge(IUPACOM["T"]))
  IUPACOM["V"] = IUPACOM["G"].merge(IUPACOM["C"].merge(IUPACOM["A"]))
  IUPACOM["N"] = IUPACOM["A"].merge(IUPACOM["C"].merge(IUPACOM["G"].merge(IUPACOM["T"])))

  # Returns a new IUPAC with the same content.
  def dup
    IUPAC.new(self)
  end

  # words - an Array of equal-length words (each column is collapsed into the
  #         IUPAC code of the letters seen in it), another IUPAC (copied), or
  #         a String (validated against the characters ACGTURYKMSWBDHVN).
  #         Any other argument yields an empty word.
  def initialize(words)
    if words.is_a?(Array)
      iupac = (0...words[0].size).inject("") { |iup, i|
        cola = words.collect { |word| word[i,1] }.uniq.sort.join
        checkerr("bad letter set #{cola}") { !CODE.has_key?(cola) }
        iup << CODE[cola]
      }
      super(iupac)
    elsif words.is_a?(IUPAC)
      super(words)
    elsif words.is_a?(String)
      checkerr("word #{words} has strange characters") { words.tr('ACGTURYKMSWBDHVN', '').size > 0 }
      super(words)
    end
  end

  # Compatibility test rather than strict equality: true when both words have
  # the same length and the codes at every position share at least one letter
  # (so "A" == "N"). Returns false, instead of raising, for operands that are
  # not sequence-like (nil, numbers without #[], ...) and for letters that
  # have no IUPACOM entry (such as "U").
  def ==(iupac)
    return false unless iupac.respond_to?(:size) && iupac.respond_to?(:[])
    return false if self.size != iupac.size
    (0...self.size).all? { |i|
      compatible = IUPACOM[self[i,1]]
      compatible ? compatible.has_key?(iupac[i,1]) : false
    }
  end

  # Position-wise union of two words of equal length; nil when the lengths
  # differ.
  def merge(iupac)
    return nil if self.size != iupac.size
    res = (0...self.size).inject("") { |acc, i|
      merges = (REVCODE[self[i,1]].split(//) + REVCODE[iupac[i,1]].split(//)).uniq.sort.join
      acc << CODE[merges]
    }
    return IUPAC.new(res)
  end

  # Returns the index of the first window compatible (see #==) with +iupac+,
  # or false when there is none or +iupac+ is not an IUPAC. The type is
  # checked first so a nil argument returns false instead of raising.
  def include?(iupac)
    return false if !iupac.is_a?(IUPAC) || self.size < iupac.size
    (0..self.size-iupac.size).each { |i|
      return i if IUPAC.new(self[i,iupac.size]) == iupac
    }
    return false
  end

  # Complement (not reversed) as a new string.
  def compl
    return self.tr("ACGTRYKMSWBDHVN", "TGCAYRMKSWVHDBN")
  end

  # In-place complement; returns self.
  def compl!
    self.tr!("ACGTRYKMSWBDHVN", "TGCAYRMKSWVHDBN")
    return self
  end

  alias reverse_string reverse
  # Reversed word as an IUPAC.
  def reverse
    return IUPAC.new(reverse_string)
  end

  alias comp! compl!
  alias complement! compl!
  alias comp compl
  alias complement compl

  # IUPACMERGE = CODE.merge({
  #   "AA" => "A", "CC" => "C", "GG" => "G", "TT" => "T",
  #
  # })

end
@@ -0,0 +1,562 @@
1
+ module Ytilib
2
+ class PM
3
+
4
+ attr_reader :matrix, :size
5
+ attr_accessor :words_count
6
+
7
+ alias length size
8
+
9
# Sum over all columns of the background-weighted mean of the column values.
# bckgr - letter => weight mapping for 'A', 'C', 'G', 'T'.
def score_mean(bckgr = Randoom::DEF_PROBS)
  total = 0.0
  (0...@size).each do |pos|
    column_mean = 0.0
    ['A','C','G','T'].each { |letter| column_mean += @matrix[letter][pos] * bckgr[letter] }
    total += column_mean
  end
  total
end
12
+
13
# Sum over all columns of (weighted mean of squares - square of weighted mean).
# bckgr - letter => weight mapping for 'A', 'C', 'G', 'T'.
def score_variance(bckgr = Randoom::DEF_PROBS)
  variance = 0.0
  (0...@size).each do |pos|
    mean_of_squares = 0.0
    ['A','C','G','T'].each { |letter| mean_of_squares += @matrix[letter][pos]**2 * bckgr[letter] }
    mean = 0.0
    ['A','C','G','T'].each { |letter| mean += matrix[letter][pos] * bckgr[letter] }
    variance += mean_of_squares - mean**2
  end
  variance
end
19
+
20
# Upper-tail probability of +threshold+ under a normal approximation.
# mean / variance default to score_mean / score_variance.
# Relies on Math.erf2, which is defined outside this block.
def p_value(threshold, mean = nil, variance = nil)
  mean ||= score_mean
  variance ||= score_variance
  deviation = (threshold - mean) / Math.sqrt(variance)
  (1 - Math.erf2(deviation/Math.sqrt(2))) / 2.0
end
26
+
27
# Word made of the highest-valued letter of every column; "N" where the
# maximum is shared by several letters.
def best_word
  letters = ['A', 'C', 'G', 'T']
  word = ""
  (0...size).each do |pos|
    top = letters.collect { |l| @matrix[l][pos] }.max
    winners = letters.select { |l| @matrix[l][pos] == top }
    word << (winners.size == 1 ? winners.first : "N")
  end
  word
end
34
+
35
# IUPAC word whose code at each position covers exactly the letters that
# share the column maximum.
def strict_consensus
  letters = ['A', 'C', 'G', 'T']
  word = ""
  (0...size).each do |pos|
    top = letters.collect { |l| @matrix[l][pos] }.max
    tied = letters.select { |l| @matrix[l][pos] == top }.join
    word += IUPAC::CODE[tied]
  end
  IUPAC.new(word)
end
42
+
43
# Consensus as a plain String of IUPAC codes.
#
# For every column the distinct values are ranked from high to low; the
# column's infocod value is compared with icd2of4, icdThc and icdTlc to
# decide how many of the top ranks (1, 2, 3 or all) are kept, and the
# letters holding a kept value form the code.
#
# beautiful - when true, codes that stand for more than two letters are
#             written in lower case.
def consensus_string(beautiful = false)
  checkerr("words count is undefined") { !@words_count }
  two_of_four, three_high, three_low = icd2of4, icdThc, icdTlc
  column_icd = infocod
  letters = ['A', 'C', 'G', 'T']

  word = ""
  (0...size).each do |pos|
    ranked = letters.collect { |l| @matrix[l][pos] }.uniq.sort.reverse

    keep = nil
    if column_icd[pos] > two_of_four
      keep = 1
    elsif column_icd[pos] > three_high
      keep = 2
    elsif column_icd[pos] > three_low
      keep = 3
    end
    ranked = ranked[0, keep] if keep

    chosen = letters.select { |l| ranked.include?(@matrix[l][pos]) }.join

    symbol = IUPAC::CODE[chosen]
    symbol = symbol.downcase if beautiful && chosen.size > 2

    word += symbol
  end
  String.new(word)
end
68
+
69
# Consensus as an IUPAC word.
#
# For every column the distinct values are ranked from high to low; the
# column's infocod value is compared with icd2of4, icdThc and icdTlc to
# decide how many of the top ranks (1, 2, 3 or all) are kept, and the
# letters holding a kept value form the code.
def consensus
  checkerr("words count is undefined") { !@words_count }
  two_of_four, three_high, three_low = icd2of4, icdThc, icdTlc
  column_icd = infocod
  letters = ['A', 'C', 'G', 'T']

  word = ""
  (0...size).each do |pos|
    ranked = letters.collect { |l| @matrix[l][pos] }.uniq.sort.reverse

    keep = nil
    if column_icd[pos] > two_of_four
      keep = 1
    elsif column_icd[pos] > three_high
      keep = 2
    elsif column_icd[pos] > three_low
      keep = 3
    end
    ranked = ranked[0, keep] if keep

    chosen = letters.select { |l| ranked.include?(@matrix[l][pos]) }.join
    word += IUPAC::CODE[chosen]
  end
  IUPAC.new(word)
end
91
+
92
# Index of the first window of +s+ whose score reaches +score_g+, or nil.
# With use2strands the better of the window and its reverse complement
# (String#revcomp!, defined outside this block) is used.
def find_hit(s, score_g, use2strands = true)
  (0..(s.size - @size)).each do |start|
    forward = s[start, @size]
    reverse = s[start, @size].revcomp!
    forward_score = score(forward)
    reverse_score = score(reverse)
    best = use2strands ? [forward_score, reverse_score].max : forward_score
    return start if best >= score_g
  end
  nil
end
101
+
102
# Indices of all windows of +s+ whose score reaches +score_g+.
# With use2strands the better of the window and its reverse complement
# (String#revcomp!, defined outside this block) is used.
def find_hits(s, score_g, use2strands = true)
  hits = []
  (0..(s.size - @size)).each do |start|
    forward = s[start, @size]
    reverse = s[start, @size].revcomp!
    forward_score = score(forward)
    reverse_score = score(reverse)
    best = use2strands ? [forward_score, reverse_score].max : forward_score
    hits << start if best >= score_g
  end
  hits
end
110
+
111
# Collects every window of +s+ whose score reaches +score_g+.
#
# Returns an Array of [score, sequence, on_reverse_strand, index] entries.
# Windows are scored in upper case but returned as they appear in +s+.
#
# use2strands - when true (default) the reverse complement of each window
#               (String#revcomp!, defined outside this block) is scored and
#               reported as well. When false only forward-strand hits are
#               reported, matching find_hit / find_hits / best_hit; the
#               flag used to be ignored.
def collect_hits(s, score_g, use2strands = true)
  result = []
  (0..(s.size - @size)).each { |i|
    seq = s[i, @size]
    score_p = score(seq.upcase)
    result << [score_p, seq, false, i] if score_p >= score_g

    next unless use2strands
    seq_rc = s[i, @size].revcomp!
    score_rc = score(seq_rc.upcase)
    result << [score_rc, seq_rc, true, i] if score_rc >= score_g
  }
  result
end
121
+
122
# Highest window score found in +s+; with use2strands the reverse
# complement of every window (String#revcomp!) competes as well.
# Reports "too short sequence" through checkerr when +s+ is shorter than
# the matrix.
def best_hit(s, use2strands = true)
  checkerr("too short sequence") { s.size < @size }

  best = -Float::MAX
  (0..(s.size - @size)).each do |start|
    forward = s[start, @size]
    reverse = s[start, @size].revcomp!
    forward_score = score(forward)
    reverse_score = score(reverse)
    candidates = use2strands ? [best, forward_score, reverse_score] : [best, forward_score]
    best = candidates.max
  end
  best
end
131
+
132
# True when the A, C, G and T columns of +pm+ are eql? to this matrix's.
def eql?(pm)
  ['A','C','G','T'].all? { |letter| @matrix[letter].eql?(pm.matrix[letter]) }
end
137
+
138
# Tolerant comparison: true when every A/C/G/T cell differs from the
# corresponding cell of +pm+ by less than 10**-11.
# The leading checkerr is invoked with an always-true condition.
def flexeql?(pm)
  checkerr("for what?") { true }
  tolerance = 10**-11
  ['A','C','G','T'].all? { |letter|
    (0...@size).all? { |position|
      (@matrix[letter][position] - pm.matrix[letter][position]).abs < tolerance
    }
  }
end
148
+
149
# size        - number of columns.
# matrix      - letter => Array hash; when nil an empty (nil-filled) matrix
#               of +size+ columns is allocated.
# words_count - number of words behind the matrix. When missing or not
#               positive it is derived from the sum of column 0 (rounded),
#               and left nil if that sum rounds to less than 2.
#
# Reports a size mismatch through checkerr when matrix['A'] does not have
# +size+ entries.
def initialize(size, matrix = nil, words_count = nil)
  # The message reads matrix['A'], so it is only built once a matrix is
  # known to be present; otherwise PM.new(size) would fail on nil['A'].
  if matrix != nil && size != matrix['A'].size
    checkerr("matrix['A'].size != size, #{matrix['A'].size} != #{size}") { true }
  end
  @size = size
  @matrix = matrix == nil ? PM.new_matrix(size) : matrix
  if words_count && words_count > 0
    @words_count = words_count
  elsif matrix == nil || size <= 0
    # A freshly allocated or zero-width matrix has no column to sum.
    @words_count = nil
  else
    estimate = col_sum(0).round
    @words_count = estimate >= 2 ? estimate : nil
  end
end
160
+
161
# Sum of column +index+ over the letters in +letset+.
def col_sum(index = 0, letset = ['A','C','G','T'])
  total = 0
  letset.each { |letter| total += @matrix[letter][index] }
  total
end
164
+
165
# Sum of the A, C, G and T entries of column +index+ of a raw matrix hash.
def PM.col_sum(matrix, index = 0)
  ['A', 'C', 'G', 'T'].collect { |letter| matrix[letter][index] }.inject { |sum, value| sum + value }
end
168
+
169
# Converts the matrix in place: every cell x becomes
#   log( (x + probs[letter]*pseudocount) / ((@words_count + pseudocount) * probs[letter]) )
# A positive +words_count+ replaces @words_count first. Returns self.
def to_pwm!(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1)
  @words_count = words_count if words_count && words_count > 0

  @matrix.each do |letter, column|
    (0...@size).each do |pos|
      column[pos] = Math::log( (column[pos] + (probs[letter] * pseudocount)) / ( (@words_count + pseudocount) * probs[letter]) )
    end
  end

  self
end
188
+
189
# Non-destructive variant of to_pwm!: converts a copy (#dup) and returns it.
def get_pwm(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1)
  copy = dup
  copy.to_pwm!(words_count, probs, pseudocount)
end
alias to_pwm get_pwm
193
+
194
# Builds a PPM by dividing every cell by +words_count+ (default: @words_count).
# The result has IUPAC columns when this matrix has an 'N' column.
# Reports "undefined words count" through checkerr when no positive count
# is available.
def get_ppm(words_count = nil)
  words_count ||= @words_count
  checkerr("undefined words count") { !words_count || words_count <= 0 }
  ppm = @matrix['N'] ? PM.new_matrix_iupac(@size) : PM.new_matrix(@size)
  @matrix.each_key do |letter|
    (0...@size).each do |i|
      ppm[letter][i] = @matrix[letter][i].to_f / words_count
    end
  end
  PPM.new(@size, ppm, words_count)
end
alias to_ppm get_ppm
206
+
207
# Sum of the matrix cells selected by the letters of +word+.
# checkerr reports a length mismatch and characters outside the matrix
# alphabet (the full IUPAC alphabet when an 'N' column exists, ACGT otherwise).
def score(word)
  checkerr("word size != pwm.size") { @size != word.size }
  checkerr("word #{word} has strange characters") {
    alphabet = @matrix.keys.include?('N') ? 'ACGTRYKMSWBDHVN' : 'ACGT'
    word.tr(alphabet, '').size > 0
  }
  total = 0
  (0...@size).each { |i| total += @matrix[word[i,1]][i] }
  total
end
216
+
217
# Sum of the per-column maxima over A, C, G, T.
def best_score
  total = 0
  (0...size).each do |i|
    total += ['A', 'C', 'G', 'T'].collect { |l| @matrix[l][i] }.max
  end
  total
end
222
+
223
# Sum of the per-column minima over A, C, G, T.
def worst_score
  total = 0
  (0...size).each do |i|
    total += ['A', 'C', 'G', 'T'].collect { |l| @matrix[l][i] }.min
  end
  total
end
228
+
229
+ def dup
230
+ new_matrix = {}
231
+ @matrix.each_key { |letter| new_matrix[letter] = @matrix[letter].dup }
232
+ return PM.new(@size, new_matrix, @words_count)
233
+ end
234
+
235
# Builds a count matrix from an Array of equal-length words.
# Letters are upcased; 'N' adds 0.25 to each of A, C, G, T; any other
# character outside ACGT is reported through checkerr.
# iupacomp - when true the IUPAC columns are added via iupacomp!.
def PM.new_pcm(words, iupacomp = false)
  size = words[0].size
  counts = PM.new_matrix(size)
  counts.each_value { |column| column.fill(0) }

  words.each do |word|
    (0...size).each do |i|
      letter = word[i,1].upcase
      checkerr("unknown letter #{letter}") { !['A', 'C', 'G', 'T', 'N'].include?(letter) }
      if letter == 'N'
        ['A', 'C', 'G', 'T'].each { |l| counts[l][i] += 0.25 }
      else
        counts[letter][i] += 1
      end
    end
  end

  newpcm = PM.new(size, counts, words.size)
  newpcm.iupacomp! if iupacomp
  newpcm
end
254
+
255
# Builds a count matrix from +words+ and converts it in place with to_pwm!
# (default arguments). Returns the converted matrix.
def PM.new_pwm(words)
  matrix = PM.new_pcm(words)
  matrix.to_pwm!
  matrix
end
260
+
261
# Loads a matrix from a text file in .pat or .pwm layout
# (letter-column or letter-row format).
#
# Lines that contain anything other than numbers (e.g. a name header) and
# blank lines are skipped. Exactly four numeric rows are taken as
# letter-row layout and transposed. Returns a PPM when every column sum
# rounds to 1, otherwise a PM.
def PM.load(filename)
  # File.read rather than IO.read: IO.read treats a name starting with "|"
  # as a command to run.
  input = File.read(filename)
  tm = []
  input.each_line { |line|
    l_a = line.split
    # A blank line would otherwise be stored as an empty row and leave nil
    # cells in the matrix.
    next if l_a.empty?
    begin
      l_a = l_a.collect { |a_i| Float(a_i) }
    rescue
      next
    end
    tm << l_a
  }
  tm = tm.transpose if tm.size == 4
  matrix = PM.new_matrix(tm.size)
  tm.each_index { |i| ['A', 'C', 'G', 'T'].each_with_index { |l, j| matrix[l][i] = tm[i][j] } }

  ppm_mode = (0...tm.size).inject(true) { |ppm_ya, i| ppm_ya &= col_sum(matrix, i).round == 1 }

  return ppm_mode ? PPM.new(tm.size, matrix) : PM.new(tm.size, matrix)
end
282
+
283
# Writes the matrix to +filename+; the layout is chosen from the value
# returned by File.ext_wo_name(filename):
#   "pwm" - one line per letter (A, C, G, T), values separated by spaces
#   "pat" - a name line, then one line per column with the A C G T values
# "xml" and any other value are reported through checkerr.
# NOTE(review): the file is opened for writing before the format is checked.
def save(filename)
  letters = ['A', 'C', 'G', 'T']
  File.open(filename, "w") do |out_f|
    case File.ext_wo_name(filename)
    when "pwm"
      letters.each do |letter|
        @matrix[letter].each { |value| out_f << "#{value} " }
        out_f << $/
      end
    when "pat"
      out_f.puts File.name_wo_ext(filename)
      (0...@size).each do |i|
        letters.each { |letter| out_f << "#{@matrix[letter][i]} " }
        out_f << $/
      end
    when "xml"
      checkerr("small-BiSMark is not supported at this moment")
    else
      checkerr("unknown motif file format specified")
    end
  end
end
308
+
309
# Adds the absolute value of the smallest cell to every cell, in place.
# Returns self.
def positiv!
  shift = @matrix.values.collect { |column| column.min }.min.abs
  @matrix.each_value do |column|
    column.collect! { |value| value + shift }
  end
  self
end
314
+
315
# Reverse-complements the matrix in place and returns self.
#
# The A/T and C/G columns are exchanged and every column is reversed.
# When the matrix also carries IUPAC columns, the codes that are each
# other's complement (R/Y, K/M, B/V, D/H) are exchanged too, so they stay
# consistent with the swapped A/C/G/T columns; S, W and N are their own
# complement and are only reversed.
def revcomp!
  @matrix['A'], @matrix['T'] = @matrix['T'], @matrix['A']
  @matrix['C'], @matrix['G'] = @matrix['G'], @matrix['C']
  [['R', 'Y'], ['K', 'M'], ['B', 'V'], ['D', 'H']].each do |code, partner|
    # Plain ACGT matrices have none of these keys and must not gain them.
    next unless @matrix.has_key?(code) && @matrix.has_key?(partner)
    @matrix[code], @matrix[partner] = @matrix[partner], @matrix[code]
  end
  @matrix.each_value { |v| v.reverse! }
  self
end
321
+
322
# Appends this matrix to the REXML element +b+.
# The new element is named "PWM" when the first A cell is a Float, "PCM"
# otherwise; it carries a "length" attribute, a "words-count" attribute
# when a positive count is set, and one <pm-column> per position with
# <a>/<c>/<g>/<t> children.
def to_bismark(b)
  tag = @matrix['A'][0].is_a?(Float) ? "PWM" : "PCM"
  attributes = {"length" => @size}
  attributes["words-count"] = @words_count if @words_count && @words_count > 0
  pe = b.add_element(tag, attributes)
  (0...@matrix['A'].size).each do |i|
    column = pe.add_element("pm-column", {"position" => i+1})
    ['A', 'C', 'G', 'T'].each do |l|
      column.add_element(l.downcase).add_text(@matrix[l][i].to_s)
    end
  end
end
334
+
335
# Builds a matrix object from a small-BiSMark matrix element.
#
# b        - REXML element named PPM, PWM, WPCM or PCM with <pm-column>
#            children (1-based "position" attribute, <a>/<c>/<g>/<t> values)
#            and an optional "words-count" attribute.
# iupacomp - when true, IUPAC columns are added (iupacomp!) to PPM and PCM
#            results; raises for a PWM.
#
# Values are read as floats for PPM/PWM/WPCM and as integers otherwise.
# Returns a PPM for a PPM element and a PM for everything else.
def PM.from_bismark(b, iupacomp = false)

  checkerr("empty small-BiSMark file?") { !b }
  float_m = (b.name == "PPM" || b.name == "PWM" || b.name == "WPCM")
  words_count = b.attributes["words-count"] ? b.attributes["words-count"].to_f : nil

  matrix = {"A" => [], "C" => [], "G" => [], "T" => []}
  b.elements.each("pm-column") { |pmc|
    position = pmc.attributes["position"].to_i
    ['A', 'C', 'G', 'T'].each { |l|
      text = pmc.elements[l.downcase].get_text.to_s
      matrix[l][position-1] = float_m ? text.to_f : text.to_i
    }
  }
  if b.name == "PPM"
    newppm = PPM.new(matrix['A'].size, matrix, words_count)
    newppm.iupacomp! if iupacomp
    return newppm
  end
  if b.name == "PCM"
    # A former "@words_count = col_sum(matrix)" here only set an instance
    # variable on the PM class object and was never read; it is dropped.
    newpcm = PM.new(matrix['A'].size, matrix, words_count)
    newpcm.iupacomp! if iupacomp
    return newpcm
  end
  if b.name == "PWM" && iupacomp
    raise "cannot force IUPAC compatible PWM"
  end
  return PM.new(matrix['A'].size, matrix, words_count)
end
362
+
363
+ IUPAC_LS = (IUPAC::CODE.keys - ['A','C','G','T']).collect { |iul| [IUPAC::CODE[iul], iul.split(//)] }
364
# Adds one column set per IUPAC ambiguity code (see IUPAC_LS); each cell is
# the mean of the cells of the letters the code stands for. Returns self.
#
# When @words_count is unset it is taken from the largest column sum.
# The mean is computed in floating point: with integer count matrices the
# former integer division truncated it (e.g. (3 + 2) / 2 gave 2).
def iupacomp!
  @words_count = (0...@size).collect { |i| col_sum(i) }.max unless @words_count # for unbalanced matrices (Genomatix has some)
  # @words_count = @words_count.round < 2.0 ? nil : @words_count.round

  IUPAC_LS.each { |iul_ls|
    @matrix[iul_ls[0]] = (0...@size).collect { |i| col_sum(i, iul_ls[1]) / iul_ls[1].size.to_f }
  }

  return self
end
374
+
375
# Mean score plus three standard deviations under the background +bckgr+
# (same per-column mean and variance sums as score_mean / score_variance).
def m3sd(bckgr = Randoom::DEF_PROBS)
  letters = ['A','C','G','T']

  mean = 0.0
  (0...@size).each do |i|
    column_mean = 0.0
    letters.each { |l| column_mean += @matrix[l][i] * bckgr[l] }
    mean += column_mean
  end

  dev = 0.0
  (0...@size).each do |i|
    mean_of_squares = 0.0
    letters.each { |l| mean_of_squares += @matrix[l][i]**2 * bckgr[l] }
    column_mean = 0.0
    letters.each { |l| column_mean += matrix[l][i] * bckgr[l] }
    dev += mean_of_squares - column_mean**2
  end

  mean + 3*Math.sqrt(dev)
end
386
+
387
# When a words count is already set, replaces it with the largest column
# sum; does nothing (and returns nil) otherwise.
def fixwc
  if @words_count
    @words_count = (0...@size).collect { |i| col_sum(i) }.max
  end
end
391
+
392
+ protected
393
# Returns a hash with one nil-filled Array of +size+ entries for each of
# A, C, G and T.
def PM.new_matrix(size)
  matrix = {}
  ['A', 'C', 'G', 'T'].each { |letter| matrix[letter] = Array.new(size) }
  matrix
end
400
+
401
# Returns a hash with one nil-filled Array of +size+ entries for each of
# the 15 IUPAC codes (A C G T R Y K M S W B D H V N, in that order).
def PM.new_matrix_iupac(size)
  matrix = {}
  ['A', 'C', 'G', 'T', 'R', 'Y', 'K', 'M', 'S', 'W', 'B', 'D', 'H', 'V', 'N'].each do |code|
    matrix[code] = Array.new(size)
  end
  matrix
end
420
+
421
+ end
422
+
423
+ class PPM < PM
424
+
425
+ #DEPRECATED, use iupacomp! instead
426
+ #def make_N_comp!
427
+ # @matrix['N'] = (0...size).collect { 0.25 }
428
+ # return self
429
+ #end
430
+
431
# size        - number of columns.
# matrix      - letter => Array hash; a nil-filled matrix is allocated when nil.
# words_count - stored as given (no estimation is attempted).
# Reports a size mismatch through checkerr.
def initialize(size, matrix = nil, words_count = nil)
  checkerr("matrix['A'].size != size") { matrix != nil && size != matrix['A'].size }
  @size = size
  if matrix == nil
    @matrix = PM.new_matrix(size)
  else
    @matrix = matrix
  end
  @words_count = words_count
end
437
+
438
# Adds one column set per IUPAC ambiguity code (see IUPAC_LS); each cell is
# the column sum over the code's letters divided by the number of letters.
# An unset words count becomes 4.0. Returns self.
def iupacomp!
  @words_count ||= 4.0

  IUPAC_LS.each do |code, letters|
    @matrix[code] = (0...@size).collect { |i| col_sum(i, letters) / letters.size }
  end

  self
end
447
+
448
# Product of the matrix cells selected by the letters of +word+.
# checkerr reports a length mismatch and characters outside the matrix
# alphabet (the full IUPAC alphabet when an 'N' column exists, ACGT otherwise).
def score(word)
  checkerr("word size != ppm.size") { @size != word.size }
  checkerr("word #{word} has strange characters") {
    alphabet = @matrix.keys.include?('N') ? 'ACGTRYKMSWBDHVN' : 'ACGT'
    word.tr(alphabet, '').size > 0
  }
  product = 1
  (0...@size).each { |i| product *= @matrix[word[i,1]][i] }
  product
end
457
+
458
# Product of the per-column maxima over A, C, G, T.
def best_score
  product = 1
  (0...size).each do |i|
    product *= ['A', 'C', 'G', 'T'].collect { |l| @matrix[l][i] }.max
  end
  product
end
463
+
464
# Product of the per-column minima over A, C, G, T.
# The product starts from 1 (as in best_score); it used to start from 0,
# which made the result 0 for every matrix.
def worst_score
  return (0...size).inject(1) { |mul, i|
    mul * ['A', 'C', 'G', 'T'].collect { |l| @matrix[l][i] }.min
  }
end
469
+
470
# Appends this matrix to the REXML element +b+ as a <PPM> element with a
# "length" attribute, a "words-count" attribute when a count is set, and
# one <pm-column> per position holding <a>/<c>/<g>/<t> values.
def to_bismark(b)
  attributes = {"length" => @size}
  attributes["words-count"] = @words_count if @words_count
  pe = b.add_element("PPM", attributes)
  (0...@matrix['A'].size).each do |i|
    column = pe.add_element("pm-column", {"position" => i+1})
    ['A', 'C', 'G', 'T'].each do |l|
      column.add_element(l.downcase).add_text(@matrix[l][i].to_s)
    end
  end
end
481
+
482
# Extends +probs+ in place with one entry per IUPAC ambiguity code: the sum
# of the entries of the code's letters divided by the number of letters.
# Returns +probs+.
def PPM.probs2IUPAC!(probs)
  IUPAC_LS.each do |code, letters|
    total = letters.inject(0) { |sum, l| sum + probs[l] }
    probs[code] = total / letters.size
  end
  probs
end
488
+
489
# Builds a PM whose cells are
#   log( (p * words_count + probs[letter]*pseudocount) / ((words_count + pseudocount) * probs[letter]) )
# +probs+ is copied and extended with IUPAC entries first (PPM.probs2IUPAC!).
# +words_count+ falls back to @words_count when nil or 0; a missing count is
# reported through checkerr.
def get_pwm(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1.0)
  probs = PPM.probs2IUPAC!(probs.dup)

  words_count = @words_count if !words_count || words_count == 0
  checkerr("undefined words count") { !words_count }

  pwm = @matrix['N'] ? PM.new_matrix_iupac(@size) : PM.new_matrix(@size)

  @matrix.each do |letter, column|
    (0...@size).each do |pos|
      pwm[letter][pos] = Math::log( (column[pos] * words_count + (probs[letter] * pseudocount) ) / ( (words_count + pseudocount) * probs[letter]) )
    end
  end

  PM.new(@size, pwm, words_count)
end
alias to_pwm get_pwm
511
+
512
# Builds a PM whose cells are log(p / probs[letter]), i.e. without any
# pseudocount.
def get_pwm0pc(probs = Randoom::DEF_PROBS)
  log_odds = {}
  @matrix.each { |letter, column| log_odds[letter] = column.dup }
  # The PM is created while log_odds still holds the copied probabilities;
  # the cells are rewritten in place afterwards.
  newpm = PM.new(@size, log_odds, nil)

  log_odds.each do |letter, column|
    (0...@size).each do |pos|
      column[pos] = Math::log(@matrix[letter][pos] / probs[letter])
    end
  end

  newpm
end
525
+
526
# In-place conversion is not available for this class; always raises.
def to_pwm!
  message = "cannot force PPM class to PWM, use to_pwm instead"
  raise message
end
529
+
530
# Builds a count matrix by multiplying every A/C/G/T cell by +words_count+
# (default: @words_count) and adds its IUPAC columns via iupacomp!.
# A missing count is reported through checkerr.
def get_pcm(words_count = nil)
  words_count ||= @words_count
  checkerr("undefined words count") { !words_count }
  counts = PM.new_matrix(@size)
  ['A', 'C', 'G', 'T'].each do |l|
    (0...size).each { |i| counts[l][i] = @matrix[l][i] * words_count }
  end
  PM.new(size, counts, words_count).iupacomp!
end
alias to_pcm get_pcm
543
+
544
# Builds a PPM from an IUPAC word: at every position the letters covered by
# the code share the probability equally and all other letters get 0.0.
# The result uses a words count of 4.0 and has its IUPAC columns added.
def PPM.from_IUPAC(iupac)
  matrix = {"A" => [], "C" => [], "G" => [], "T" => []}

  (0...iupac.size).each do |i|
    matrix.each_value { |column| column << 0.0 }
    letters = IUPAC::REVCODE[iupac[i]]
    (0...letters.size).each do |j|
      matrix[letters[j]][-1] = 1.0/letters.size
    end
  end

  newppm = PPM.new(iupac.size, matrix, 4.0)
  newppm.iupacomp!
  newppm
end
560
+
561
+ end
562
+ end