sequence_logo 1.1.2 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/TODO.txt +2 -0
- data/lib/sequence_logo/canvases/logo_canvas.rb +1 -1
- data/lib/sequence_logo/cli.rb +0 -3
- data/lib/sequence_logo/data_models/ppm_logo.rb +6 -12
- data/lib/sequence_logo/di_pm.rb +99 -0
- data/lib/sequence_logo/exec/glue_logos.rb +2 -4
- data/lib/sequence_logo/exec/sequence_logo.rb +10 -5
- data/lib/sequence_logo/magick_support.rb +1 -1
- data/lib/sequence_logo/pmflogo_lib.rb +2 -1
- data/lib/sequence_logo/ppm_support.rb +81 -0
- data/lib/sequence_logo/version.rb +1 -1
- data/lib/sequence_logo/ytilib/ppm_support.rb +0 -70
- metadata +5 -13
- data/lib/sequence_logo/ytilib.rb +0 -10
- data/lib/sequence_logo/ytilib/addon.rb +0 -247
- data/lib/sequence_logo/ytilib/bismark.rb +0 -71
- data/lib/sequence_logo/ytilib/hack1.rb +0 -75
- data/lib/sequence_logo/ytilib/infocod.rb +0 -108
- data/lib/sequence_logo/ytilib/iupac.rb +0 -92
- data/lib/sequence_logo/ytilib/pm.rb +0 -574
- data/lib/sequence_logo/ytilib/pmsd.rb +0 -99
- data/lib/sequence_logo/ytilib/randoom.rb +0 -131
- data/lib/sequence_logo/ytilib/ytilib.rb +0 -147
@@ -1,92 +0,0 @@
|
|
1
|
-
# DNA word written in the IUPAC nucleotide alphabet.
#
# Behaves like a String, except that +==+ tests position-wise compatibility
# (two letters match when the nucleotide sets they denote overlap) instead of
# literal equality. +hash+ and +eql?+ are NOT overridden, so instances keep
# plain String semantics when used as Hash keys.
class IUPAC < String
  # Sorted nucleotide set => IUPAC letter.
  CODE = {
    "A" => "A", "C" => "C", "G" => "G", "T" => "T",
    "AG" => "R", "CT" => "Y", "GT" => "K", "AC" => "M",
    "CG" => "S", "AT" => "W",
    "CGT" => "B", "AGT" => "D", "ACT" => "H", "ACG" => "V",
    "ACGT" => "N"
  }.freeze

  # IUPAC letter => sorted nucleotide set.
  REVCODE = CODE.invert.freeze

  # Compatibility table: IUPACOM[x][y] is :llib when the nucleotide sets of
  # letters x and y share at least one nucleotide, and absent otherwise.
  IUPACOM = REVCODE.keys.each_with_object({}) do |letter, table|
    table[letter] = REVCODE.keys.each_with_object({}) do |other, row|
      shared = REVCODE[letter].split(//) & REVCODE[other].split(//)
      row[other] = :llib unless shared.empty?
    end
  end

  # Returns a new IUPAC instance with the same content.
  def dup
    IUPAC.new(self)
  end

  # Builds an IUPAC word.
  #
  # words - an Array of equally long plain words (collapsed column by column
  #         into one IUPAC word; an empty Array gives an empty word), another
  #         IUPAC instance (copied as is), or a String (validated against the
  #         accepted alphabet). Any other argument leaves the word empty.
  def initialize(words)
    if words.is_a?(Array)
      width = words.empty? ? 0 : words[0].size
      collapsed = (0...width).inject("") do |iup, i|
        cola = words.map { |word| word[i, 1] }.uniq.sort.join
        checkerr("bad letter set #{cola}") { !CODE.has_key?(cola) }
        iup + CODE[cola]
      end
      super(collapsed)
    elsif words.is_a?(IUPAC)
      super(words)
    elsif words.is_a?(String)
      checkerr("word #{words} has strange characters") { words.tr('ACGTURYKMSWBDHVN', '').size > 0 }
      super(words)
    end
  end

  # Compatibility test: true when both words have the same length and every
  # pair of letters is compatible according to IUPACOM. Non-String arguments
  # and letters missing from IUPACOM compare as false instead of raising.
  def ==(iupac)
    return false unless iupac.is_a?(String) && size == iupac.size

    (0...size).all? do |i|
      row = IUPACOM[self[i, 1]]
      !row.nil? && row.key?(iupac[i, 1])
    end
  end

  # Position-wise union of two words of equal length.
  # Returns a new IUPAC, or nil when the lengths differ.
  def merge(iupac)
    return nil if size != iupac.size

    merged = (0...size).inject("") do |acc, i|
      letters = (REVCODE[self[i, 1]].split(//) + REVCODE[iupac[i, 1]].split(//)).uniq.sort.join
      acc << CODE[letters]
    end
    IUPAC.new(merged)
  end

  # Searches for a window of this word compatible with +iupac+.
  # Returns the index of the first such window, or false when there is none
  # or when +iupac+ is not an IUPAC instance.
  def include?(iupac)
    # Type check first so that arbitrary objects are rejected without
    # calling #size on them.
    return false if !iupac.is_a?(IUPAC) || size < iupac.size

    hit = (0..size - iupac.size).find { |i| IUPAC.new(self[i, iupac.size]) == iupac }
    hit.nil? ? false : hit
  end

  # Returns the complement (not reversed) of the word.
  def compl
    tr("ACGTRYKMSWBDHVN", "TGCAYRMKSWVHDBN")
  end

  # Complements the word in place and returns self.
  def compl!
    tr!("ACGTRYKMSWBDHVN", "TGCAYRMKSWVHDBN")
    self
  end

  alias reverse_string reverse

  # Reversed word wrapped in a new IUPAC instance.
  def reverse
    IUPAC.new(reverse_string)
  end

  alias comp! compl!
  alias complement! compl!
  alias comp compl
  alias complement compl
end
|
@@ -1,574 +0,0 @@
|
|
1
|
-
module Ytilib
|
2
|
-
class PM
|
3
|
-
|
4
|
-
attr_reader :matrix, :size
|
5
|
-
attr_accessor :words_count
|
6
|
-
|
7
|
-
alias length size
|
8
|
-
|
9
|
-
# Passes every column index of the matrix (taken from the 'A' row) to the
# given block.
def each_position_index(&block)
  adenine_row = @matrix['A']
  adenine_row.each_index(&block)
end
|
12
|
-
|
13
|
-
# Yields each matrix column as an [A, C, G, T] array.
# Without a block, returns an enumerator over the columns.
def each_position(&block)
  return enum_for(:each_position) unless block_given?

  @matrix['A'].each_index do |column|
    yield %w[A C G T].map { |nucleotide| @matrix[nucleotide][column] }
  end
end
|
20
|
-
|
21
|
-
# Expected score of a random word under the background distribution.
#
# bckgr - Hash mapping 'A', 'C', 'G', 'T' to background probabilities.
#
# Returns the sum over all columns of the background-weighted column mean.
def score_mean(bckgr = Randoom::DEF_PROBS)
  (0...@size).inject(0.0) do |mean, i|
    mean + ['A', 'C', 'G', 'T'].inject(0.0) { |sum, l| sum + @matrix[l][i] * bckgr[l] }
  end
end
|
24
|
-
|
25
|
-
# Variance of the score of a random word under the background distribution.
#
# bckgr - Hash mapping 'A', 'C', 'G', 'T' to background probabilities.
#
# Returns the sum over all columns of E[x^2] - E[x]^2.
def score_variance(bckgr = Randoom::DEF_PROBS)
  letters = ['A', 'C', 'G', 'T']
  (0...@size).inject(0.0) do |variance, i|
    second_moment = letters.inject(0.0) { |sum, l| sum + @matrix[l][i]**2 * bckgr[l] }
    first_moment = letters.inject(0.0) { |sum, l| sum + @matrix[l][i] * bckgr[l] }
    variance + (second_moment - first_moment**2)
  end
end
|
31
|
-
|
32
|
-
# Normal-approximation P-value of a score threshold.
#
# threshold - score threshold.
# mean      - score mean; computed with score_mean when omitted.
# variance  - score variance; computed with score_variance when omitted.
#
# Returns the upper-tail probability of a normal distribution with the given
# mean and variance, i.e. (1 - erf(z / sqrt(2))) / 2 with
# z = (threshold - mean) / sqrt(variance).
def p_value(threshold, mean = nil, variance = nil)
  mean ||= score_mean
  variance ||= score_variance
  z = (threshold - mean) / Math.sqrt(variance)
  # erfc(x) == 1 - erf(x); the standard-library function is used directly.
  Math.erfc(z / Math.sqrt(2)) / 2.0
end
|
38
|
-
|
39
|
-
# Word made of the highest-scoring letter of every column.
# A column whose maximum is shared by several letters contributes "N".
def best_word
  letters = ['A', 'C', 'G', 'T']
  (0...@size).map { |i|
    top = letters.map { |l| @matrix[l][i] }.max
    winners = letters.select { |l| @matrix[l][i] == top }
    winners.size == 1 ? winners.first : "N"
  }.join
end
|
46
|
-
|
47
|
-
# IUPAC consensus built from the top-scoring letters only: for every column
# the letters sharing the column maximum are encoded via IUPAC::CODE.
#
# Returns an IUPAC instance.
def strict_consensus
  letters = ['A', 'C', 'G', 'T']
  word = (0...@size).inject("") do |acc, i|
    top = letters.map { |l| @matrix[l][i] }.max
    winners = letters.select { |l| @matrix[l][i] == top }.join
    acc + IUPAC::CODE[winners]
  end
  IUPAC.new(word)
end
|
54
|
-
|
55
|
-
# Consensus of the matrix as a plain String.
#
# For each column the distinct scores are ranked in descending order and the
# column's information content (infocod) decides how many of the top ranks
# are kept: one above icd2of4, two above icdThc, three above icdTlc, all
# otherwise. Letters whose score is among the kept ranks are encoded with
# IUPAC::CODE.
#
# beautiful - when true, letters standing for more than two nucleotides are
#             written in lower case.
#
# Raises (through checkerr) when the words count is undefined.
def consensus_string(beautiful = false)
  checkerr("words count is undefined") { !@words_count }
  i2o4, thc, tlc = icd2of4, icdThc, icdTlc
  icd = infocod
  letters = ['A', 'C', 'G', 'T']

  (0...@size).inject("") do |word, i|
    scores = letters.map { |l| @matrix[l][i] }.uniq.sort.reverse

    if icd[i] > i2o4
      scores = [scores.first]
    elsif icd[i] > thc
      scores = scores[0..1]
    elsif icd[i] > tlc
      scores = scores[0..2]
    end

    kept = letters.select { |l| scores.include?(@matrix[l][i]) }.join
    reslet = IUPAC::CODE[kept]
    reslet = reslet.downcase if beautiful && kept.size > 2

    word + reslet
  end
end
|
80
|
-
|
81
|
-
# Consensus of the matrix as an IUPAC word.
#
# The per-column letter selection is exactly the one performed by
# consensus_string (without the lower-case decoration), so the work is
# delegated to it and the result is wrapped in an IUPAC instance.
#
# Raises (through checkerr) when the words count is undefined.
def consensus
  IUPAC.new(consensus_string(false))
end
|
103
|
-
|
104
|
-
# Position of the first window of +s+ scoring at least +score_g+.
#
# s           - sequence to scan.
# score_g     - minimal accepted score.
# use2strands - when true, the reverse complement of each window is scored
#               too and the better of the two scores is used.
#
# Returns the window start index, or nil when no window qualifies.
def find_hit(s, score_g, use2strands = true)
  (0..(s.size - @size)).find do |i|
    best = score(s[i, @size])
    # The reverse strand is only evaluated when it can influence the result.
    best = [best, score(s[i, @size].revcomp!)].max if use2strands
    best >= score_g
  end
end
|
113
|
-
|
114
|
-
# Positions of all windows of +s+ scoring at least +score_g+.
#
# s           - sequence to scan.
# score_g     - minimal accepted score.
# use2strands - when true, the reverse complement of each window is scored
#               too and the better of the two scores is used.
#
# Returns an Array of window start indices (empty when nothing qualifies).
def find_hits(s, score_g, use2strands = true)
  (0..(s.size - @size)).select do |i|
    best = score(s[i, @size])
    best = [best, score(s[i, @size].revcomp!)].max if use2strands
    best >= score_g
  end
end
|
122
|
-
|
123
|
-
# Collects every window of +s+ whose score reaches +score_g+.
#
# s           - sequence to scan (windows are upcased before scoring; the
#               reported sequence keeps its original case).
# score_g     - minimal accepted score.
# use2strands - when true (default) the reverse complement of every window is
#               examined as well; when false only the direct strand is
#               reported.
#
# Returns an Array of [score, sequence, reverse_strand?, position] entries.
def collect_hits(s, score_g, use2strands = true)
  result = []
  (0..(s.size - @size)).each do |i|
    seq = s[i, @size]
    score_p = score(seq.upcase)
    result << [score_p, seq, false, i] if score_p >= score_g

    next unless use2strands

    seq_rc = s[i, @size].revcomp!
    score_rc = score(seq_rc.upcase)
    result << [score_rc, seq_rc, true, i] if score_rc >= score_g
  end
  result
end
|
133
|
-
|
134
|
-
# Best score reached by any window of +s+.
#
# s           - sequence to scan; must be at least as long as the matrix.
# use2strands - when true, the reverse complement of each window competes
#               as well.
#
# Returns the maximal score. Raises (through checkerr) when the sequence is
# shorter than the matrix.
def best_hit(s, use2strands = true)
  checkerr("too short sequence") { s.size < @size }
  (0..(s.size - @size)).inject(-Float::MAX) do |best, i|
    candidates = [best, score(s[i, @size])]
    candidates << score(s[i, @size].revcomp!) if use2strands
    candidates.max
  end
end
|
143
|
-
|
144
|
-
# Strict matrix equality: true when the A, C, G and T rows of both matrices
# are eql? to each other.
#
# pm - object responding to #matrix.
def eql?(pm)
  ['A', 'C', 'G', 'T'].all? { |letter| @matrix[letter].eql?(pm.matrix[letter]) }
end
|
149
|
-
|
150
|
-
# Approximate matrix equality (element-wise difference below 10**-11).
#
# NOTE: the leading checkerr is called with a block that is always true, so
# the comparison below is only reached if checkerr returns in that case.
def flexeql?(pm)
  checkerr("for what?") { true }
  tolerance = 10**-11
  ['A', 'C', 'G', 'T'].all? do |letter|
    (0...@size).all? do |position|
      (@matrix[letter][position] - pm.matrix[letter][position]).abs < tolerance
    end
  end
end
|
160
|
-
|
161
|
-
# Creates a positional matrix.
#
# size        - number of columns.
# matrix      - Hash of letter => Array of column values; a blank matrix is
#               allocated with PM.new_matrix when omitted.
# words_count - number of words behind the matrix. When missing or not
#               positive it is derived from the first column sum (rounded),
#               and left nil when that sum is below 2, the matrix is blank,
#               or the matrix has no columns.
#
# Raises (through checkerr) when +size+ disagrees with the 'A' row length.
def initialize(size, matrix = nil, words_count = nil)
  # The message interpolates matrix['A'], so it may only be built when a
  # matrix was actually supplied.
  unless matrix.nil?
    checkerr("matrix['A'].size != size, #{matrix['A'].size} != #{size}") { size != matrix['A'].size }
  end

  @size = size
  @matrix = matrix.nil? ? PM.new_matrix(size) : matrix

  if words_count && words_count > 0
    @words_count = words_count
  elsif matrix.nil? || size == 0
    # No column holds values yet, so there is nothing to sum.
    @words_count = nil
  else
    total = col_sum(0).round
    @words_count = total >= 2 ? total : nil
  end
end
|
172
|
-
|
173
|
-
# Sum of the values of one matrix column.
#
# index  - column index.
# letset - letters whose rows take part in the sum.
def col_sum(index = 0, letset = ['A', 'C', 'G', 'T'])
  letset.inject(0) { |sum, l| sum + @matrix[l][index] }
end
|
176
|
-
|
177
|
-
# Sum of the A, C, G and T values of one column of a raw matrix Hash.
#
# matrix - Hash of letter => Array of column values.
# index  - column index.
def self.col_sum(matrix, index = 0)
  ['A', 'C', 'G', 'T'].inject(0) { |sum, letter| sum + matrix[letter][index] }
end
|
180
|
-
|
181
|
-
# Converts the count matrix to log-odds weights in place.
#
# Every cell becomes
#   log((count + prob * pseudocount) / ((words_count + pseudocount) * prob))
#
# words_count - overrides the stored words count when positive.
# probs       - Hash of letter => background probability.
# pseudocount - pseudocount spread according to the background.
#
# Returns self. Raises (through checkerr) when no words count is known.
def to_pwm!(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1)
  @words_count = words_count if words_count && words_count > 0
  checkerr("undefined words count") { !@words_count }

  @matrix.each_key do |letter|
    (0...@size).each do |pos|
      @matrix[letter][pos] = Math.log((@matrix[letter][pos] + (probs[letter] * pseudocount)) / ((@words_count + pseudocount) * probs[letter]))
    end
  end

  self
end
|
200
|
-
|
201
|
-
# Non-destructive counterpart of to_pwm!: converts a copy of the matrix and
# returns it, leaving the receiver untouched.
def get_pwm(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1)
  copy = dup
  copy.to_pwm!(words_count, probs, pseudocount)
end
alias to_pwm get_pwm
|
205
|
-
|
206
|
-
# Builds a probability matrix by dividing every cell by the words count.
#
# words_count - divisor; the stored words count is used when omitted.
#
# Returns a PPM. Raises (through checkerr) when the words count is missing
# or not positive.
def get_ppm(words_count = nil)
  words_count ||= @words_count
  checkerr("undefined words count") { !words_count || words_count <= 0 }

  ppm = @matrix['N'] ? PM.new_matrix_iupac(@size) : PM.new_matrix(@size)
  @matrix.each_key do |letter|
    (0...@size).each do |i|
      ppm[letter][i] = @matrix[letter][i].to_f / words_count
    end
  end

  PPM.new(@size, ppm, words_count)
end
alias to_ppm get_ppm
|
218
|
-
|
219
|
-
# Additive score of a word: the sum of the matrix cells selected by the
# word's letters.
#
# word - String as long as the matrix; IUPAC letters are accepted only when
#        the matrix has an 'N' row.
#
# Raises (through checkerr) on a length mismatch or unexpected characters.
def score(word)
  checkerr("word size != pwm.size") { @size != word.size }
  checkerr("word #{word} has strange characters") do
    alphabet = @matrix.key?('N') ? 'ACGTRYKMSWBDHVN' : 'ACGT'
    word.tr(alphabet, '').size > 0
  end

  (0...@size).inject(0) { |sum, i| sum + @matrix[word[i, 1]][i] }
end
|
228
|
-
|
229
|
-
# Highest reachable additive score: the sum of the column maxima over the
# A, C, G and T rows.
def best_score
  (0...@size).inject(0) do |sum, i|
    sum + ['A', 'C', 'G', 'T'].map { |l| @matrix[l][i] }.max
  end
end
|
234
|
-
|
235
|
-
# Lowest reachable additive score: the sum of the column minima over the
# A, C, G and T rows.
def worst_score
  (0...@size).inject(0) do |sum, i|
    sum + ['A', 'C', 'G', 'T'].map { |l| @matrix[l][i] }.min
  end
end
|
240
|
-
|
241
|
-
# Copy of the matrix with its own row arrays.
#
# The copy is built with the receiver's own class, so duplicating a subclass
# instance yields an instance of that subclass.
def dup
  copied_rows = {}
  @matrix.each { |letter, row| copied_rows[letter] = row.dup }
  self.class.new(@size, copied_rows, @words_count)
end
|
246
|
-
|
247
|
-
# Builds a position count matrix from a list of aligned words.
#
# words    - non-empty Array of words; the first word defines the width.
#            Letters are upcased; 'N' adds 0.25 to each of A, C, G and T.
# iupacomp - when true, iupacomp! is applied to the result.
#
# Returns a PM whose words count is the number of words. Raises (through
# checkerr) on an empty list or an unknown letter.
def self.new_pcm(words, iupacomp = false)
  checkerr("empty words list") { words.empty? }

  size = words[0].size
  counts = PM.new_matrix(size)
  counts.each_value { |row| row.fill(0) }

  words.each do |word|
    (0...size).each do |i|
      letter = word[i, 1].upcase
      checkerr("unknown letter #{letter}") { !['A', 'C', 'G', 'T', 'N'].include?(letter) }
      if letter == 'N'
        ['A', 'C', 'G', 'T'].each { |l| counts[l][i] += 0.25 }
      else
        counts[letter][i] += 1
      end
    end
  end

  newpcm = PM.new(size, counts, words.size)
  newpcm.iupacomp! if iupacomp
  newpcm
end
|
266
|
-
|
267
|
-
# Builds a count matrix from +words+ with PM.new_pcm, converts it to weights
# in place with to_pwm! and returns it.
def PM.new_pwm(words)
  matrix = PM.new_pcm(words)
  matrix.to_pwm!
  matrix
end
|
272
|
-
|
273
|
-
# Loads a matrix from a text file.
#
# Both the letter-column ("pat") and the letter-row ("pwm") layouts are
# supported: lines that do not consist solely of numbers (names, headers)
# and blank lines are skipped, and a table of exactly four numeric rows is
# treated as letter-row and transposed.
#
# Returns a PPM when every column sum rounds to 1, a PM otherwise.
def self.load(filename)
  rows = []
  IO.read(filename).each_line do |line|
    fields = line.split
    # A blank line would otherwise be stored as an empty row.
    next if fields.empty?

    begin
      rows << fields.map { |field| Float(field) }
    rescue ArgumentError, TypeError
      # Not a purely numeric line: ignore it.
    end
  end

  rows = rows.transpose if rows.size == 4
  matrix = PM.new_matrix(rows.size)
  rows.each_with_index do |row, i|
    ['A', 'C', 'G', 'T'].each_with_index { |letter, j| matrix[letter][i] = row[j] }
  end

  ppm_mode = (0...rows.size).all? { |i| PM.col_sum(matrix, i).round == 1 }

  ppm_mode ? PPM.new(rows.size, matrix) : PM.new(rows.size, matrix)
end
|
294
|
-
|
295
|
-
# Writes the matrix to +filename+; the layout is chosen by the value that
# File.ext_wo_name returns for the file name:
#   "pwm" - one line per letter (A, C, G, T), values separated by spaces;
#   "pat" - the value of File.name_wo_ext, then one line per column;
#   "xml" and anything else - reported through checkerr.
def save(filename)
  File.open(filename, "w") do |out_f|
    extension = File.ext_wo_name(filename)
    if "pwm" === extension
      ['A', 'C', 'G', 'T'].each do |letter|
        @matrix[letter].each { |e| out_f << "#{e} " }
        out_f << $/
      end
    elsif "pat" === extension
      out_f.puts File.name_wo_ext(filename)
      (0...@size).each do |i|
        ['A', 'C', 'G', 'T'].each { |letter| out_f << "#{@matrix[letter][i]} " }
        out_f << $/
      end
    elsif "xml" === extension
      checkerr("small-BiSMark is not supported at this moment")
    else
      checkerr("unknown motif file format specified")
    end
  end
end
|
320
|
-
|
321
|
-
# Adds the absolute value of the smallest matrix cell to every cell,
# in place. Returns self.
def positiv!
  shift = @matrix.values.map { |row| row.min }.min.abs
  @matrix.each_value do |row|
    row.each_index { |i| row[i] += shift }
  end
  self
end
|
326
|
-
|
327
|
-
# Reverse-complements the matrix in place and returns self.
#
# Complementary rows are exchanged (A/T, C/G and, when the matrix carries
# IUPAC rows, R/Y, K/M, B/V and D/H; S, W and N are their own complements)
# and then every row is reversed.
def revcomp!
  [%w[A T], %w[C G], %w[R Y], %w[K M], %w[B V], %w[D H]].each do |first, second|
    next unless @matrix.key?(first) && @matrix.key?(second)

    @matrix[first], @matrix[second] = @matrix[second], @matrix[first]
  end
  @matrix.each_value { |row| row.reverse! }
  self
end
|
333
|
-
|
334
|
-
# Appends the matrix to the element +b+ as a "PWM" element (when the first
# 'A' cell is a Float) or a "PCM" element (otherwise), with one "pm-column"
# child per position holding the a, c, g and t values as text.
def to_bismark(b)
  tag = @matrix['A'][0].is_a?(Float) ? "PWM" : "PCM"
  attributes = {"length" => @size}
  attributes["words-count"] = @words_count if @words_count && @words_count > 0
  pe = b.add_element(tag, attributes)
  each_position_index do |i|
    pm_c = pe.add_element("pm-column", {"position" => i+1})
    ['A', 'C', 'G', 'T'].each do |l|
      pm_c.add_element(l.downcase).add_text(@matrix[l][i].to_s)
    end
  end
end
|
346
|
-
|
347
|
-
# Builds a matrix from a small-BiSMark element.
#
# b        - element named "PPM", "PCM", "PWM" or "WPCM" with "pm-column"
#            children (attribute "position", sub-elements a, c, g, t) and an
#            optional "words-count" attribute.
# iupacomp - when true, PPM and PCM results are made IUPAC compatible with
#            iupacomp!; requesting this for a PWM raises.
#
# Cell values are read as floats for PPM/PWM/WPCM and as integers otherwise.
#
# Returns a PPM for "PPM" elements and a PM otherwise.
def self.from_bismark(b, iupacomp = false)
  checkerr("empty small-BiSMark file?") { !b }

  float_m = ["PPM", "PWM", "WPCM"].include?(b.name)
  words_count = b.attributes["words-count"] ? b.attributes["words-count"].to_f : nil

  matrix = {"A" => [], "C" => [], "G" => [], "T" => []}
  b.elements.each("pm-column") do |pmc|
    column = pmc.attributes["position"].to_i - 1
    ['A', 'C', 'G', 'T'].each do |l|
      text = pmc.elements[l.downcase].get_text.to_s
      matrix[l][column] = float_m ? text.to_f : text.to_i
    end
  end

  if b.name == "PPM"
    newppm = PPM.new(matrix['A'].size, matrix, words_count)
    newppm.iupacomp! if iupacomp
    return newppm
  end

  if b.name == "PCM"
    # A missing words count is derived by PM#initialize from the first
    # column sum, so nothing has to be precomputed here.
    newpcm = PM.new(matrix['A'].size, matrix, words_count)
    newpcm.iupacomp! if iupacomp
    return newpcm
  end

  raise "cannot force IUPAC compatible PWM" if b.name == "PWM" && iupacomp

  PM.new(matrix['A'].size, matrix, words_count)
end
|
374
|
-
|
375
|
-
IUPAC_LS = (IUPAC::CODE.keys - ['A','C','G','T']).collect { |iul| [IUPAC::CODE[iul], iul.split(//)] }
|
376
|
-
# Makes the matrix IUPAC compatible: for every ambiguous letter listed in
# IUPAC_LS a row is added holding, per column, the mean of the rows of the
# nucleotides the letter stands for.
#
# When the words count is undefined it is set to the largest column sum
# (unbalanced matrices exist, e.g. some from Genomatix).
#
# Returns self.
def iupacomp!
  @words_count ||= (0...@size).map { |i| col_sum(i) }.max

  IUPAC_LS.each do |letter, nucleotides|
    # Float division: integer counts must not be truncated when averaged.
    @matrix[letter] = (0...@size).map { |i| col_sum(i, nucleotides).to_f / nucleotides.size }
  end

  self
end
|
386
|
-
|
387
|
-
# Score lying three standard deviations above the mean score of a random
# word under the background distribution +bckgr+.
def m3sd(bckgr = Randoom::DEF_PROBS)
  sigma = Math.sqrt(score_variance(bckgr))
  score_mean(bckgr) + 3 * sigma
end
|
398
|
-
|
399
|
-
# Resets an already defined words count to the largest column sum.
# Does nothing (and returns nil) when no words count is set.
def fixwc
  return unless @words_count

  column_totals = (0...@size).map { |i| col_sum(i) }
  @words_count = column_totals.max
end
|
403
|
-
|
404
|
-
protected
|
405
|
-
# Allocates a blank matrix: a Hash with an independent nil-filled Array of
# +size+ cells for each of A, C, G and T.
#
# NOTE: a preceding bare `protected` does not apply to singleton methods, so
# this method is publicly callable.
def self.new_matrix(size)
  ['A', 'C', 'G', 'T'].each_with_object({}) { |letter, matrix| matrix[letter] = Array.new(size) }
end
|
412
|
-
|
413
|
-
# Allocates a blank IUPAC-compatible matrix: a Hash with an independent
# nil-filled Array of +size+ cells for each of the fifteen IUPAC letters.
#
# NOTE: a preceding bare `protected` does not apply to singleton methods, so
# this method is publicly callable.
def self.new_matrix_iupac(size)
  letters = %w[A C G T R Y K M S W B D H V N]
  letters.each_with_object({}) { |letter, matrix| matrix[letter] = Array.new(size) }
end
|
432
|
-
|
433
|
-
end
|
434
|
-
|
435
|
-
class PPM < PM
|
436
|
-
|
437
|
-
#DEPRECATED, use iupacomp! instead
|
438
|
-
#def make_N_comp!
|
439
|
-
# @matrix['N'] = (0...size).collect { 0.25 }
|
440
|
-
# return self
|
441
|
-
#end
|
442
|
-
|
443
|
-
# Creates a probability matrix.
#
# size        - number of columns.
# matrix      - Hash of letter => Array of probabilities; a blank matrix is
#               allocated with PM.new_matrix when nil.
# words_count - stored as given (it is not derived from the matrix here).
#
# Raises (through checkerr) when +size+ disagrees with the 'A' row length.
def initialize(size, matrix = nil, words_count = nil)
  checkerr("matrix['A'].size != size") { !matrix.nil? && size != matrix['A'].size }
  @size = size
  @words_count = words_count
  @matrix = matrix.nil? ? PM.new_matrix(size) : matrix
end
|
449
|
-
|
450
|
-
# Makes the probability matrix IUPAC compatible: adds, for every ambiguous
# letter in IUPAC_LS, a row holding the per-column mean of the rows of the
# nucleotides it stands for. An undefined words count becomes 4.0.
#
# Returns self.
def iupacomp!
  @words_count ||= 4.0

  IUPAC_LS.each do |letter, nucleotides|
    @matrix[letter] = (0...@size).map { |i| col_sum(i, nucleotides) / nucleotides.size }
  end

  self
end
|
459
|
-
|
460
|
-
# Multiplicative score of a word: the product of the matrix cells selected
# by the word's letters.
#
# word - String as long as the matrix; IUPAC letters are accepted only when
#        the matrix has an 'N' row.
#
# Raises (through checkerr) on a length mismatch or unexpected characters.
def score(word)
  checkerr("word size != ppm.size") { @size != word.size }
  checkerr("word #{word} has strange characters") do
    alphabet = @matrix.key?('N') ? 'ACGTRYKMSWBDHVN' : 'ACGT'
    word.tr(alphabet, '').size > 0
  end

  (0...@size).inject(1) { |product, i| product * @matrix[word[i, 1]][i] }
end
|
469
|
-
|
470
|
-
# Highest reachable multiplicative score: the product of the column maxima
# over the A, C, G and T rows.
def best_score
  (0...@size).inject(1) do |product, i|
    product * ['A', 'C', 'G', 'T'].map { |l| @matrix[l][i] }.max
  end
end
|
475
|
-
|
476
|
-
# Lowest reachable multiplicative score: the product of the column minima
# over the A, C, G and T rows.
def worst_score
  # The product is seeded with 1 (the multiplicative identity); a seed of 0
  # would force the result to 0 for every matrix.
  (0...@size).inject(1) do |product, i|
    product * ['A', 'C', 'G', 'T'].map { |l| @matrix[l][i] }.min
  end
end
|
481
|
-
|
482
|
-
# Appends the matrix to the element +b+ as a "PPM" element with one
# "pm-column" child per position holding the a, c, g and t values as text.
# The "words-count" attribute is written only when a words count is set.
def to_bismark(b)
  attributes = {"length" => @size}
  attributes["words-count"] = @words_count if @words_count
  pe = b.add_element("PPM", attributes)
  @matrix['A'].each_index do |i|
    pm_c = pe.add_element("pm-column", {"position" => i+1})
    ['A', 'C', 'G', 'T'].each do |l|
      pm_c.add_element(l.downcase).add_text(@matrix[l][i].to_s)
    end
  end
end
|
493
|
-
|
494
|
-
# Extends a background Hash in place with an entry for every ambiguous
# letter in IUPAC_LS, set to the mean of the probabilities of the
# nucleotides that letter stands for.
#
# probs - Hash with entries for 'A', 'C', 'G' and 'T'; modified in place.
#
# Returns +probs+.
def self.probs2IUPAC!(probs)
  IUPAC_LS.each do |letter, nucleotides|
    probs[letter] = nucleotides.inject(0) { |sum, l| sum + probs[l] } / nucleotides.size
  end
  probs
end
|
500
|
-
|
501
|
-
# Converts the probability matrix to a log-odds weight matrix.
#
# Every cell becomes
#   log((p * words_count + prob * pseudocount) / ((words_count + pseudocount) * prob))
#
# words_count - number of words; the stored value is used when nil or 0.
# probs       - background Hash; a copy extended by PPM.probs2IUPAC! is used.
# pseudocount - pseudocount spread according to the background.
#
# Returns a new PM. Raises (through checkerr) when no words count is known.
def get_pwm(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1.0)
  probs = PPM.probs2IUPAC!(probs.dup)

  words_count = @words_count if !words_count || words_count == 0
  checkerr("undefined words count") { !words_count }

  pwm = @matrix['N'] ? PM.new_matrix_iupac(@size) : PM.new_matrix(@size)

  @matrix.each_key do |letter|
    (0...@size).each do |pos|
      pwm[letter][pos] = Math::log( (@matrix[letter][pos] * words_count + (probs[letter] * pseudocount) ) / ( (words_count + pseudocount) * probs[letter]) )
    end
  end

  PM.new(@size, pwm, words_count)
end
alias to_pwm get_pwm
|
523
|
-
|
524
|
-
# Converts the probability matrix to log-odds weights without pseudocounts:
# every cell becomes log(p / prob).
#
# probs - background Hash for 'A', 'C', 'G' and 'T'. When the matrix carries
#         IUPAC rows (an 'N' row is present) a copy of the background is
#         extended with PPM.probs2IUPAC!, as get_pwm does, so that every row
#         has a background value.
#
# Returns a new PM.
def get_pwm0pc(probs = Randoom::DEF_PROBS)
  probs = PPM.probs2IUPAC!(probs.dup) if @matrix['N']

  new_matrix = {}
  @matrix.each_key { |letter| new_matrix[letter] = @matrix[letter].dup }
  # The PM is created while new_matrix still holds probabilities:
  # PM#initialize sums the first column when no words count is given.
  newpm = PM.new(@size, new_matrix, nil)

  new_matrix.each_key do |letter|
    (0...@size).each do |pos|
      new_matrix[letter][pos] = Math.log(@matrix[letter][pos] / probs[letter])
    end
  end

  newpm
end
|
537
|
-
|
538
|
-
# In-place conversion is not available for probability matrices.
#
# Any arguments are accepted (and ignored) so that calls written against a
# to_pwm!(words_count, probs, pseudocount) signature reach the explanatory
# error below instead of an ArgumentError.
#
# Always raises RuntimeError.
def to_pwm!(*_args)
  raise "cannot force PPM class to PWM, use to_pwm instead"
end
|
541
|
-
|
542
|
-
# Converts the probability matrix to counts by multiplying the A, C, G and T
# cells by the words count, then applies iupacomp! to the new matrix.
#
# words_count - multiplier; the stored words count is used when omitted.
#
# Returns a new PM. Raises (through checkerr) when no words count is known.
def get_pcm(words_count = nil)
  words_count ||= @words_count
  checkerr("undefined words count") { !words_count }

  counts = PM.new_matrix(@size)
  ['A', 'C', 'G', 'T'].each do |l|
    (0...size).each { |i| counts[l][i] = @matrix[l][i] * words_count }
  end

  PM.new(size, counts, words_count).iupacomp!
end
alias to_pcm get_pcm
|
555
|
-
|
556
|
-
# Builds a probability matrix from an IUPAC word: in every column the
# nucleotides covered by the letter share the probability equally and the
# remaining nucleotides get 0.0.
#
# iupac - word whose letters are keys of IUPAC::REVCODE.
#
# Returns a PPM created with words count 4.0, after iupacomp! was applied.
def self.from_IUPAC(iupac)
  matrix = {"A" => [], "C" => [], "G" => [], "T" => []}

  (0...iupac.size).each do |i|
    # [i, 1] yields a one-letter String on every Ruby version.
    nucleotides = IUPAC::REVCODE[iupac[i, 1]].split(//)
    share = 1.0 / nucleotides.size
    matrix.each_key { |k| matrix[k] << (nucleotides.include?(k) ? share : 0.0) }
  end

  newppm = PPM.new(iupac.size, matrix, 4.0)
  newppm.iupacomp!
  newppm
end
|
572
|
-
|
573
|
-
end
|
574
|
-
end
|