sequence_logo 1.0.3 → 1.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +17 -17
- data/Gemfile +4 -4
- data/LICENSE +21 -21
- data/README.md +65 -65
- data/Rakefile +5 -5
- data/TODO.txt +7 -7
- data/bin/glue_logos +2 -2
- data/bin/sequence_logo +2 -2
- data/lib/sequence_logo/cli.rb +36 -36
- data/lib/sequence_logo/exec/glue_logos.rb +97 -66
- data/lib/sequence_logo/exec/sequence_logo.rb +51 -51
- data/lib/sequence_logo/pmflogo_lib.rb +113 -113
- data/lib/sequence_logo/version.rb +3 -3
- data/lib/sequence_logo/ytilib/addon.rb +246 -246
- data/lib/sequence_logo/ytilib/bismark.rb +70 -70
- data/lib/sequence_logo/ytilib/hack1.rb +75 -75
- data/lib/sequence_logo/ytilib/infocod.rb +108 -108
- data/lib/sequence_logo/ytilib/iupac.rb +92 -92
- data/lib/sequence_logo/ytilib/pm.rb +562 -562
- data/lib/sequence_logo/ytilib/pmsd.rb +98 -98
- data/lib/sequence_logo/ytilib/ppm_support.rb +85 -85
- data/lib/sequence_logo/ytilib/randoom.rb +131 -131
- data/lib/sequence_logo/ytilib/ytilib.rb +146 -146
- data/lib/sequence_logo/ytilib.rb +9 -9
- data/lib/sequence_logo.rb +7 -7
- data/sequence_logo.gemspec +21 -21
- data/test/data/pcm/AHR_si.pcm +10 -10
- data/test/data/pcm/AIRE_f2.pcm +19 -19
- metadata +3 -4
@@ -1,562 +1,562 @@
|
|
1
|
-
module Ytilib
|
2
|
-
class PM
  # Position matrix over the nucleotides A, C, G and T.
  #
  # @matrix maps each letter to an Array with one value per position, @size is
  # the number of positions and @words_count is the number of words the matrix
  # is considered to be built from (may be nil when it cannot be derived).

  attr_reader :matrix, :size
  attr_accessor :words_count

  alias length size

  # Sum over all positions of the background-weighted average cell value.
  # bckgr maps each of A, C, G, T to its background probability.
  def score_mean(bckgr = Randoom::DEF_PROBS)
    (0...@size).inject(0.0) do |mean, i|
      mean + %w[A C G T].inject(0.0) { |sum, l| sum + @matrix[l][i] * bckgr[l] }
    end
  end

  # Sum over all positions of the per-position variance (E[x^2] - E[x]^2)
  # of the cell values under the background distribution bckgr.
  def score_variance(bckgr = Randoom::DEF_PROBS)
    (0...@size).inject(0.0) do |variance, i|
      second_moment = %w[A C G T].inject(0.0) { |sum, l| sum + @matrix[l][i]**2 * bckgr[l] }
      first_moment = %w[A C G T].inject(0.0) { |sum, l| sum + @matrix[l][i] * bckgr[l] }
      variance + (second_moment - first_moment**2)
    end
  end

  # Upper-tail value (1 - erf2(z / sqrt(2))) / 2 for the given threshold,
  # with z = (threshold - mean) / sqrt(variance). mean and variance default
  # to score_mean and score_variance.
  # NOTE(review): Math.erf2 is not part of the Ruby standard library; it is
  # presumably provided elsewhere in this project.
  def p_value(threshold, mean = nil, variance = nil)
    mean ||= score_mean
    variance ||= score_variance
    z = (threshold - mean) / Math.sqrt(variance)
    (1 - Math.erf2(z / Math.sqrt(2))) / 2.0
  end

  # Returns the word made of the single highest-valued letter at each
  # position; positions where the maximum is shared get "N".
  def best_word
    (0...@size).map do |i|
      column = %w[A C G T].map { |l| @matrix[l][i] }
      top = column.max
      column.count(top) == 1 ? %w[A C G T][column.index(top)] : 'N'
    end.join
  end

  # Returns an IUPAC word whose letter at each position encodes every
  # nucleotide that reaches the column maximum.
  def strict_consensus
    word = (0...@size).map do |i|
      top = %w[A C G T].map { |l| @matrix[l][i] }.max
      IUPAC::CODE[%w[A C G T].select { |l| @matrix[l][i] == top }.join]
    end.join
    IUPAC.new(word)
  end

  # Same letters as #consensus, returned as a plain String. When beautiful is
  # true, codes standing for more than two nucleotides are lower-cased.
  def consensus_string(beautiful = false)
    consensus_letter_sets.map do |lets|
      code = IUPAC::CODE[lets]
      beautiful && lets.size > 2 ? code.downcase : code
    end.join
  end

  # Returns an IUPAC consensus. The higher a position's infocod value is
  # relative to the icd2of4 / icdThc / icdTlc thresholds, the fewer distinct
  # top values are kept for that position.
  def consensus
    IUPAC.new(consensus_letter_sets.map { |lets| IUPAC::CODE[lets] }.join)
  end

  # Returns the first offset in s whose window scores at least score_g, or nil.
  # With use2strands the reverse complement of the window is scored as well.
  def find_hit(s, score_g, use2strands = true)
    (0..(s.size - @size)).find { |i| window_score(s, i, use2strands) >= score_g }
  end

  # Returns every offset in s whose window scores at least score_g.
  def find_hits(s, score_g, use2strands = true)
    (0..(s.size - @size)).select { |i| window_score(s, i, use2strands) >= score_g }
  end

  # Returns [score, sequence, reverse_strand?, offset] for every window of s
  # (upper-cased before scoring) that scores at least score_g.
  # Reverse-complement hits are reported only when use2strands is true; the
  # flag was previously accepted but ignored.
  def collect_hits(s, score_g, use2strands = true)
    result = []
    (0..(s.size - @size)).each do |i|
      seq = s[i, @size]
      score_p = score(seq.upcase)
      result << [score_p, seq, false, i] if score_p >= score_g
      next unless use2strands

      seq_rc = s[i, @size].revcomp!
      score_rc = score(seq_rc.upcase)
      result << [score_rc, seq_rc, true, i] if score_rc >= score_g
    end
    result
  end

  # Returns the highest window score found in s (both strands when
  # use2strands is true). checkerr is invoked when s is shorter than the matrix.
  def best_hit(s, use2strands = true)
    checkerr("too short sequence") { s.size < @size }
    (0..(s.size - @size)).inject(-Float::MAX) do |best, i|
      [best, window_score(s, i, use2strands)].max
    end
  end

  # True when the A, C, G and T rows are eql? to those of pm.
  # Only these four rows are compared.
  def eql?(pm)
    %w[A C G T].all? { |letter| @matrix[letter].eql?(pm.matrix[letter]) }
  end

  # Tolerant comparison (absolute difference below 10**-11 per cell).
  # The leading checkerr has an always-true condition, so this method appears
  # to be deliberately disabled.
  def flexeql?(pm)
    checkerr("for what?") { true }
    %w[A C G T].all? do |letter|
      (0...@size).all? do |position|
        (@matrix[letter][position] - pm.matrix[letter][position]).abs < 10**-11
      end
    end
  end

  # size        - number of positions
  # matrix      - Hash letter => Array of values; a nil-filled matrix is
  #               allocated when omitted
  # words_count - when missing or non-positive it is derived from the rounded
  #               sum of the first column (kept only if >= 2, otherwise nil)
  def initialize(size, matrix = nil, words_count = nil)
    # The message interpolates matrix['A'], so it may only be built when a
    # matrix was actually given.
    if matrix
      checkerr("matrix['A'].size != size, #{matrix['A'].size} != #{size}") { size != matrix['A'].size }
    end
    @size = size
    @matrix = matrix || PM.new_matrix(size)
    if words_count && words_count > 0
      @words_count = words_count
    elsif matrix && size > 0
      derived = col_sum(0).round
      @words_count = derived >= 2 ? derived : nil
    else
      # Nothing to sum in a freshly allocated or zero-length matrix.
      @words_count = nil
    end
  end

  # Sum of the values of the letters in letset at position index.
  def col_sum(index = 0, letset = ['A','C','G','T'])
    letset.inject(0) { |sum, l| sum + @matrix[l][index] }
  end

  # Sum of the A, C, G and T values of a raw matrix Hash at position index.
  def PM.col_sum(matrix, index = 0)
    %w[A C G T].map { |l| matrix[l][index] }.inject(:+)
  end

  # Converts the matrix in place to log-odds values:
  #   log((count + prob * pseudocount) / ((words_count + pseudocount) * prob))
  # A positive words_count argument replaces @words_count first.
  # Returns self.
  def to_pwm!(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1)
    @words_count = words_count if words_count && words_count > 0
    checkerr("undefined words count") { !@words_count }

    @matrix.each_key do |letter|
      (0...@size).each do |pos|
        numerator = @matrix[letter][pos] + (probs[letter] * pseudocount)
        denominator = (@words_count + pseudocount) * probs[letter]
        @matrix[letter][pos] = Math.log(numerator / denominator)
      end
    end

    self
  end

  # Non-destructive variant of #to_pwm!: converts a copy and returns it.
  def get_pwm(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1)
    dup.to_pwm!(words_count, probs, pseudocount)
  end
  alias to_pwm get_pwm

  # Returns a PPM whose cells are this matrix's cells divided by words_count
  # (defaults to @words_count).
  def get_ppm(words_count = nil)
    words_count ||= @words_count
    checkerr("undefined words count") { !words_count || words_count <= 0 }
    ppm = @matrix['N'] ? PM.new_matrix_iupac(@size) : PM.new_matrix(@size)
    @matrix.each_key do |letter|
      (0...@size).each { |i| ppm[letter][i] = @matrix[letter][i].to_f / words_count }
    end
    PPM.new(@size, ppm, words_count)
  end
  alias to_ppm get_ppm

  # Sum of the matrix cells selected by the letters of word. The word must
  # have exactly @size letters from ACGT (or the full IUPAC alphabet when the
  # matrix has an 'N' row).
  def score(word)
    checkerr("word size != pwm.size") { @size != word.size }
    checkerr("word #{word} has strange characters") do
      alphabet = @matrix.key?('N') ? 'ACGTRYKMSWBDHVN' : 'ACGT'
      !word.delete(alphabet).empty?
    end
    (0...@size).inject(0) { |sum, i| sum + @matrix[word[i, 1]][i] }
  end

  # Sum of the per-position maxima over A, C, G, T.
  def best_score
    (0...@size).inject(0) { |sum, i| sum + %w[A C G T].map { |l| @matrix[l][i] }.max }
  end

  # Sum of the per-position minima over A, C, G, T.
  def worst_score
    (0...@size).inject(0) { |sum, i| sum + %w[A C G T].map { |l| @matrix[l][i] }.min }
  end

  # Returns a PM with a row-by-row copy of the matrix.
  # NOTE(review): always builds a PM, also when called on a subclass instance.
  def dup
    copy = {}
    @matrix.each { |letter, row| copy[letter] = row.dup }
    PM.new(@size, copy, @words_count)
  end

  # Builds a count matrix from words. 'N' adds 0.25 to each of A, C, G, T;
  # any other letter outside ACGT is reported through checkerr.
  # The matrix width is taken from the first word.
  def PM.new_pcm(words, iupacomp = false)
    checkerr("no words given") { words.empty? }
    size = words[0].size
    counts = PM.new_matrix(size)
    counts.each_value { |row| row.fill(0) }
    words.each do |word|
      size.times do |i|
        letter = word[i, 1].upcase
        checkerr("unknown letter #{letter}") { !['A', 'C', 'G', 'T', 'N'].include?(letter) }
        if letter == 'N'
          %w[A C G T].each { |l| counts[l][i] += 0.25 }
        else
          counts[letter][i] += 1
        end
      end
    end
    newpcm = PM.new(size, counts, words.size)
    newpcm.iupacomp! if iupacomp
    newpcm
  end

  # Builds a count matrix from words and converts it with #to_pwm!.
  def PM.new_pwm(words)
    PM.new_pcm(words).to_pwm!
  end

  # Loads a matrix from a whitespace-separated numeric file.
  # Supports pat & pwm formats (letter-column and letter-row format): lines
  # that do not parse entirely as numbers are skipped, and a file with exactly
  # four numeric rows is treated as letter-row and transposed.
  # Returns a PPM when every column sum rounds to 1, otherwise a PM.
  def PM.load(filename)
    rows = []
    IO.read(filename).each_line do |line|
      begin
        fields = line.split.map { |field| Float(field) }
      rescue
        next
      end
      # A blank line parses to an empty row and must not count as a position.
      rows << fields unless fields.empty?
    end
    rows = rows.transpose if rows.size == 4
    matrix = PM.new_matrix(rows.size)
    rows.each_index do |i|
      %w[A C G T].each_with_index { |l, j| matrix[l][i] = rows[i][j] }
    end

    ppm_mode = (0...rows.size).all? { |i| col_sum(matrix, i).round == 1 }

    ppm_mode ? PPM.new(rows.size, matrix) : PM.new(rows.size, matrix)
  end

  # Writes the matrix to filename; the format is chosen by the extension:
  # "pwm" - one line per letter (A, C, G, T), "pat" - a name line followed by
  # one line per position. Other extensions are reported through checkerr.
  def save(filename)
    File.open(filename, "w") do |out_f|
      case File.ext_wo_name(filename)
      when "pwm"
        %w[A C G T].each do |letter|
          @matrix[letter].each { |e| out_f << "#{e} " }
          out_f << $/
        end
      when "pat"
        out_f.puts File.name_wo_ext(filename)
        (0...@size).each do |i|
          %w[A C G T].each { |letter| out_f << "#{@matrix[letter][i]} " }
          out_f << $/
        end
      when "xml"
        checkerr("small-BiSMark is not supported at this moment")
      else
        checkerr("unknown motif file format specified")
      end
    end
  end

  # Adds the absolute value of the smallest cell to every cell, in place.
  # Returns self.
  def positiv!
    shift = @matrix.values.map(&:min).min.abs
    @matrix.each_value { |row| row.map! { |v| v + shift } }
    self
  end

  # Turns the matrix into its reverse complement in place: complementary
  # rows are swapped and every row is reversed. IUPAC rows (R/Y, K/M, B/V,
  # D/H) are swapped too when present, so they stay consistent with the
  # swapped A/T and C/G rows. Returns self.
  def revcomp!
    [%w[A T], %w[C G], %w[R Y], %w[K M], %w[B V], %w[D H]].each do |x, y|
      next unless @matrix.key?(x) && @matrix.key?(y)

      @matrix[x], @matrix[y] = @matrix[y], @matrix[x]
    end
    @matrix.each_value { |row| row.reverse! }
    self
  end

  # Appends this matrix to element b as a "PWM" element (when the first A
  # value is a Float) or a "PCM" element, with one "pm-column" child per
  # position.
  def to_bismark(b)
    pwm = @matrix['A'][0].is_a?(Float)
    attributes = {"length" => @size}
    attributes["words-count"] = @words_count if @words_count && @words_count > 0
    pe = b.add_element(pwm ? "PWM" : "PCM", attributes)
    (0...@matrix['A'].size).each do |i|
      pm_c = pe.add_element("pm-column", {"position" => i + 1})
      %w[A C G T].each { |l| pm_c.add_element(l.downcase).add_text(@matrix[l][i].to_s) }
    end
  end

  # Builds a PM (or a PPM for "PPM" elements) from element b. Values are read
  # as floats for PPM/PWM/WPCM elements and as integers otherwise.
  # Raises when an IUPAC-compatible PWM is requested.
  def PM.from_bismark(b, iupacomp = false)
    checkerr("empty small-BiSMark file?") { !b }
    float_m = %w[PPM PWM WPCM].include?(b.name)
    words_count = b.attributes["words-count"] ? b.attributes["words-count"].to_f : nil

    matrix = {"A" => [], "C" => [], "G" => [], "T" => []}
    b.elements.each("pm-column") do |pmc|
      position = pmc.attributes["position"].to_i
      %w[A C G T].each do |l|
        text = pmc.elements[l.downcase].get_text.to_s
        matrix[l][position - 1] = float_m ? text.to_f : text.to_i
      end
    end

    case b.name
    when "PPM"
      newppm = PPM.new(matrix['A'].size, matrix, words_count)
      newppm.iupacomp! if iupacomp
      return newppm
    when "PCM"
      # The former class-level "@words_count = col_sum(matrix)" assignment
      # never reached the new object and was dropped; PM.new derives the
      # count itself when the attribute is absent.
      newpcm = PM.new(matrix['A'].size, matrix, words_count)
      newpcm.iupacomp! if iupacomp
      return newpcm
    when "PWM"
      raise "cannot force IUPAC compatible PWM" if iupacomp
    end
    PM.new(matrix['A'].size, matrix, words_count)
  end

  # Pairs of [IUPAC code, array of the nucleotides it stands for] for every
  # multi-letter entry of IUPAC::CODE.
  IUPAC_LS = (IUPAC::CODE.keys - ['A','C','G','T']).collect { |iul| [IUPAC::CODE[iul], iul.split(//)] }

  # Adds one row per IUPAC code holding the average of the rows of the
  # nucleotides it stands for. The average is computed in floating point so
  # integer count matrices are not truncated. Returns self.
  def iupacomp!
    @words_count = (0...@size).map { |i| col_sum(i) }.max unless @words_count # for unbalanced matrices (Genomatix has some)

    IUPAC_LS.each do |code, letters|
      @matrix[code] = (0...@size).map { |i| col_sum(i, letters) / letters.size.to_f }
    end

    self
  end

  # score_mean plus three standard deviations (sqrt of score_variance).
  def m3sd(bckgr = Randoom::DEF_PROBS)
    score_mean(bckgr) + 3 * Math.sqrt(score_variance(bckgr))
  end

  # Resets @words_count to the largest column sum; does nothing when it is unset.
  def fixwc
    return unless @words_count
    @words_count = (0...@size).map { |i| col_sum(i) }.max
  end

  protected

  # NOTE(review): `protected` does not apply to the singleton methods defined
  # below; they remain publicly callable as PM.new_matrix(...).

  # Returns a Hash with a nil-filled Array of the given size for A, C, G, T.
  def PM.new_matrix(size)
    Hash[%w[A C G T].map { |l| [l, Array.new(size)] }]
  end

  # Same as PM.new_matrix, with rows for all fifteen IUPAC letters.
  def PM.new_matrix_iupac(size)
    Hash[%w[A C G T R Y K M S W B D H V N].map { |l| [l, Array.new(size)] }]
  end

  private

  # Best score of the window of s starting at offset i; the reverse
  # complement is scored as well when use2strands is true.
  def window_score(s, i, use2strands)
    forward = score(s[i, @size])
    return forward unless use2strands

    [forward, score(s[i, @size].revcomp!)].max
  end

  # For every position, the string of nucleotides retained for the consensus.
  def consensus_letter_sets
    checkerr("words count is undefined") { !@words_count }
    i2o4, thc, tlc = icd2of4, icdThc, icdTlc
    icd = infocod

    (0...@size).map do |i|
      ranked = %w[A C G T].map { |l| @matrix[l][i] }.uniq.sort.reverse
      keep = if icd[i] > i2o4 then 1
             elsif icd[i] > thc then 2
             elsif icd[i] > tlc then 3
             else ranked.size
             end
      kept = ranked.first(keep)
      %w[A C G T].select { |l| kept.include?(@matrix[l][i]) }.join
    end
  end

end
|
422
|
-
|
423
|
-
class PPM < PM
  # Position probability matrix: a PM whose cells are probabilities and whose
  # word score is a product instead of a sum.

  # DEPRECATED make_N_comp! was removed; use iupacomp! instead.

  # Unlike PM#initialize, words_count is stored as given and never derived
  # from the matrix.
  def initialize(size, matrix = nil, words_count = nil)
    checkerr("matrix['A'].size != size") { matrix != nil && size != matrix['A'].size }
    @size = size
    @matrix = matrix == nil ? PM.new_matrix(size) : matrix
    @words_count = words_count
  end

  # Adds the IUPAC rows (see PM#iupacomp!). A missing words count defaults
  # to 4.0 first, so the parent never derives it from column sums.
  def iupacomp!
    @words_count = 4.0 unless @words_count
    super
  end

  # Product of the matrix cells selected by the letters of word.
  def score(word)
    checkerr("word size != ppm.size") { @size != word.size }
    checkerr("word #{word} has strange characters") do
      alphabet = @matrix.key?('N') ? 'ACGTRYKMSWBDHVN' : 'ACGT'
      !word.delete(alphabet).empty?
    end
    (0...@size).inject(1) { |product, i| product * @matrix[word[i, 1]][i] }
  end

  # Product of the per-position maxima over A, C, G, T.
  def best_score
    (0...@size).inject(1) { |product, i| product * %w[A C G T].map { |l| @matrix[l][i] }.max }
  end

  # Product of the per-position minima over A, C, G, T.
  # The fold starts at 1; starting at 0 made the product always 0.
  def worst_score
    (0...@size).inject(1) { |product, i| product * %w[A C G T].map { |l| @matrix[l][i] }.min }
  end

  # Appends this matrix to element b as a "PPM" element with one "pm-column"
  # child per position.
  def to_bismark(b)
    attributes = {"length" => @size}
    attributes["words-count"] = @words_count if @words_count
    pe = b.add_element("PPM", attributes)
    (0...@matrix['A'].size).each do |i|
      pm_c = pe.add_element("pm-column", {"position" => i + 1})
      %w[A C G T].each { |l| pm_c.add_element(l.downcase).add_text(@matrix[l][i].to_s) }
    end
  end

  # Adds to probs, in place, one entry per IUPAC code holding the average
  # probability of the nucleotides it stands for. Returns probs.
  def PPM.probs2IUPAC!(probs)
    IUPAC_LS.each do |code, letters|
      probs[code] = letters.inject(0) { |sum, l| sum + probs[l] } / letters.size
    end
    probs
  end

  # Returns a new PM of log-odds values computed from the probabilities:
  #   log((p * words_count + prob * pseudocount) / ((words_count + pseudocount) * prob))
  # words_count defaults to @words_count when nil or 0.
  def get_pwm(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1.0)
    # Work on a copy so the caller's background Hash is not extended.
    probs = PPM.probs2IUPAC!(probs.dup)

    words_count = @words_count if !words_count || words_count == 0
    checkerr("undefined words count") { !words_count }

    pwm = @matrix['N'] ? PM.new_matrix_iupac(@size) : PM.new_matrix(@size)

    @matrix.each_key do |letter|
      (0...@size).each do |pos|
        numerator = @matrix[letter][pos] * words_count + (probs[letter] * pseudocount)
        denominator = (words_count + pseudocount) * probs[letter]
        pwm[letter][pos] = Math.log(numerator / denominator)
      end
    end
    PM.new(@size, pwm, words_count)
  end
  alias to_pwm get_pwm

  # Returns a PM of log(p / background) values, without pseudocounts.
  def get_pwm0pc(probs = Randoom::DEF_PROBS)
    new_matrix = {}
    @matrix.each { |letter, row| new_matrix[letter] = row.dup }
    # The PM is created while new_matrix still holds probabilities, because
    # PM#initialize derives its words count from the first column sum.
    newpm = PM.new(@size, new_matrix, nil)

    new_matrix.each_key do |letter|
      (0...@size).each do |pos|
        new_matrix[letter][pos] = Math.log(@matrix[letter][pos] / probs[letter])
      end
    end

    newpm
  end

  # In-place conversion is not supported for PPM; always raises.
  def to_pwm!
    raise "cannot force PPM class to PWM, use to_pwm instead"
  end

  # Returns an IUPAC-compatible PM whose cells are the probabilities
  # multiplied by words_count (defaults to @words_count).
  def get_pcm(words_count = nil)
    words_count ||= @words_count
    checkerr("undefined words count") { !words_count }
    counts = PM.new_matrix(@size)
    (0...@size).each do |i|
      %w[A C G T].each { |l| counts[l][i] = @matrix[l][i] * words_count }
    end
    PM.new(@size, counts, words_count).iupacomp!
  end
  alias to_pcm get_pcm

  # Builds an IUPAC-compatible PPM (words count 4.0) from an IUPAC word:
  # at each position the nucleotides covered by the code share probability 1.
  def PPM.from_IUPAC(iupac)
    matrix = {"A" => [], "C" => [], "G" => [], "T" => []}

    (0...iupac.size).each do |i|
      matrix.each_value { |row| row << 0.0 }
      letters = IUPAC::REVCODE[iupac[i]]
      (0...letters.size).each { |j| matrix[letters[j]][-1] = 1.0 / letters.size }
    end

    PPM.new(iupac.size, matrix, 4.0).iupacomp!
  end

end
|
562
|
-
end
|
1
|
+
module Ytilib
|
2
|
+
class PM
|
3
|
+
|
4
|
+
attr_reader :matrix, :size
|
5
|
+
attr_accessor :words_count
|
6
|
+
|
7
|
+
alias length size
|
8
|
+
|
9
|
+
# Sum over all positions of the background-weighted average cell value.
# bckgr maps each of A, C, G, T to its background probability.
def score_mean(bckgr = Randoom::DEF_PROBS)
  (0...@size).inject(0.0) do |mean, i|
    mean + %w[A C G T].inject(0.0) { |sum, l| sum + @matrix[l][i] * bckgr[l] }
  end
end
|
12
|
+
|
13
|
+
# Sum over all positions of the per-position variance (E[x^2] - E[x]^2)
# of the cell values under the background distribution bckgr.
def score_variance(bckgr = Randoom::DEF_PROBS)
  (0...@size).inject(0.0) do |variance, i|
    second_moment = %w[A C G T].inject(0.0) { |sum, l| sum + @matrix[l][i]**2 * bckgr[l] }
    first_moment = %w[A C G T].inject(0.0) { |sum, l| sum + @matrix[l][i] * bckgr[l] }
    variance + (second_moment - first_moment**2)
  end
end
|
19
|
+
|
20
|
+
# Upper-tail value (1 - erf2(z / sqrt(2))) / 2 for the given threshold,
# with z = (threshold - mean) / sqrt(variance). mean and variance default
# to score_mean and score_variance.
# NOTE(review): Math.erf2 is not part of the Ruby standard library; it is
# presumably provided elsewhere in this project.
def p_value(threshold, mean = nil, variance = nil)
  mean ||= score_mean
  variance ||= score_variance
  z = (threshold - mean) / Math.sqrt(variance)
  (1 - Math.erf2(z / Math.sqrt(2))) / 2.0
end
|
26
|
+
|
27
|
+
# Returns the word made of the single highest-valued letter at each
# position; positions where the maximum is shared get "N".
def best_word
  (0...@size).map do |i|
    column = %w[A C G T].map { |l| @matrix[l][i] }
    top = column.max
    column.count(top) == 1 ? %w[A C G T][column.index(top)] : 'N'
  end.join
end
|
34
|
+
|
35
|
+
# Returns an IUPAC word whose letter at each position encodes every
# nucleotide that reaches the column maximum.
def strict_consensus
  word = (0...@size).map do |i|
    top = %w[A C G T].map { |l| @matrix[l][i] }.max
    IUPAC::CODE[%w[A C G T].select { |l| @matrix[l][i] == top }.join]
  end.join
  IUPAC.new(word)
end
|
42
|
+
|
43
|
+
# Returns the consensus as a plain String of IUPAC codes. The higher a
# position's infocod value is relative to the icd2of4 / icdThc / icdTlc
# thresholds, the fewer distinct top values are kept for that position.
# When beautiful is true, codes standing for more than two nucleotides are
# lower-cased.
def consensus_string(beautiful = false)
  checkerr("words count is undefined") { !@words_count }
  i2o4, thc, tlc = icd2of4, icdThc, icdTlc
  icd = infocod

  (0...@size).map do |i|
    ranked = %w[A C G T].map { |l| @matrix[l][i] }.uniq.sort.reverse
    keep = if icd[i] > i2o4 then 1
           elsif icd[i] > thc then 2
           elsif icd[i] > tlc then 3
           else ranked.size
           end
    kept = ranked.first(keep)
    lets = %w[A C G T].select { |l| kept.include?(@matrix[l][i]) }.join
    code = IUPAC::CODE[lets]
    beautiful && lets.size > 2 ? code.downcase : code
  end.join
end
|
68
|
+
|
69
|
+
# Returns an IUPAC consensus. The higher a position's infocod value is
# relative to the icd2of4 / icdThc / icdTlc thresholds, the fewer distinct
# top values are kept for that position; every nucleotide holding one of the
# kept values contributes to the position's IUPAC code.
def consensus
  checkerr("words count is undefined") { !@words_count }
  i2o4, thc, tlc = icd2of4, icdThc, icdTlc
  icd = infocod

  word = (0...@size).map do |i|
    ranked = %w[A C G T].map { |l| @matrix[l][i] }.uniq.sort.reverse
    keep = if icd[i] > i2o4 then 1
           elsif icd[i] > thc then 2
           elsif icd[i] > tlc then 3
           else ranked.size
           end
    kept = ranked.first(keep)
    IUPAC::CODE[%w[A C G T].select { |l| kept.include?(@matrix[l][i]) }.join]
  end.join
  IUPAC.new(word)
end
|
91
|
+
|
92
|
+
# Returns the first offset in s whose window scores at least score_g, or nil
# when there is none. With use2strands the reverse complement of each window
# is scored as well and the better of the two scores is used.
def find_hit(s, score_g, use2strands = true)
  (0..(s.size - @size)).find do |i|
    best = score(s[i, @size])
    # The reverse strand is only scored when it can influence the result.
    best = [best, score(s[i, @size].revcomp!)].max if use2strands
    best >= score_g
  end
end
|
101
|
+
|
102
|
+
# Returns every offset in s whose window scores at least score_g.
# With use2strands the reverse complement of each window is scored as well
# and the better of the two scores is used.
def find_hits(s, score_g, use2strands = true)
  (0..(s.size - @size)).select do |i|
    best = score(s[i, @size])
    best = [best, score(s[i, @size].revcomp!)].max if use2strands
    best >= score_g
  end
end
|
110
|
+
|
111
|
+
# Returns [score, sequence, reverse_strand?, offset] for every window of s
# (upper-cased before scoring) that scores at least score_g.
# Reverse-complement hits are reported only when use2strands is true; the
# flag was previously accepted but ignored.
def collect_hits(s, score_g, use2strands = true)
  result = []
  (0..(s.size - @size)).each do |i|
    seq = s[i, @size]
    score_p = score(seq.upcase)
    result << [score_p, seq, false, i] if score_p >= score_g
    next unless use2strands

    seq_rc = s[i, @size].revcomp!
    score_rc = score(seq_rc.upcase)
    result << [score_rc, seq_rc, true, i] if score_rc >= score_g
  end
  result
end
|
121
|
+
|
122
|
+
# Returns the highest window score found in s (both strands when
# use2strands is true). checkerr is invoked when s is shorter than the matrix.
def best_hit(s, use2strands = true)
  checkerr("too short sequence") { s.size < @size }
  (0..(s.size - @size)).inject(-Float::MAX) do |best, i|
    candidates = [best, score(s[i, @size])]
    candidates << score(s[i, @size].revcomp!) if use2strands
    candidates.max
  end
end
|
131
|
+
|
132
|
+
# True when the A, C, G and T rows are eql? to those of pm.
# Only these four rows are compared.
def eql?(pm)
  %w[A C G T].all? { |letter| @matrix[letter].eql?(pm.matrix[letter]) }
end
|
137
|
+
|
138
|
+
# Tolerant comparison of the A, C, G, T rows (absolute difference below
# 10**-11 per cell). The leading checkerr has an always-true condition, so
# this method appears to be deliberately disabled.
def flexeql?(pm)
  checkerr("for what?") { true }
  %w[A C G T].all? do |letter|
    (0...@size).all? do |position|
      (@matrix[letter][position] - pm.matrix[letter][position]).abs < 10**-11
    end
  end
end
|
148
|
+
|
149
|
+
# size        - number of positions
# matrix      - Hash letter => Array of values; a nil-filled matrix is
#               allocated when omitted
# words_count - when missing or non-positive it is derived from the rounded
#               sum of the first column (kept only if >= 2, otherwise nil)
def initialize(size, matrix = nil, words_count = nil)
  # The message interpolates matrix['A'], so it may only be built when a
  # matrix was actually given (it used to fail with NoMethodError on nil).
  if matrix
    checkerr("matrix['A'].size != size, #{matrix['A'].size} != #{size}") { size != matrix['A'].size }
  end
  @size = size
  @matrix = matrix || PM.new_matrix(size)
  if words_count && words_count > 0
    @words_count = words_count
  elsif matrix && size > 0
    derived = col_sum(0).round
    @words_count = derived >= 2 ? derived : nil
  else
    # Nothing to sum in a freshly allocated or zero-length matrix.
    @words_count = nil
  end
end
|
160
|
+
|
161
|
+
# Sum of the values of the letters in letset at position index.
def col_sum(index = 0, letset = ['A','C','G','T'])
  letset.inject(0) { |sum, l| sum + @matrix[l][index] }
end
|
164
|
+
|
165
|
+
# Sum of the A, C, G and T values of a raw matrix Hash at position index.
def PM.col_sum(matrix, index = 0)
  %w[A C G T].map { |l| matrix[l][index] }.inject(:+)
end
|
168
|
+
|
169
|
+
# Converts the matrix in place to log-odds values:
#   log((count + prob * pseudocount) / ((words_count + pseudocount) * prob))
# A positive words_count argument replaces @words_count first; checkerr is
# invoked when no words count is available, as in get_ppm.
# Returns self.
def to_pwm!(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1)
  @words_count = words_count if words_count && words_count > 0
  checkerr("undefined words count") { !@words_count }

  @matrix.each_key do |letter|
    (0...@size).each do |pos|
      numerator = @matrix[letter][pos] + (probs[letter] * pseudocount)
      denominator = (@words_count + pseudocount) * probs[letter]
      @matrix[letter][pos] = Math.log(numerator / denominator)
    end
  end

  self
end
|
188
|
+
|
189
|
+
# Non-destructive variant of to_pwm!: converts a copy (see dup) and returns
# it, leaving this matrix unchanged.
def get_pwm(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1)
  dup.to_pwm!(words_count, probs, pseudocount)
end
|
192
|
+
alias to_pwm get_pwm
|
193
|
+
|
194
|
+
# Returns a PPM whose cells are this matrix's cells divided by words_count
# (defaults to @words_count). checkerr is invoked when the words count is
# missing or not positive.
def get_ppm(words_count = nil)
  words_count ||= @words_count
  checkerr("undefined words count") { !words_count || words_count <= 0 }
  ppm = @matrix['N'] ? PM.new_matrix_iupac(@size) : PM.new_matrix(@size)
  @matrix.each_key do |letter|
    (0...@size).each { |i| ppm[letter][i] = @matrix[letter][i].to_f / words_count }
  end
  PPM.new(@size, ppm, words_count)
end
|
205
|
+
alias to_ppm get_ppm
|
206
|
+
|
207
|
+
# Sum of the matrix cells selected by the letters of word. The word must
# have exactly @size letters from ACGT (or the full IUPAC alphabet when the
# matrix has an 'N' row); violations are reported through checkerr.
def score(word)
  checkerr("word size != pwm.size") { @size != word.size }
  checkerr("word #{word} has strange characters") do
    alphabet = @matrix.key?('N') ? 'ACGTRYKMSWBDHVN' : 'ACGT'
    !word.delete(alphabet).empty?
  end
  (0...@size).inject(0) { |sum, i| sum + @matrix[word[i, 1]][i] }
end
|
216
|
+
|
217
|
+
# Sum of the per-position maxima over A, C, G, T.
def best_score
  (0...@size).inject(0) { |sum, i| sum + %w[A C G T].map { |l| @matrix[l][i] }.max }
end
|
222
|
+
|
223
|
+
# Sum of the per-position minima over A, C, G, T.
def worst_score
  (0...@size).inject(0) { |sum, i| sum + %w[A C G T].map { |l| @matrix[l][i] }.min }
end
|
228
|
+
|
229
|
+
# Returns a PM with a row-by-row copy of the matrix and the same size and
# words count.
# NOTE(review): always builds a PM, also when called on a subclass instance.
def dup
  copy = {}
  @matrix.each { |letter, row| copy[letter] = row.dup }
  PM.new(@size, copy, @words_count)
end
|
234
|
+
|
235
|
+
# Builds a count matrix from words. 'N' adds 0.25 to each of A, C, G, T;
# any other letter outside ACGT is reported through checkerr, as is an empty
# word list. The matrix width is taken from the first word.
# With iupacomp the result is made IUPAC-compatible via iupacomp!.
def PM.new_pcm(words, iupacomp = false)
  checkerr("no words given") { words.empty? }
  size = words[0].size
  counts = PM.new_matrix(size)
  counts.each_value { |row| row.fill(0) }
  words.each do |word|
    size.times do |i|
      letter = word[i, 1].upcase
      checkerr("unknown letter #{letter}") { !['A', 'C', 'G', 'T', 'N'].include?(letter) }
      if letter == 'N'
        %w[A C G T].each { |l| counts[l][i] += 0.25 }
      else
        counts[letter][i] += 1
      end
    end
  end
  newpcm = PM.new(size, counts, words.size)
  newpcm.iupacomp! if iupacomp
  newpcm
end
|
254
|
+
|
255
|
+
# Builds a count matrix from words and converts it in place with to_pwm!;
# returns the converted matrix.
def PM.new_pwm(words)
  PM.new_pcm(words).to_pwm!
end
|
260
|
+
|
261
|
+
# Loads a matrix from a whitespace-separated numeric file.
# Supports pat & pwm formats (letter-column and letter-row format): lines
# that do not parse entirely as numbers are skipped, and a file with exactly
# four numeric rows is treated as letter-row and transposed.
# Returns a PPM when every column sum rounds to 1, otherwise a PM.
def PM.load(filename)
  rows = []
  IO.read(filename).each_line do |line|
    begin
      fields = line.split.map { |field| Float(field) }
    rescue
      next
    end
    # A blank line parses to an empty row and must not count as a position.
    rows << fields unless fields.empty?
  end
  rows = rows.transpose if rows.size == 4
  matrix = PM.new_matrix(rows.size)
  rows.each_index do |i|
    %w[A C G T].each_with_index { |l, j| matrix[l][i] = rows[i][j] }
  end

  ppm_mode = (0...rows.size).all? { |i| col_sum(matrix, i).round == 1 }

  ppm_mode ? PPM.new(rows.size, matrix) : PM.new(rows.size, matrix)
end
|
282
|
+
|
283
|
+
# Writes the matrix to filename; the format is chosen by the extension:
# "pwm" - one line per letter (A, C, G, T), "pat" - a name line followed by
# one line per position. Other extensions are reported through checkerr.
# NOTE(review): the file is opened for writing before the extension is
# checked, so it is created/truncated for unsupported formats too.
def save(filename)
  File.open(filename, "w") do |out_f|
    case File.ext_wo_name(filename)
    when "pwm"
      %w[A C G T].each do |letter|
        @matrix[letter].each { |e| out_f << "#{e} " }
        out_f << $/
      end
    when "pat"
      out_f.puts File.name_wo_ext(filename)
      (0...@size).each do |i|
        %w[A C G T].each { |letter| out_f << "#{@matrix[letter][i]} " }
        out_f << $/
      end
    when "xml"
      checkerr("small-BiSMark is not supported at this moment")
    else
      checkerr("unknown motif file format specified")
    end
  end
end
|
308
|
+
|
309
|
+
# Shifts every cell of the matrix, in place, by the absolute value of the
# smallest cell. With a negative minimum this makes the minimum zero; note
# that a positive minimum is added as well. Returns self.
def positiv!
  shift = @matrix.values.map(&:min).min.abs
  @matrix.each_value do |column|
    column.each_index { |pos| column[pos] += shift }
  end
  self
end
|
314
|
+
|
315
|
+
# Turns the matrix into its reverse complement, in place: the A/T and C/G
# columns are exchanged and then every column stored in the matrix is
# reversed. Returns self.
def revcomp!
  { 'A' => 'T', 'C' => 'G' }.each do |base, partner|
    @matrix[base], @matrix[partner] = @matrix[partner], @matrix[base]
  end
  @matrix.each_value(&:reverse!)
  self
end
|
321
|
+
|
322
|
+
# Appends this matrix to +b+, an element-like object responding to
# add_element (whose result must respond to add_element / add_text).
#
# The element is named "PWM" when the first 'A' cell is a Float and "PCM"
# otherwise. It carries a "length" attribute and, when the words count is set
# and positive, a "words-count" attribute. One "pm-column" child (1-based
# "position") is added per position, holding a/c/g/t children whose text is
# the cell value.
def to_bismark(b)
  tag = @matrix['A'][0].is_a?(Float) ? "PWM" : "PCM"
  attributes = {"length" => @size}
  attributes["words-count"] = @words_count if @words_count && @words_count > 0
  root = b.add_element(tag, attributes)
  (0...@matrix['A'].size).each do |pos|
    column = root.add_element("pm-column", {"position" => pos + 1})
    ['A', 'C', 'G', 'T'].each do |letter|
      column.add_element(letter.downcase).add_text(@matrix[letter][pos].to_s)
    end
  end
end
|
334
|
+
|
335
|
+
# Builds a matrix object from a small-BiSMark element +b+ (an object exposing
# name, attributes and elements; looks like a REXML element -- confirm).
#
# b        - element named "PPM", "PCM", "PWM" or "WPCM".
# iupacomp - when true, iupacomp! is applied to PPM / PCM results; for a
#            "PWM" element it raises instead.
#
# Cell texts are parsed with to_f for PPM/PWM/WPCM and with to_i otherwise.
# The words count is the "words-count" attribute converted with to_f, or nil
# when the attribute is absent.
# Returns a PPM for "PPM" elements and a PM for everything else.
def PM.from_bismark(b, iupacomp = false)
  checkerr("empty small-BiSMark file?") { !b }

  kind = b.name
  float_m = ["PPM", "PWM", "WPCM"].include?(kind)
  raw_words_count = b.attributes["words-count"]
  words_count = raw_words_count ? raw_words_count.to_f : nil

  matrix = {"A" => [], "C" => [], "G" => [], "T" => []}
  b.elements.each("pm-column") do |pmc|
    index = pmc.attributes["position"].to_i - 1
    ['A', 'C', 'G', 'T'].each do |letter|
      text = pmc.elements[letter.downcase].get_text.to_s
      matrix[letter][index] = float_m ? text.to_f : text.to_i
    end
  end

  case kind
  when "PPM"
    newppm = PPM.new(matrix['A'].size, matrix, words_count)
    newppm.iupacomp! if iupacomp
    return newppm
  when "PCM"
    # NOTE(review): inside this class-level method the assignment below sets an
    # instance variable on the PM class object itself, not the local
    # words_count that is handed to PM.new -- confirm whether the local was
    # intended.
    @words_count = col_sum(matrix)
    newpcm = PM.new(matrix['A'].size, matrix, words_count)
    newpcm.iupacomp! if iupacomp
    return newpcm
  end

  raise "cannot force IUPAC compatible PWM" if kind == "PWM" && iupacomp

  PM.new(matrix['A'].size, matrix, words_count)
end
|
362
|
+
|
363
|
+
# One [value, letters] pair per IUPAC::CODE key other than the four plain
# nucleotides: the value stored under that key, and the key split into its
# single characters.
IUPAC_LS = (IUPAC::CODE.keys - ['A', 'C', 'G', 'T']).map { |iul| [IUPAC::CODE[iul], iul.split(//)] }

# Adds one column set per IUPAC_LS entry to the matrix, in place. Each added
# cell is the sum of the component letters' cells at that position divided by
# the number of component letters.
# When the words count is not set yet it is initialised to the largest
# column sum. Returns self.
def iupacomp!
  @words_count ||= (0...@size).map { |i| col_sum(i) }.max # for unbalanced matrices (Genomatix has some)
  # @words_count = @words_count.round < 2.0 ? nil : @words_count.round

  IUPAC_LS.each do |code, letters|
    @matrix[code] = (0...@size).map { |i| col_sum(i, letters) / letters.size }
  end

  self
end
|
374
|
+
|
375
|
+
# Returns mean + 3 * sigma of the matrix score under the background +bckgr+
# (a mapping from 'A', 'C', 'G', 'T' to weights).
#
# For every position the weighted first moment sum(m * b) and second moment
# sum(m**2 * b) are taken over the four letters; the mean accumulates the
# first moments, the variance accumulates (second - first**2), and sigma is
# the square root of that accumulated variance.
def m3sd(bckgr = Randoom::DEF_PROBS)
  letters = ['A', 'C', 'G', 'T']
  mean = 0.0
  variance = 0.0

  (0...@size).each do |pos|
    first_moment = 0.0
    second_moment = 0.0
    letters.each do |letter|
      cell = @matrix[letter][pos]
      first_moment += cell * bckgr[letter]
      second_moment += cell**2 * bckgr[letter]
    end
    mean += first_moment
    variance += second_moment - first_moment**2
  end

  mean + 3 * Math.sqrt(variance)
end
|
386
|
+
|
387
|
+
# Recomputes the words count as the largest column sum, but only when a
# words count is already set; otherwise does nothing and returns nil.
def fixwc
  @words_count = (0...@size).map { |i| col_sum(i) }.max if @words_count
end
|
391
|
+
|
392
|
+
protected
|
393
|
+
# Returns a fresh hash with the keys 'A', 'C', 'G', 'T' (in that order), each
# mapped to its own new array of +size+ nil entries.
def PM.new_matrix(size)
  ['A', 'C', 'G', 'T'].each_with_object({}) do |letter, matrix|
    matrix[letter] = Array.new(size)
  end
end
|
400
|
+
|
401
|
+
# Same as PM.new_matrix, but with keys for the whole IUPAC alphabet:
# A C G T R Y K M S W B D H V N (in that order). Every key gets its own new
# array of +size+ nil entries.
def PM.new_matrix_iupac(size)
  letters = ['A', 'C', 'G', 'T', 'R', 'Y', 'K', 'M', 'S', 'W', 'B', 'D', 'H', 'V', 'N']
  letters.each_with_object({}) do |letter, matrix|
    matrix[letter] = Array.new(size)
  end
end
|
420
|
+
|
421
|
+
end
|
422
|
+
|
423
|
+
class PPM < PM
|
424
|
+
|
425
|
+
#DEPRECATED, use iupacomp! instead
|
426
|
+
#def make_N_comp!
|
427
|
+
# @matrix['N'] = (0...size).collect { 0.25 }
|
428
|
+
# return self
|
429
|
+
#end
|
430
|
+
|
431
|
+
# size        - number of positions.
# matrix      - hash of per-letter columns; when nil an empty matrix is
#               created with PM.new_matrix(size).
# words_count - stored as given (may be nil).
#
# A given matrix whose 'A' column length differs from +size+ is reported
# through checkerr.
def initialize(size, matrix = nil, words_count = nil)
  checkerr("matrix['A'].size != size") { !matrix.nil? && matrix['A'].size != size }
  @size = size
  @matrix = matrix.nil? ? PM.new_matrix(size) : matrix
  @words_count = words_count
end
|
437
|
+
|
438
|
+
# Adds one column set per IUPAC_LS entry to the matrix, in place; each added
# cell is the sum of the component letters' cells divided by their number.
# An unset words count defaults to 4.0. Returns self.
def iupacomp!
  @words_count ||= 4.0

  IUPAC_LS.each do |code, letters|
    @matrix[code] = (0...@size).map { |i| col_sum(i, letters) / letters.size }
  end

  self
end
|
447
|
+
|
448
|
+
# Returns the product, over all positions, of the matrix cell selected by the
# word's letter at that position.
#
# Two conditions are reported through checkerr: a word whose length differs
# from the matrix size, and a word containing characters outside the accepted
# alphabet (the full IUPAC set when the matrix has an 'N' key, plain ACGT
# otherwise).
def score(word)
  checkerr("word size != ppm.size") { @size != word.size }
  checkerr("word #{word} has strange characters") do
    alphabet = @matrix.key?('N') ? 'ACGTRYKMSWBDHVN' : 'ACGT'
    # stripping every accepted letter must leave nothing behind
    word.tr(alphabet, '').size > 0
  end

  product = 1
  (0...@size).each { |pos| product *= @matrix[word[pos, 1]][pos] }
  product
end
|
457
|
+
|
458
|
+
# Returns the product, over all positions, of the largest of the four
# A/C/G/T cells at that position (1 for a zero-length matrix).
def best_score
  columns = @matrix.values_at('A', 'C', 'G', 'T')
  (0...size).inject(1) do |product, pos|
    product * columns.map { |column| column[pos] }.max
  end
end
|
463
|
+
|
464
|
+
# Returns the product, over all positions, of the smallest of the four
# A/C/G/T cells at that position (1 for a zero-length matrix).
#
# The accumulator starts at 1, the multiplicative identity; seeding the
# product with 0 would make the result zero for every matrix.
def worst_score
  (0...size).inject(1) do |product, pos|
    product * ['A', 'C', 'G', 'T'].collect { |letter| @matrix[letter][pos] }.min
  end
end
|
469
|
+
|
470
|
+
# Appends this matrix to +b+ as a "PPM" element (b must respond to
# add_element; the returned elements to add_element / add_text).
# The element carries a "length" attribute and, when the words count is set,
# a "words-count" attribute. One "pm-column" child (1-based "position") is
# added per position, holding a/c/g/t children whose text is the cell value.
def to_bismark(b)
  attributes = {"length" => @size}
  attributes["words-count"] = @words_count if @words_count
  root = b.add_element("PPM", attributes)
  (0...@matrix['A'].size).each do |pos|
    column = root.add_element("pm-column", {"position" => pos + 1})
    ['A', 'C', 'G', 'T'].each do |letter|
      column.add_element(letter.downcase).add_text(@matrix[letter][pos].to_s)
    end
  end
end
|
481
|
+
|
482
|
+
# Extends +probs+ in place with one entry per IUPAC_LS pair: the value is the
# sum of the component letters' entries divided by the number of component
# letters. Returns the same (mutated) +probs+ object.
def PPM.probs2IUPAC!(probs)
  IUPAC_LS.each do |code, letters|
    total = 0
    letters.each { |letter| total += probs[letter] }
    probs[code] = total / letters.size
  end
  probs
end
|
488
|
+
|
489
|
+
# Returns a new PM whose cells are log-odds weights computed from this
# matrix.
#
# words_count - number of words; falls back to @words_count when nil/false
#               or 0. A still-missing value is reported through checkerr.
# probs       - background weights per letter; a dup of it is extended with
#               IUPAC entries via PPM.probs2IUPAC!, the argument itself is
#               not modified.
# pseudocount - pseudocount weight.
#
# Each cell is
#   log((cell * words_count + probs[letter] * pseudocount) /
#       ((words_count + pseudocount) * probs[letter]))
# The result uses the IUPAC key set when this matrix has an 'N' column.
def get_pwm(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1.0)
  probs = PPM.probs2IUPAC!(probs.dup)

  words_count = @words_count unless words_count && words_count != 0
  checkerr("undefined words count") { !words_count }

  pwm = @matrix['N'] ? PM.new_matrix_iupac(@size) : PM.new_matrix(@size)

  @matrix.each do |letter, column|
    (0...@size).each do |pos|
      numerator = column[pos] * words_count + (probs[letter] * pseudocount)
      denominator = (words_count + pseudocount) * probs[letter]
      pwm[letter][pos] = Math::log(numerator / denominator)
    end
  end

  PM.new(@size, pwm, words_count)
  #pcm = get_pcm(words_count)
  #pcm.iupacomp! if @matrix['N']
  #return pcm.to_pwm!(words_count, probs, pseudocount)
end
alias to_pwm get_pwm
|
511
|
+
|
512
|
+
# Returns a new PM (words count nil) whose cells are
# log(cell / probs[letter]), i.e. log-odds weights without any pseudocount.
# This matrix is left unchanged.
def get_pwm0pc(probs = Randoom::DEF_PROBS)
  weights = @matrix.keys.each_with_object({}) do |letter, acc|
    acc[letter] = @matrix[letter].dup
  end
  # the PM is created first and the hash it was given is then filled in place
  newpm = PM.new(@size, weights, nil)

  weights.each do |letter, column|
    (0...@size).each do |pos|
      column[pos] = Math::log(@matrix[letter][pos] / probs[letter])
    end
  end

  newpm
end
|
525
|
+
|
526
|
+
# In-place conversion is not available for this class: always raises a
# RuntimeError pointing at to_pwm.
def to_pwm!
  raise RuntimeError, "cannot force PPM class to PWM, use to_pwm instead"
end
|
529
|
+
|
530
|
+
# Returns a new PM whose A/C/G/T cells are this matrix's cells multiplied by
# the words count, with iupacomp! applied to it.
#
# words_count - multiplier; falls back to @words_count when not given. A
#               still-missing value is reported through checkerr.
def get_pcm(words_count = nil)
  words_count ||= @words_count
  checkerr("undefined words count") { !words_count }

  counts = PM.new_matrix(@size)
  ['A', 'C', 'G', 'T'].each do |letter|
    (0...size).each { |pos| counts[letter][pos] = @matrix[letter][pos] * words_count }
  end

  PM.new(size, counts, words_count).iupacomp!
end
alias to_pcm get_pcm
|
543
|
+
|
544
|
+
# Builds a PPM (words count 4.0) from an IUPAC string.
# For every position the letters listed in IUPAC::REVCODE for that symbol
# each get 1.0 / (number of listed letters); the remaining A/C/G/T cells stay
# 0.0. iupacomp! is applied before the matrix is returned.
def PPM.from_IUPAC(iupac)
  matrix = {"A" => [], "C" => [], "G" => [], "T" => []}

  iupac.size.times do |pos|
    # open a new position, zero for every nucleotide
    matrix.each_value { |column| column << 0.0 }
    letters = IUPAC::REVCODE[iupac[pos]]
    share = 1.0 / letters.size
    letters.size.times { |j| matrix[letters[j]][-1] = share }
  end

  ppm = PPM.new(iupac.size, matrix, 4.0)
  ppm.iupacomp!
  ppm
end
|
560
|
+
|
561
|
+
end
|
562
|
+
end
|