sequence_logo 1.0.3 → 1.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,562 +1,562 @@
1
- module Ytilib
2
- class PM
3
-
4
- attr_reader :matrix, :size
5
- attr_accessor :words_count
6
-
7
- alias length size
8
-
9
- def score_mean(bckgr = Randoom::DEF_PROBS)
10
- (0...@size).inject(0.0) { |mean, i| mean += ['A','C','G','T'].inject(0.0) { |sum,l| sum += @matrix[l][i] * bckgr[l] } }
11
- end
12
-
13
- def score_variance(bckgr = Randoom::DEF_PROBS)
14
- (0...@size).inject(0.0) { |m2, i|
15
- deltai = ['A','C','G','T'].inject(0.0) { |sum,l| sum += @matrix[l][i]**2 * bckgr[l] } - ['A','C','G','T'].inject(0.0) { |sum,l| sum += matrix[l][i] * bckgr[l] }**2
16
- m2 += deltai
17
- }
18
- end
19
-
20
- def p_value(threshold, mean = nil, variance = nil)
21
- mean = mean ? mean : score_mean
22
- variance = variance ? variance : score_variance
23
- n_ = (threshold - mean) / Math.sqrt(variance)
24
- p_value = (1 - Math.erf2(n_/Math.sqrt(2))) / 2.0
25
- end
26
-
27
- def best_word
28
- return (0...size).inject("") { |word, i|
29
- max = ['A', 'C', 'G', 'T'].collect { |l| @matrix[l][i] }.max
30
- maxlets = ['A', 'C', 'G', 'T'].select { |l| @matrix[l][i] == max }
31
- word << (maxlets.size == 1 ? maxlets.first : "N")
32
- }
33
- end
34
-
35
- def strict_consensus
36
- return IUPAC.new((0...size).inject("") { |word, i|
37
- max = ['A', 'C', 'G', 'T'].collect { |l| @matrix[l][i] }.max
38
- maxlets = ['A', 'C', 'G', 'T'].inject("") { |lets, l| lets += @matrix[l][i] == max ? l : ""}
39
- word += IUPAC::CODE[maxlets]
40
- })
41
- end
42
-
43
- def consensus_string(beautiful = false)
44
- checkerr("words count is undefined") { !@words_count }
45
- i2o4, thc, tlc = icd2of4, icdThc, icdTlc
46
- icd = infocod
47
-
48
- return String.new((0...size).inject("") { |word, i|
49
-
50
- scores = ['A', 'C', 'G', 'T'].collect { |l| @matrix[l][i] }.uniq.sort.reverse
51
-
52
- if icd[i] > i2o4
53
- scores = [scores.first]
54
- elsif icd[i] > thc
55
- scores = scores[0..1]
56
- elsif icd[i] > tlc
57
- scores = scores[0..2]
58
- end
59
-
60
- lets = ['A', 'C', 'G', 'T'].inject("") { |lets, l| lets += scores.include?(@matrix[l][i]) ? l : ""}
61
-
62
- reslet = IUPAC::CODE[lets]
63
- reslet = reslet.downcase if beautiful && lets.size > 2
64
-
65
- word += reslet
66
- })
67
- end
68
-
69
- def consensus
70
- checkerr("words count is undefined") { !@words_count }
71
- i2o4, thc, tlc = icd2of4, icdThc, icdTlc
72
- icd = infocod
73
-
74
- return IUPAC.new((0...size).inject("") { |word, i|
75
-
76
- scores = ['A', 'C', 'G', 'T'].collect { |l| @matrix[l][i] }.uniq.sort.reverse
77
-
78
- if icd[i] > i2o4
79
- scores = [scores.first]
80
- elsif icd[i] > thc
81
- scores = scores[0..1]
82
- elsif icd[i] > tlc
83
- scores = scores[0..2]
84
- end
85
-
86
- lets = ['A', 'C', 'G', 'T'].inject("") { |lets, l| lets += scores.include?(@matrix[l][i]) ? l : ""}
87
-
88
- word += IUPAC::CODE[lets]
89
- })
90
- end
91
-
92
- def find_hit(s, score_g, use2strands = true)
93
- (0..(s.size - @size)).each { |i|
94
- seq, seq_rc = s[i, @size], s[i, @size].revcomp!
95
- score_p, score_rc = score(seq), score(seq_rc)
96
- r = use2strands ? [score_p,score_rc].max : score_p
97
- return i if r >= score_g
98
- }
99
- return nil
100
- end
101
-
102
- def find_hits(s, score_g, use2strands = true)
103
- (0..(s.size - @size)).select { |i|
104
- seq, seq_rc = s[i, @size], s[i, @size].revcomp!
105
- score_p, score_rc = score(seq), score(seq_rc)
106
- r = use2strands ? [score_p,score_rc].max : score_p
107
- r >= score_g ? i : nil
108
- }.compact
109
- end
110
-
111
- def collect_hits(s, score_g, use2strands = true)
112
- result = []
113
- (0..(s.size - @size)).each { |i|
114
- seq, seq_rc = s[i, @size], s[i, @size].revcomp!
115
- score_p, score_rc = score(seq.upcase), score(seq_rc.upcase)
116
- result << [score_p, seq, false, i] if score_p >= score_g
117
- result << [score_rc, seq_rc, true, i] if score_rc >= score_g
118
- }
119
- result
120
- end
121
-
122
- def best_hit(s, use2strands = true)
123
-
124
- checkerr("too short sequence") { s.size < @size }
125
- return (0..(s.size - @size)).inject(-Float::MAX) { |r, i|
126
- seq, seq_rc = s[i, @size], s[i, @size].revcomp!
127
- score_p, score_rc = score(seq), score(seq_rc)
128
- r = use2strands ? [r,score_p,score_rc].max : [r,score_p].max
129
- }
130
- end
131
-
132
- def eql?(pm)
133
- return ['A','C','G','T'].inject(true) { |equal, letter|
134
- equal = equal && @matrix[letter].eql?(pm.matrix[letter])
135
- }
136
- end
137
-
138
- def flexeql?(pm)
139
- checkerr("for what?") { true }
140
- return ['A','C','G','T'].inject(true) { |equal, letter|
141
- # report "letter=#{letter}"
142
- equal = equal && (0...@size).inject(true) { |deepequal, position|
143
- # report "position=#{position}, delta=#{@matrix[letter][position] - pm.matrix[letter][position]}"
144
- deepequal = deepequal && (@matrix[letter][position] - pm.matrix[letter][position]).abs < 10**-11
145
- }
146
- }
147
- end
148
-
149
- def initialize(size, matrix = nil, words_count = nil)
150
- checkerr("matrix['A'].size != size, #{matrix['A'].size} != #{size}") { matrix != nil && size != matrix['A'].size }
151
- @size = size
152
- @matrix = matrix == nil ? PM.new_matrix(size) : matrix
153
- if !words_count || words_count <= 0
154
- words_count = col_sum(0)
155
- @words_count = words_count.round >= 2 ? words_count.round : nil
156
- else
157
- @words_count = words_count
158
- end
159
- end
160
-
161
- def col_sum(index = 0, letset = ['A','C','G','T'])
162
- return letset.inject(0) { |sum, l| sum += @matrix[l][index] }
163
- end
164
-
165
- def PM.col_sum(matrix, index = 0)
166
- return matrix['A'][index] + matrix['C'][index] + matrix['G'][index] + matrix['T'][index]
167
- end
168
-
169
- def to_pwm!(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1)
170
- @words_count = words_count if words_count && words_count > 0
171
-
172
- @matrix.each_key do |letter|
173
- (0...@size).each { |pos|
174
-
175
- #p "pcm"
176
- #p @matrix[letter][pos]
177
- #p @matrix[letter][pos] + (probs[letter] * pseudocount)
178
- #p ( (@words_count + pseudocount) * probs[letter])
179
- #exit
180
-
181
- @matrix[letter][pos] = Math::log( (@matrix[letter][pos] + (probs[letter] * pseudocount)) / ( (@words_count + pseudocount) * probs[letter]) )
182
-
183
- }
184
- end
185
-
186
- return self
187
- end
188
-
189
- def get_pwm(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1)
190
- return self.dup.to_pwm!(words_count, probs, pseudocount)
191
- end
192
- alias to_pwm get_pwm
193
-
194
- def get_ppm(words_count = nil)
195
- words_count = @words_count unless words_count
196
- checkerr("undefined words count") { !words_count || words_count <= 0 }
197
- ppm = @matrix['N'] ? PM.new_matrix_iupac(@size) : PM.new_matrix(@size)
198
- @matrix.each_key { |letter|
199
- (0...@size).each { |i|
200
- ppm[letter][i] = @matrix[letter][i].to_f / words_count
201
- }
202
- }
203
- return PPM.new(@size, ppm, words_count)
204
- end
205
- alias to_ppm get_ppm
206
-
207
- def score(word)
208
- checkerr("word size != pwm.size") { @size != word.size }
209
- checkerr("word #{word} has strange characters") {
210
- @matrix.keys.include?('N') ? word.tr('ACGTRYKMSWBDHVN', '').size > 0 : word.tr('ACGT', '').size > 0
211
- }
212
- return (0...@size).inject(0) { |sum, i|
213
- sum += @matrix[word[i,1]][i]
214
- }
215
- end
216
-
217
- def best_score
218
- return (0...size).inject(0) { |sum, i|
219
- sum += ['A', 'C', 'G', 'T'].collect { |l| @matrix[l][i] }.max
220
- }
221
- end
222
-
223
- def worst_score
224
- return (0...size).inject(0) { |sum, i|
225
- sum += ['A', 'C', 'G', 'T'].collect { |l| @matrix[l][i] }.min
226
- }
227
- end
228
-
229
- def dup
230
- new_matrix = {}
231
- @matrix.each_key { |letter| new_matrix[letter] = @matrix[letter].dup }
232
- return PM.new(@size, new_matrix, @words_count)
233
- end
234
-
235
- def PM.new_pcm(words, iupacomp = false)
236
- size = words[0].size
237
- counts = PM.new_matrix(size)
238
- counts.each_value { |arr| arr.fill(0) }
239
- words.each { |word|
240
- 0.upto(size-1) { |i|
241
- letter = word[i,1].upcase
242
- checkerr("unknown letter #{letter}") { !['A', 'C', 'G', 'T', 'N'].include?(letter) }
243
- if letter != 'N'
244
- counts[letter][i] += 1
245
- else
246
- ['A', 'C', 'G', 'T'].each { |l| counts[l][i] += 0.25 }
247
- end
248
- }
249
- }
250
- newpcm = PM.new(size, counts, words.size)
251
- newpcm.iupacomp! if iupacomp
252
- return newpcm
253
- end
254
-
255
- def PM.new_pwm(words)
256
- pcm = PM.new_pcm(words)
257
- pcm.to_pwm!
258
- return pcm
259
- end
260
-
261
- def PM.load(filename)
262
- # supporting pat & pwm formats (letter-column and letter-row format)
263
- input = IO.read(filename)
264
- tm = []
265
- input.each_line { |line|
266
- l_a = line.split
267
- begin
268
- l_a = l_a.collect { |a_i| Float(a_i) }
269
- rescue
270
- next
271
- end
272
- tm << l_a
273
- }
274
- tm = tm.transpose if tm.size == 4
275
- matrix = PM.new_matrix(tm.size)
276
- tm.each_index { |i| ['A', 'C', 'G', 'T'].each_with_index { |l, j| matrix[l][i] = tm[i][j] } }
277
-
278
- ppm_mode = (0...tm.size).inject(true) { |ppm_ya, i| ppm_ya &= col_sum(matrix, i).round == 1 }
279
-
280
- return ppm_mode ? PPM.new(tm.size, matrix) : PM.new(tm.size, matrix)
281
- end
282
-
283
- def save(filename)
284
- File.open(filename, "w") { |out_f|
285
- case File.ext_wo_name(filename)
286
- when "pwm"
287
- ['A', 'C', 'G', 'T'].each { |letter|
288
- @matrix[letter].each { |e|
289
- out_f << "#{e} "
290
- }
291
- out_f << $/
292
- }
293
- when "pat"
294
- out_f.puts File.name_wo_ext(filename)
295
- (0...@size).each { |i|
296
- ['A', 'C', 'G', 'T'].each { |letter|
297
- out_f << "#{@matrix[letter][i]} "
298
- }
299
- out_f << $/
300
- }
301
- when "xml"
302
- checkerr("small-BiSMark is not supported at this moment")
303
- else
304
- checkerr("unknown motif file format specified")
305
- end
306
- }
307
- end
308
-
309
- def positiv!
310
- min = @matrix.values.collect { |v| v.min }.min.abs
311
- @matrix.each_value { |v| (0...v.size).each { |i| v[i] += min } }
312
- return self
313
- end
314
-
315
- def revcomp!
316
- @matrix['A'], @matrix['T'] = @matrix['T'], @matrix['A']
317
- @matrix['C'], @matrix['G'] = @matrix['G'], @matrix['C']
318
- @matrix.each_value { |v| v.reverse! }
319
- self
320
- end
321
-
322
- def to_bismark(b)
323
- pwm = @matrix['A'][0].is_a?(Float)
324
- attributes = {"length" => @size}
325
- attributes["words-count"] = @words_count if @words_count && @words_count > 0
326
- pe = b.add_element( pwm ? "PWM" : "PCM", attributes )
327
- (0...@matrix['A'].size).each { |i|
328
- pm_c = pe.add_element("pm-column", {"position" => i+1})
329
- ['A', 'C', 'G', 'T'].each { |l|
330
- pm_c.add_element(l.downcase).add_text(@matrix[l][i].to_s)
331
- }
332
- }
333
- end
334
-
335
- def PM.from_bismark(b, iupacomp = false)
336
-
337
- checkerr("empty small-BiSMark file?") { !b }
338
- float_m = (b.name == "PPM" || b.name == "PWM" || b.name == "WPCM")
339
- words_count = b.attributes["words-count"] ? b.attributes["words-count"].to_f : nil
340
-
341
- matrix = {"A" => [], "C" => [], "G" => [], "T" => []}
342
- b.elements.each("pm-column") { |pmc|
343
- position = pmc.attributes["position"].to_i
344
- ['A', 'C', 'G', 'T'].each { |l| matrix[l][position-1] = float_m ? pmc.elements[l.downcase].get_text.to_s.to_f : pmc.elements[l.downcase].get_text.to_s.to_i }
345
- }
346
- if b.name == "PPM"
347
- newppm = PPM.new(matrix['A'].size, matrix, words_count)
348
- newppm.iupacomp! if iupacomp
349
- return newppm
350
- end
351
- if b.name == "PCM"
352
- @words_count = col_sum(matrix)
353
- newpcm = PM.new(matrix['A'].size, matrix, words_count)
354
- newpcm.iupacomp! if iupacomp
355
- return newpcm
356
- end
357
- if b.name == "PWM" && iupacomp
358
- raise "cannot force IUPAC compatible PWM"
359
- end
360
- return PM.new(matrix['A'].size, matrix, words_count)
361
- end
362
-
363
- IUPAC_LS = (IUPAC::CODE.keys - ['A','C','G','T']).collect { |iul| [IUPAC::CODE[iul], iul.split(//)] }
364
- def iupacomp!
365
- @words_count = (0...@size).collect { |i| col_sum(i) }.max unless @words_count # for unbalanced matrices (Genomatix has some)
366
- # @words_count = @words_count.round < 2.0 ? nil : @words_count.round
367
-
368
- IUPAC_LS.each { |iul_ls|
369
- @matrix[iul_ls[0]] = (0...@size).collect { |i| col_sum(i, iul_ls[1]) / iul_ls[1].size }
370
- }
371
-
372
- return self
373
- end
374
-
375
- def m3sd(bckgr = Randoom::DEF_PROBS)
376
-
377
- mean = (0...@size).inject(0.0) { |mean, i| mean += ['A','C','G','T'].inject(0.0) { |sum,l| sum += @matrix[l][i] * bckgr[l] } }
378
- dev = (0...@size).inject(0.0) { |m2, i|
379
- deltai = ['A','C','G','T'].inject(0.0) { |sum,l| sum += @matrix[l][i]**2 * bckgr[l] } - ['A','C','G','T'].inject(0.0) { |sum,l| sum += matrix[l][i] * bckgr[l] }**2
380
- m2 += deltai
381
- }
382
- sigma = Math.sqrt(dev)
383
-
384
- mean+3*sigma
385
- end
386
-
387
- def fixwc
388
- return unless @words_count
389
- @words_count = (0...@size).collect { |i| col_sum(i) }.max
390
- end
391
-
392
- protected
393
- def PM.new_matrix(size)
394
- return {
395
- 'A' => Array.new(size),
396
- 'C' => Array.new(size),
397
- 'G' => Array.new(size),
398
- 'T' => Array.new(size) }
399
- end
400
-
401
- def PM.new_matrix_iupac(size)
402
- return {
403
- 'A' => Array.new(size),
404
- 'C' => Array.new(size),
405
- 'G' => Array.new(size),
406
- 'T' => Array.new(size),
407
- 'R' => Array.new(size),
408
- 'Y' => Array.new(size),
409
- 'K' => Array.new(size),
410
- 'M' => Array.new(size),
411
- 'S' => Array.new(size),
412
- 'W' => Array.new(size),
413
- 'B' => Array.new(size),
414
- 'D' => Array.new(size),
415
- 'H' => Array.new(size),
416
- 'V' => Array.new(size),
417
- 'N' => Array.new(size)
418
- }
419
- end
420
-
421
- end
422
-
423
- class PPM < PM
424
-
425
- #DEPRECATED, use iupacomp! instead
426
- #def make_N_comp!
427
- # @matrix['N'] = (0...size).collect { 0.25 }
428
- # return self
429
- #end
430
-
431
- def initialize(size, matrix = nil, words_count = nil)
432
- checkerr("matrix['A'].size != size") { matrix != nil && size != matrix['A'].size }
433
- @size = size
434
- @matrix = matrix == nil ? PM.new_matrix(size) : matrix
435
- @words_count = words_count
436
- end
437
-
438
- def iupacomp!
439
- @words_count = 4.0 unless @words_count
440
-
441
- IUPAC_LS.each { |iul_ls|
442
- @matrix[iul_ls[0]] = (0...@size).collect { |i| col_sum(i, iul_ls[1]) / iul_ls[1].size }
443
- }
444
-
445
- return self
446
- end
447
-
448
- def score(word)
449
- checkerr("word size != ppm.size") { @size != word.size }
450
- checkerr("word #{word} has strange characters") {
451
- @matrix.keys.include?('N') ? word.tr('ACGTRYKMSWBDHVN', '').size > 0 : word.tr('ACGT', '').size > 0
452
- }
453
- return (0...@size).inject(1) { |mul, i|
454
- mul *= @matrix[word[i,1]][i]
455
- }
456
- end
457
-
458
- def best_score
459
- return (0...size).inject(1) { |mul, i|
460
- mul *= ['A', 'C', 'G', 'T'].collect { |l| @matrix[l][i] }.max
461
- }
462
- end
463
-
464
- def worst_score
465
- return (0...size).inject(0) { |mul, i|
466
- mul *= ['A', 'C', 'G', 'T'].collect { |l| @matrix[l][i] }.min
467
- }
468
- end
469
-
470
- def to_bismark(b)
471
- attributes = {"length" => @size}
472
- attributes["words-count"] = @words_count if @words_count
473
- pe = b.add_element("PPM", attributes)
474
- (0...@matrix['A'].size).each { |i|
475
- pm_c = pe.add_element("pm-column", {"position" => i+1})
476
- ['A', 'C', 'G', 'T'].each { |l|
477
- pm_c.add_element(l.downcase).add_text(@matrix[l][i].to_s)
478
- }
479
- }
480
- end
481
-
482
- def PPM.probs2IUPAC!(probs)
483
- IUPAC_LS.each { |iul_ls|
484
- probs[iul_ls[0]] = iul_ls[1].inject(0) { |sum, l| sum += probs[l] } / iul_ls[1].size
485
- }
486
- return probs
487
- end
488
-
489
- def get_pwm(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1.0)
490
-
491
- probs = PPM.probs2IUPAC!(probs.dup)
492
-
493
- words_count = @words_count if !words_count || words_count == 0
494
- checkerr("undefined words count") { !words_count }
495
-
496
- pwm = @matrix['N'] ? PM.new_matrix_iupac(@size) : PM.new_matrix(@size)
497
-
498
- @matrix.each_key do |letter|
499
- (0...@size).each { |pos|
500
-
501
- pwm[letter][pos] = Math::log( (@matrix[letter][pos] * words_count + (probs[letter] * pseudocount) ) / ( (words_count + pseudocount) * probs[letter]) )
502
-
503
- }
504
- end
505
- return PM.new(@size, pwm, words_count)
506
- #pcm = get_pcm(words_count)
507
- #pcm.iupacomp! if @matrix['N']
508
- #return pcm.to_pwm!(words_count, probs, pseudocount)
509
- end
510
- alias to_pwm get_pwm
511
-
512
- def get_pwm0pc(probs = Randoom::DEF_PROBS)
513
- new_matrix = {}
514
- @matrix.each_key { |letter| new_matrix[letter] = @matrix[letter].dup }
515
- newpm = PM.new(@size, new_matrix, nil)
516
-
517
- new_matrix.each_key do |letter|
518
- (0...@size).each { |pos|
519
- new_matrix[letter][pos] = Math::log(@matrix[letter][pos] / probs[letter])
520
- }
521
- end
522
-
523
- return newpm
524
- end
525
-
526
- def to_pwm!
527
- raise "cannot force PPM class to PWM, use to_pwm instead"
528
- end
529
-
530
- def get_pcm(words_count = nil)
531
- words_count = @words_count unless words_count
532
- checkerr("undefined words count") { !words_count }
533
- counts = PM.new_matrix(@size)
534
- (0...size).each { |i|
535
- ['A', 'C', 'G', 'T'].each { |l|
536
- counts[l][i] = @matrix[l][i] * words_count
537
- }
538
- }
539
- newpcm = PM.new(size, counts, words_count).iupacomp!
540
- return newpcm
541
- end
542
- alias to_pcm get_pcm
543
-
544
- def PPM.from_IUPAC(iupac)
545
- matrix = {"A" => [], "C" => [], "G" => [], "T" => []}
546
-
547
- (0...iupac.size).each { |i|
548
- matrix.each_key { |k| matrix[k] << 0.0 }
549
- letters = IUPAC::REVCODE[iupac[i]]
550
- (0...letters.size).each { |j|
551
- matrix[letters[j]][-1] = 1.0/letters.size
552
- }
553
- }
554
-
555
- newppm = PPM.new(iupac.size, matrix, 4.0)
556
- newppm.iupacomp!
557
-
558
- newppm
559
- end
560
-
561
- end
562
- end
1
+ module Ytilib
2
+ class PM
3
+
4
+ attr_reader :matrix, :size
5
+ attr_accessor :words_count
6
+
7
+ alias length size
8
+
9
# Sums, over all positions, the background-weighted average of the four
# nucleotide entries in that column.
#
# bckgr - hash keyed by 'A','C','G','T' giving the weight of each letter.
def score_mean(bckgr = Randoom::DEF_PROBS)
  total = 0.0
  (0...@size).each do |pos|
    column_mean = 0.0
    ['A', 'C', 'G', 'T'].each { |base| column_mean += @matrix[base][pos] * bckgr[base] }
    total += column_mean
  end
  total
end
12
+
13
# Sums, over all positions, the per-column variance of the matrix entries
# under the given letter weights: E[x^2] - (E[x])^2 for each column.
#
# bckgr - hash keyed by 'A','C','G','T' giving the weight of each letter.
def score_variance(bckgr = Randoom::DEF_PROBS)
  letters = ['A', 'C', 'G', 'T']
  variance = 0.0
  (0...@size).each do |pos|
    second_moment = letters.inject(0.0) { |acc, base| acc + @matrix[base][pos]**2 * bckgr[base] }
    first_moment = letters.inject(0.0) { |acc, base| acc + @matrix[base][pos] * bckgr[base] }
    variance += second_moment - first_moment**2
  end
  variance
end
19
+
20
# Evaluates (1 - erf2(z / sqrt(2))) / 2 for z = (threshold - mean) / sqrt(variance).
# Missing mean / variance fall back to score_mean / score_variance.
# Math.erf2 is not part of Ruby's standard Math module; it is expected to be
# provided elsewhere in the project.
def p_value(threshold, mean = nil, variance = nil)
  mean ||= score_mean
  variance ||= score_variance
  z = (threshold - mean) / Math.sqrt(variance)
  (1 - Math.erf2(z / Math.sqrt(2))) / 2.0
end
26
+
27
# Builds a word taking, at each position, the single letter with the highest
# matrix value; positions where the maximum is shared get "N".
def best_word
  word = ""
  (0...size).each do |pos|
    column = ['A', 'C', 'G', 'T'].map { |base| [base, @matrix[base][pos]] }
    top = column.map { |_, value| value }.max
    winners = column.select { |_, value| value == top }
    word << (winners.size == 1 ? winners.first.first : "N")
  end
  word
end
34
+
35
# For every position, collects all letters that reach the column maximum
# (in A, C, G, T order), maps that letter set through IUPAC::CODE, and wraps
# the resulting word in an IUPAC object.
def strict_consensus
  letters = ['A', 'C', 'G', 'T']
  word = ""
  (0...size).each do |pos|
    top = letters.map { |base| @matrix[base][pos] }.max
    tied = letters.select { |base| @matrix[base][pos] == top }.join
    word += IUPAC::CODE[tied]
  end
  IUPAC.new(word)
end
42
+
43
# Builds a consensus word as a plain String.
#
# Per position the distinct column values are ranked in descending order and
# the top 1, 2 or 3 of them are kept depending on which of the thresholds
# icd2of4 / icdThc / icdTlc the position's infocod value exceeds (all values
# are kept when it exceeds none). Letters whose value is among the kept ones
# are looked up in IUPAC::CODE.
#
# beautiful - when true, codes standing for more than two letters are
#             lower-cased.
def consensus_string(beautiful = false)
  checkerr("words count is undefined") { !@words_count }
  i2o4, thc, tlc = icd2of4, icdThc, icdTlc
  icd = infocod
  letters = ['A', 'C', 'G', 'T']

  result = ""
  (0...size).each do |pos|
    ranked = letters.map { |base| @matrix[base][pos] }.uniq.sort.reverse

    keep =
      if icd[pos] > i2o4 then 1
      elsif icd[pos] > thc then 2
      elsif icd[pos] > tlc then 3
      end
    ranked = ranked.first(keep) if keep

    chosen = letters.select { |base| ranked.include?(@matrix[base][pos]) }.join

    code = IUPAC::CODE[chosen]
    code = code.downcase if beautiful && chosen.size > 2

    result += code
  end
  String.new(result)
end
68
+
69
# Builds a consensus word and wraps it in an IUPAC object.
#
# Per position the distinct column values are ranked in descending order and
# the top 1, 2 or 3 of them are kept depending on which of the thresholds
# icd2of4 / icdThc / icdTlc the position's infocod value exceeds (all values
# are kept when it exceeds none). Letters whose value is among the kept ones
# are looked up in IUPAC::CODE.
def consensus
  checkerr("words count is undefined") { !@words_count }
  i2o4, thc, tlc = icd2of4, icdThc, icdTlc
  icd = infocod
  letters = ['A', 'C', 'G', 'T']

  result = ""
  (0...size).each do |pos|
    ranked = letters.map { |base| @matrix[base][pos] }.uniq.sort.reverse

    keep =
      if icd[pos] > i2o4 then 1
      elsif icd[pos] > thc then 2
      elsif icd[pos] > tlc then 3
      end
    ranked = ranked.first(keep) if keep

    chosen = letters.select { |base| ranked.include?(@matrix[base][pos]) }.join

    result += IUPAC::CODE[chosen]
  end
  IUPAC.new(result)
end
91
+
92
# Returns the first offset in s whose window of @size characters scores at
# least score_g, or nil when there is none. With use2strands the better of
# the direct and reverse-complement scores is compared; otherwise only the
# direct score is (the reverse-complement score is still computed).
def find_hit(s, score_g, use2strands = true)
  (0..(s.size - @size)).find do |offset|
    window = s[offset, @size]
    window_rc = s[offset, @size].revcomp!
    direct = score(window)
    reverse = score(window_rc)
    best = use2strands ? [direct, reverse].max : direct
    best >= score_g
  end
end
101
+
102
# Returns an array of every offset in s whose window of @size characters
# scores at least score_g. With use2strands the better of the direct and
# reverse-complement scores is compared; otherwise only the direct score.
def find_hits(s, score_g, use2strands = true)
  (0..(s.size - @size)).select do |offset|
    window = s[offset, @size]
    window_rc = s[offset, @size].revcomp!
    direct = score(window)
    reverse = score(window_rc)
    (use2strands ? [direct, reverse].max : direct) >= score_g
  end
end
110
+
111
# Collects every window of s (length @size) whose score reaches score_g.
#
# Each hit is an array [score, sequence, reverse_strand?, offset]. Windows are
# upper-cased for scoring only; the sequence stored in the hit keeps its
# original case.
#
# s           - sequence to scan.
# score_g     - minimal score for a window to be reported.
# use2strands - when true (default) the reverse complement of each window is
#               scored and reported as well. Previously this flag was accepted
#               but ignored, so reverse-strand hits were returned even when the
#               caller asked for a single strand.
#
# Returns the array of hits ordered by offset, direct strand first.
def collect_hits(s, score_g, use2strands = true)
  result = []
  (0..(s.size - @size)).each do |i|
    seq = s[i, @size]
    score_p = score(seq.upcase)
    result << [score_p, seq, false, i] if score_p >= score_g

    next unless use2strands

    seq_rc = s[i, @size].revcomp!
    score_rc = score(seq_rc.upcase)
    result << [score_rc, seq_rc, true, i] if score_rc >= score_g
  end
  result
end
121
+
122
# Returns the highest score over all windows of s (length @size). With
# use2strands the reverse-complement score of each window competes as well.
# checkerr is invoked with a block that is true when s is shorter than @size.
def best_hit(s, use2strands = true)
  checkerr("too short sequence") { s.size < @size }
  best = -Float::MAX
  (0..(s.size - @size)).each do |offset|
    window = s[offset, @size]
    window_rc = s[offset, @size].revcomp!
    candidates = [best, score(window)]
    rc_score = score(window_rc)
    candidates << rc_score if use2strands
    best = candidates.max
  end
  best
end
131
+
132
# True when the A, C, G and T columns of this matrix are eql? to the
# corresponding columns of pm.matrix.
def eql?(pm)
  ['A', 'C', 'G', 'T'].all? { |letter| @matrix[letter].eql?(pm.matrix[letter]) }
end
137
+
138
# Tolerant comparison: true when every A/C/G/T entry differs from the
# corresponding entry of pm.matrix by less than 10**-11.
# NOTE(review): the method first calls checkerr with a block that is always
# true, so it presumably aborts before comparing anything - confirm against
# checkerr's definition.
def flexeql?(pm)
  checkerr("for what?") { true }
  tolerance = 10**-11
  ['A', 'C', 'G', 'T'].all? do |letter|
    (0...@size).all? do |position|
      (@matrix[letter][position] - pm.matrix[letter][position]).abs < tolerance
    end
  end
end
148
+
149
# Creates a positional matrix.
#
# size        - number of positions (columns).
# matrix      - optional hash 'A'/'C'/'G'/'T' => Array of values; when omitted
#               a matrix of unfilled (nil) columns is allocated.
# words_count - optional positive word count. When missing or non-positive it
#               is derived from the sum of the first column, rounded, and kept
#               only if that is at least 2 (nil otherwise).
#
# checkerr is invoked with a block that is true when the given matrix has an
# 'A' column whose length differs from size.
def initialize(size, matrix = nil, words_count = nil)
  # The detailed message is only built when a matrix is actually present;
  # interpolating matrix['A'].size unconditionally raised NoMethodError for
  # the documented default matrix = nil.
  mismatch = matrix != nil && size != matrix['A'].size
  message = mismatch ? "matrix['A'].size != size, #{matrix['A'].size} != #{size}" : "matrix['A'].size != size"
  checkerr(message) { mismatch }

  @size = size
  @matrix = matrix == nil ? PM.new_matrix(size) : matrix

  if !words_count || words_count <= 0
    # A freshly allocated (or zero-width) matrix has no first column to sum;
    # leave the word count undefined instead of failing on nil arithmetic.
    if ['A', 'C', 'G', 'T'].any? { |l| @matrix[l][0].nil? }
      @words_count = nil
    else
      derived = col_sum(0).round
      @words_count = derived >= 2 ? derived : nil
    end
  else
    @words_count = words_count
  end
end
160
+
161
# Adds up the matrix values at position index for the letters in letset
# (all four nucleotides by default).
def col_sum(index = 0, letset = ['A','C','G','T'])
  total = 0
  letset.each { |base| total += @matrix[base][index] }
  total
end
164
+
165
# Class-level variant: adds the A, C, G and T values of the given matrix hash
# at position index.
def self.col_sum(matrix, index = 0)
  ['A', 'C', 'G', 'T'].map { |base| matrix[base][index] }.inject(:+)
end
168
+
169
# Replaces every matrix entry x, in place, with
#   log((x + p * pseudocount) / ((words_count + pseudocount) * p))
# where p is probs[letter]. A positive words_count argument overrides the
# stored @words_count first. Returns self.
def to_pwm!(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1)
  @words_count = words_count if words_count && words_count > 0

  @matrix.each do |letter, column|
    (0...@size).each do |pos|
      numerator = column[pos] + (probs[letter] * pseudocount)
      denominator = (@words_count + pseudocount) * probs[letter]
      column[pos] = Math::log(numerator / denominator)
    end
  end

  self
end
188
+
189
# Non-destructive counterpart of to_pwm!: converts a copy (made with dup) and
# returns it, leaving the receiver untouched. Also available as to_pwm.
def get_pwm(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1)
  copy = dup
  copy.to_pwm!(words_count, probs, pseudocount)
end
alias to_pwm get_pwm
193
+
194
# Returns a new PPM whose entries are this matrix's entries divided by the
# word count (the argument, or @words_count when none is given). The result
# gets the full IUPAC letter set when this matrix has an 'N' column.
# checkerr is invoked with a block that is true for a missing or
# non-positive word count. Also available as to_ppm.
def get_ppm(words_count = nil)
  words_count ||= @words_count
  checkerr("undefined words count") { !words_count || words_count <= 0 }
  ppm = @matrix['N'] ? PM.new_matrix_iupac(@size) : PM.new_matrix(@size)
  @matrix.each do |letter, column|
    (0...@size).each { |pos| ppm[letter][pos] = column[pos].to_f / words_count }
  end
  PPM.new(@size, ppm, words_count)
end
alias to_ppm get_ppm
206
+
207
# Sums the matrix entries selected by each character of word at its position.
# checkerr is invoked for a length mismatch and for characters outside the
# accepted alphabet (the IUPAC set when the matrix has an 'N' column, plain
# ACGT otherwise).
def score(word)
  checkerr("word size != pwm.size") { @size != word.size }
  checkerr("word #{word} has strange characters") do
    alphabet = @matrix.key?('N') ? 'ACGTRYKMSWBDHVN' : 'ACGT'
    word.tr(alphabet, '').size > 0
  end
  total = 0
  (0...@size).each { |pos| total += @matrix[word[pos, 1]][pos] }
  total
end
216
+
217
# Sum over all positions of the largest A/C/G/T entry in each column.
def best_score
  total = 0
  (0...size).each do |pos|
    total += ['A', 'C', 'G', 'T'].map { |base| @matrix[base][pos] }.max
  end
  total
end
222
+
223
# Sum over all positions of the smallest A/C/G/T entry in each column.
def worst_score
  total = 0
  (0...size).each do |pos|
    total += ['A', 'C', 'G', 'T'].map { |base| @matrix[base][pos] }.min
  end
  total
end
228
+
229
# Returns a new PM built from copies of every column array, so that changing
# the copy's columns does not touch this matrix. The word count is passed on.
def dup
  copied = {}
  @matrix.each { |letter, column| copied[letter] = column.dup }
  PM.new(@size, copied, @words_count)
end
234
+
235
# Builds a count matrix from a list of words. Each A/C/G/T occurrence adds 1
# to its letter at that position; an 'N' adds 0.25 to each of the four
# letters. Letters are upper-cased first and checkerr is invoked for anything
# outside A, C, G, T, N. The width is taken from the first word.
#
# iupacomp - when true, iupacomp! is applied to the result.
def self.new_pcm(words, iupacomp = false)
  size = words[0].size
  counts = PM.new_matrix(size)
  counts.each_value { |column| column.fill(0) }

  words.each do |word|
    (0...size).each do |i|
      letter = word[i, 1].upcase
      checkerr("unknown letter #{letter}") { !['A', 'C', 'G', 'T', 'N'].include?(letter) }
      if letter == 'N'
        ['A', 'C', 'G', 'T'].each { |l| counts[l][i] += 0.25 }
      else
        counts[letter][i] += 1
      end
    end
  end

  newpcm = PM.new(size, counts, words.size)
  newpcm.iupacomp! if iupacomp
  newpcm
end
254
+
255
# Builds a count matrix from words with PM.new_pcm, converts it in place with
# to_pwm! (default arguments) and returns that same object.
def self.new_pwm(words)
  PM.new_pcm(words).tap { |pcm| pcm.to_pwm! }
end
260
+
261
# Loads a matrix from a text file in "pat" or "pwm" layout (letter-column or
# letter-row format).
#
# Every line consisting solely of numbers becomes one row; lines containing
# anything that Float() rejects (e.g. a title line) and blank lines are
# skipped. If exactly four rows were read they are taken as the A, C, G, T
# rows and transposed. When every column sum rounds to 1 a PPM is returned,
# otherwise a PM.
#
# filename - path of the file to read.
def self.load(filename)
  # File.read instead of IO.read: IO.read treats a name starting with "|" as
  # a command to run, which must never happen for a motif path.
  input = File.read(filename)

  rows = []
  input.each_line do |line|
    fields = line.split
    # A blank line used to be stored as an empty row, which broke the
    # transpose and left nil cells in the matrix.
    next if fields.empty?
    begin
      rows << fields.map { |field| Float(field) }
    rescue ArgumentError
      # Not a purely numeric line (header, comment): ignore it.
      next
    end
  end

  rows = rows.transpose if rows.size == 4
  matrix = PM.new_matrix(rows.size)
  rows.each_index do |i|
    ['A', 'C', 'G', 'T'].each_with_index { |l, j| matrix[l][i] = rows[i][j] }
  end

  ppm_mode = (0...rows.size).all? { |i| col_sum(matrix, i).round == 1 }

  ppm_mode ? PPM.new(rows.size, matrix) : PM.new(rows.size, matrix)
end
282
+
283
# Writes the A/C/G/T part of the matrix to filename; the layout is chosen by
# File.ext_wo_name(filename) (a project helper, not core Ruby):
#   "pwm" - one line per letter, values separated by single spaces;
#   "pat" - File.name_wo_ext(filename) on the first line, then one line per
#           position holding the A, C, G, T values;
#   "xml" and anything else - checkerr is called with an explanatory message.
def save(filename)
  letters = ['A', 'C', 'G', 'T']
  File.open(filename, "w") do |out_f|
    case File.ext_wo_name(filename)
    when "pwm"
      letters.each do |letter|
        @matrix[letter].each { |e| out_f << "#{e} " }
        out_f << $/
      end
    when "pat"
      out_f.puts File.name_wo_ext(filename)
      (0...@size).each do |i|
        letters.each { |letter| out_f << "#{@matrix[letter][i]} " }
        out_f << $/
      end
    when "xml"
      checkerr("small-BiSMark is not supported at this moment")
    else
      checkerr("unknown motif file format specified")
    end
  end
end
308
+
309
# Adds the absolute value of the smallest matrix entry to every entry, in
# place. Returns self.
def positiv!
  shift = @matrix.values.map { |column| column.min }.min.abs
  @matrix.each_value { |column| column.map! { |value| value + shift } }
  self
end
314
+
315
# Turns the matrix into its reverse complement, in place: complementary
# letter columns are exchanged and every column is reversed. Returns self.
#
# Besides A/T and C/G, the ambiguity-code columns that iupacomp! may have
# added are exchanged as well (R/Y, K/M, B/V, D/H). S, W and N are their own
# complements and are only reversed. Previously the ambiguity columns were
# reversed without being exchanged, leaving them inconsistent with the
# swapped A/C/G/T columns.
def revcomp!
  complement_pairs = [['A', 'T'], ['C', 'G'], ['R', 'Y'], ['K', 'M'], ['B', 'V'], ['D', 'H']]
  complement_pairs.each do |x, y|
    # Skip pairs the matrix does not carry so no nil columns are introduced.
    next unless @matrix.key?(x) && @matrix.key?(y)
    @matrix[x], @matrix[y] = @matrix[y], @matrix[x]
  end
  @matrix.each_value { |v| v.reverse! }
  self
end
321
+
322
# Appends this matrix to element b: one child named "PWM" when the first 'A'
# entry is a Float and "PCM" otherwise, carrying a "length" attribute and,
# for a positive word count, "words-count". Each position becomes a
# "pm-column" child (1-based "position") with a/c/g/t children holding the
# values as text.
def to_bismark(b)
  tag = @matrix['A'][0].is_a?(Float) ? "PWM" : "PCM"
  attributes = {"length" => @size}
  attributes["words-count"] = @words_count if @words_count && @words_count > 0
  root = b.add_element(tag, attributes)
  (0...@matrix['A'].size).each do |i|
    column = root.add_element("pm-column", {"position" => i + 1})
    ['A', 'C', 'G', 'T'].each do |l|
      column.add_element(l.downcase).add_text(@matrix[l][i].to_s)
    end
  end
end
334
+
335
# Builds a matrix object from a small-BiSMark element b.
#
# The element name selects the result: "PPM" gives a PPM, everything else a
# PM. Values are parsed as floats for "PPM", "PWM" and "WPCM" elements and as
# integers otherwise. Columns come from the "pm-column" children, placed by
# their 1-based "position" attribute. The optional "words-count" attribute is
# passed on as a float.
#
# iupacomp - when true, iupacomp! is applied to PPM / PCM results; asking for
#            it on a "PWM" element raises.
#
# Raises RuntimeError for a "PWM" element combined with iupacomp.
def self.from_bismark(b, iupacomp = false)
  checkerr("empty small-BiSMark file?") { !b }
  float_m = (b.name == "PPM" || b.name == "PWM" || b.name == "WPCM")
  words_count = b.attributes["words-count"] ? b.attributes["words-count"].to_f : nil

  matrix = {"A" => [], "C" => [], "G" => [], "T" => []}
  b.elements.each("pm-column") do |pmc|
    position = pmc.attributes["position"].to_i
    ['A', 'C', 'G', 'T'].each do |l|
      text = pmc.elements[l.downcase].get_text.to_s
      matrix[l][position - 1] = float_m ? text.to_f : text.to_i
    end
  end

  if b.name == "PPM"
    newppm = PPM.new(matrix['A'].size, matrix, words_count)
    newppm.iupacomp! if iupacomp
    return newppm
  end

  if b.name == "PCM"
    # The former "@words_count = col_sum(matrix)" here only set an instance
    # variable on the class object that nothing reads, and raised on a
    # matrix without columns. PM.new derives a missing word count itself.
    newpcm = PM.new(matrix['A'].size, matrix, words_count)
    newpcm.iupacomp! if iupacomp
    return newpcm
  end

  raise "cannot force IUPAC compatible PWM" if b.name == "PWM" && iupacomp

  PM.new(matrix['A'].size, matrix, words_count)
end
362
+
363
# Pairs of [code, member letters] for every IUPAC::CODE key other than the
# four plain nucleotides; each key string is split into its single letters.
IUPAC_LS = (IUPAC::CODE.keys - ['A', 'C', 'G', 'T']).map { |members| [IUPAC::CODE[members], members.chars] }
364
# Extends the matrix, in place, with one column per IUPAC ambiguity code
# listed in IUPAC_LS: each new entry is the mean of the entries of the
# letters the code stands for. Returns self.
#
# When no word count is set it is taken as the largest column sum (some
# matrices are unbalanced, i.e. their columns do not sum to the same value).
#
# The mean is computed in floating point. With integer count matrices the
# previous integer division truncated it (e.g. counts 1 and 2 averaged to 1
# instead of 1.5).
def iupacomp!
  @words_count = (0...@size).collect { |i| col_sum(i) }.max unless @words_count

  IUPAC_LS.each do |code, members|
    @matrix[code] = (0...@size).collect { |i| col_sum(i, members) / members.size.to_f }
  end

  self
end
374
+
375
# Returns mean + 3 * sigma, where mean is the sum of the background-weighted
# column averages and sigma the square root of the summed column variances
# (E[x^2] - (E[x])^2 per column) under the same weights.
def m3sd(bckgr = Randoom::DEF_PROBS)
  letters = ['A', 'C', 'G', 'T']
  mean = 0.0
  dev = 0.0
  (0...@size).each do |pos|
    first_moment = letters.inject(0.0) { |acc, l| acc + @matrix[l][pos] * bckgr[l] }
    second_moment = letters.inject(0.0) { |acc, l| acc + @matrix[l][pos]**2 * bckgr[l] }
    mean += first_moment
    dev += second_moment - first_moment**2
  end
  mean + 3 * Math.sqrt(dev)
end
386
+
387
# When a word count is already set, replaces it with the largest column sum
# and returns that value; does nothing (returns nil) otherwise.
def fixwc
  @words_count = (0...@size).map { |i| col_sum(i) }.max if @words_count
end
391
+
392
+ protected
393
# Allocates an empty matrix hash: one array of size nil slots for each of
# 'A', 'C', 'G' and 'T'.
def self.new_matrix(size)
  matrix = {}
  ['A', 'C', 'G', 'T'].each { |letter| matrix[letter] = Array.new(size) }
  matrix
end
400
+
401
# Allocates an empty matrix hash with one array of size nil slots for each of
# the fifteen letters A C G T R Y K M S W B D H V N.
def self.new_matrix_iupac(size)
  matrix = {}
  'ACGTRYKMSWBDHVN'.each_char { |letter| matrix[letter] = Array.new(size) }
  matrix
end
420
+
421
+ end
422
+
423
+ class PPM < PM
424
+
425
+ #DEPRECATED, use iupacomp! instead
426
+ #def make_N_comp!
427
+ # @matrix['N'] = (0...size).collect { 0.25 }
428
+ # return self
429
+ #end
430
+
431
# Creates a PPM. Unlike PM#initialize the word count is stored exactly as
# given (it is never derived from the matrix). checkerr is invoked with a
# block that is true when a matrix is given whose 'A' column length differs
# from size.
def initialize(size, matrix = nil, words_count = nil)
  checkerr("matrix['A'].size != size") do
    matrix != nil && size != matrix['A'].size
  end
  @size = size
  @words_count = words_count
  @matrix = matrix
  @matrix = PM.new_matrix(size) if matrix == nil
end
437
+
438
# Extends the matrix, in place, with one column per IUPAC ambiguity code
# listed in IUPAC_LS, each entry being the column sum over the code's letters
# divided by their number. A missing word count is set to 4.0. Returns self.
def iupacomp!
  @words_count ||= 4.0

  IUPAC_LS.each do |code, members|
    @matrix[code] = (0...@size).map { |pos| col_sum(pos, members) / members.size }
  end

  self
end
447
+
448
# Multiplies together the matrix entries selected by each character of word
# at its position. checkerr is invoked for a length mismatch and for
# characters outside the accepted alphabet (the IUPAC set when the matrix has
# an 'N' column, plain ACGT otherwise).
def score(word)
  checkerr("word size != ppm.size") { @size != word.size }
  checkerr("word #{word} has strange characters") do
    alphabet = @matrix.key?('N') ? 'ACGTRYKMSWBDHVN' : 'ACGT'
    word.tr(alphabet, '').size > 0
  end
  product = 1
  (0...@size).each { |pos| product *= @matrix[word[pos, 1]][pos] }
  product
end
457
+
458
# Product over all positions of the largest A/C/G/T entry in each column.
def best_score
  product = 1
  (0...size).each do |pos|
    product *= ['A', 'C', 'G', 'T'].map { |base| @matrix[base][pos] }.max
  end
  product
end
463
+
464
# Product over all positions of the smallest A/C/G/T entry in each column,
# i.e. the lowest value PPM#score can return for a plain ACGT word.
#
# The product starts from 1, matching best_score and score. It used to start
# from 0, which forced the result to zero for every matrix.
def worst_score
  return (0...size).inject(1) { |mul, i|
    mul * ['A', 'C', 'G', 'T'].collect { |l| @matrix[l][i] }.min
  }
end
469
+
470
# Appends this matrix to element b as a "PPM" child carrying a "length"
# attribute and, when a word count is set, "words-count". Each position
# becomes a "pm-column" child (1-based "position") with a/c/g/t children
# holding the values as text.
def to_bismark(b)
  attributes = {"length" => @size}
  attributes["words-count"] = @words_count if @words_count
  root = b.add_element("PPM", attributes)
  (0...@matrix['A'].size).each do |i|
    column = root.add_element("pm-column", {"position" => i + 1})
    ['A', 'C', 'G', 'T'].each do |l|
      column.add_element(l.downcase).add_text(@matrix[l][i].to_s)
    end
  end
end
481
+
482
# Adds to the probs hash, in place, one entry per IUPAC ambiguity code in
# IUPAC_LS: the sum of the entries of the code's letters divided by their
# number. Returns the same hash.
def self.probs2IUPAC!(probs)
  IUPAC_LS.each do |code, members|
    total = members.inject(0) { |sum, letter| sum + probs[letter] }
    probs[code] = total / members.size
  end
  probs
end
488
+
489
# Returns a new PM whose entries are
#   log((x * words_count + p * pseudocount) / ((words_count + pseudocount) * p))
# for every entry x of this matrix, p being the letter's value in probs after
# the ambiguity-code entries were added to a copy of it. A missing or zero
# words_count falls back to @words_count; checkerr is invoked when none is
# available. The result carries the full IUPAC letter set when this matrix
# has an 'N' column. Also available as to_pwm.
def get_pwm(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1.0)
  probs = PPM.probs2IUPAC!(probs.dup)

  words_count = @words_count if !words_count || words_count == 0
  checkerr("undefined words count") { !words_count }

  pwm = @matrix['N'] ? PM.new_matrix_iupac(@size) : PM.new_matrix(@size)

  @matrix.each do |letter, column|
    (0...@size).each do |pos|
      numerator = column[pos] * words_count + (probs[letter] * pseudocount)
      denominator = (words_count + pseudocount) * probs[letter]
      pwm[letter][pos] = Math::log(numerator / denominator)
    end
  end

  PM.new(@size, pwm, words_count)
end
alias to_pwm get_pwm
511
+
512
# Returns a new PM holding log(x / probs[letter]) for every entry x of this
# matrix, i.e. a conversion without any pseudocount.
def get_pwm0pc(probs = Randoom::DEF_PROBS)
  log_odds = {}
  @matrix.each { |letter, column| log_odds[letter] = column.dup }
  # The PM is created while log_odds still holds the copied original values
  # and the columns are overwritten afterwards, exactly as before.
  newpm = PM.new(@size, log_odds, nil)

  log_odds.each do |letter, column|
    (0...@size).each do |pos|
      column[pos] = Math::log(@matrix[letter][pos] / probs[letter])
    end
  end

  newpm
end
525
+
526
# In-place conversion is not available for a PPM; this always raises and
# points the caller to to_pwm.
#
# Any arguments are accepted and ignored so that calls written against
# PM#to_pwm!(words_count, probs, pseudocount) get this explanatory error
# instead of an unrelated ArgumentError about the argument count.
#
# Raises RuntimeError always.
def to_pwm!(*_ignored)
  raise "cannot force PPM class to PWM, use to_pwm instead"
end
529
+
530
# Returns a new PM whose A/C/G/T entries are this matrix's entries multiplied
# by the word count (the argument, or @words_count when none is given), with
# iupacomp! applied to it. checkerr is invoked when no word count is
# available. Also available as to_pcm.
def get_pcm(words_count = nil)
  words_count ||= @words_count
  checkerr("undefined words count") { !words_count }
  counts = PM.new_matrix(@size)
  ['A', 'C', 'G', 'T'].each do |l|
    (0...size).each { |i| counts[l][i] = @matrix[l][i] * words_count }
  end
  PM.new(size, counts, words_count).iupacomp!
end
alias to_pcm get_pcm
543
+
544
# Builds a PPM from an IUPAC word: at each position the letters listed in
# IUPAC::REVCODE for that character share the value 1.0 equally and the other
# letters get 0.0. The result has word count 4.0 and iupacomp! applied.
def self.from_IUPAC(iupac)
  matrix = {"A" => [], "C" => [], "G" => [], "T" => []}

  (0...iupac.size).each do |i|
    matrix.each_value { |column| column << 0.0 }
    letters = IUPAC::REVCODE[iupac[i]]
    share = 1.0 / letters.size
    (0...letters.size).each { |j| matrix[letters[j]][-1] = share }
  end

  newppm = PPM.new(iupac.size, matrix, 4.0)
  newppm.iupacomp!
  newppm
end
560
+
561
+ end
562
+ end