sequence_logo 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +17 -17
- data/Gemfile +4 -4
- data/LICENSE +21 -21
- data/README.md +65 -65
- data/Rakefile +5 -5
- data/TODO.txt +7 -7
- data/bin/glue_logos +2 -2
- data/bin/sequence_logo +2 -2
- data/lib/sequence_logo/cli.rb +36 -36
- data/lib/sequence_logo/exec/glue_logos.rb +97 -66
- data/lib/sequence_logo/exec/sequence_logo.rb +51 -51
- data/lib/sequence_logo/pmflogo_lib.rb +113 -113
- data/lib/sequence_logo/version.rb +3 -3
- data/lib/sequence_logo/ytilib/addon.rb +246 -246
- data/lib/sequence_logo/ytilib/bismark.rb +70 -70
- data/lib/sequence_logo/ytilib/hack1.rb +75 -75
- data/lib/sequence_logo/ytilib/infocod.rb +108 -108
- data/lib/sequence_logo/ytilib/iupac.rb +92 -92
- data/lib/sequence_logo/ytilib/pm.rb +562 -562
- data/lib/sequence_logo/ytilib/pmsd.rb +98 -98
- data/lib/sequence_logo/ytilib/ppm_support.rb +85 -85
- data/lib/sequence_logo/ytilib/randoom.rb +131 -131
- data/lib/sequence_logo/ytilib/ytilib.rb +146 -146
- data/lib/sequence_logo/ytilib.rb +9 -9
- data/lib/sequence_logo.rb +7 -7
- data/sequence_logo.gemspec +21 -21
- data/test/data/pcm/AHR_si.pcm +10 -10
- data/test/data/pcm/AIRE_f2.pcm +19 -19
- metadata +3 -4
| @@ -1,562 +1,562 @@ | |
| 1 | 
            -
            module Ytilib
         | 
| 2 | 
            -
              class PM
         | 
| 3 | 
            -
             | 
| 4 | 
            -
                attr_reader :matrix, :size
         | 
| 5 | 
            -
                attr_accessor :words_count
         | 
| 6 | 
            -
                
         | 
| 7 | 
            -
                alias length size
         | 
| 8 | 
            -
                
         | 
| 9 | 
            -
                # Expected score under a background model: for every column the
                # matrix values of A, C, G and T are weighted by bckgr[letter] and
                # added up, and the per-column results are accumulated.
                #
                # bckgr - lookup indexed by 'A', 'C', 'G', 'T' giving the weight of
                #         each letter (defaults to Randoom::DEF_PROBS).
                #
                # Returns the accumulated value (0.0 for a zero-length matrix).
                def score_mean(bckgr = Randoom::DEF_PROBS)
                  total = 0.0
                  (0...@size).each do |pos|
                    column_mean = 0.0
                    ['A', 'C', 'G', 'T'].each do |letter|
                      column_mean += @matrix[letter][pos] * bckgr[letter]
                    end
                    total += column_mean
                  end
                  total
                end
         | 
| 12 | 
            -
                
         | 
| 13 | 
            -
                # Score variance under a background model. For each column it takes
                # the bckgr-weighted sum of squared matrix values minus the square of
                # the bckgr-weighted sum of the values, and adds the columns together.
                #
                # bckgr - lookup indexed by 'A', 'C', 'G', 'T' (defaults to
                #         Randoom::DEF_PROBS).
                #
                # Returns the accumulated value (0.0 for a zero-length matrix).
                def score_variance(bckgr = Randoom::DEF_PROBS)
                  variance = 0.0
                  (0...@size).each do |pos|
                    second_moment = 0.0
                    first_moment = 0.0
                    ['A', 'C', 'G', 'T'].each do |letter|
                      value = @matrix[letter][pos]
                      second_moment += value**2 * bckgr[letter]
                      first_moment += value * bckgr[letter]
                    end
                    variance += second_moment - first_moment**2
                  end
                  variance
                end
         | 
| 19 | 
            -
                
         | 
| 20 | 
            -
                # Upper-tail probability of reaching +threshold+, computed as
                # (1 - erf2(z / sqrt(2))) / 2 with z = (threshold - mean) / sqrt(variance).
                #
                # threshold - score to evaluate.
                # mean      - optional precomputed mean; score_mean is used when omitted.
                # variance  - optional precomputed variance; score_variance is used
                #             when omitted.
                #
                # Math.erf2 is defined outside this file section.
                # NOTE(review): presumably an error-function implementation - confirm.
                def p_value(threshold, mean = nil, variance = nil)
                  mean ||= score_mean
                  variance ||= score_variance
                  z = (threshold - mean) / Math.sqrt(variance)
                  (1 - Math.erf2(z / Math.sqrt(2))) / 2.0
                end
         | 
| 26 | 
            -
                
         | 
| 27 | 
            -
                # Builds the word made of the highest-valued letter of every column.
                # A column whose maximum is shared by several letters contributes "N".
                #
                # Returns a String with one character per column.
                def best_word
                  letters = ['A', 'C', 'G', 'T']
                  word = ""
                  (0...size).each do |pos|
                    top = letters.map { |letter| @matrix[letter][pos] }.max
                    winners = letters.select { |letter| @matrix[letter][pos] == top }
                    word << (winners.size == 1 ? winners.first : "N")
                  end
                  word
                end
         | 
| 34 | 
            -
                
         | 
| 35 | 
            -
                # Consensus that keeps only the top-valued letters of each column.
                # The letters tied for the column maximum are concatenated in
                # A, C, G, T order and translated through IUPAC::CODE.
                #
                # Returns an IUPAC built from the translated codes, one per column.
                def strict_consensus
                  letters = ['A', 'C', 'G', 'T']
                  word = ""
                  (0...size).each do |pos|
                    top = letters.map { |letter| @matrix[letter][pos] }.max
                    tied = letters.select { |letter| @matrix[letter][pos] == top }.join
                    word += IUPAC::CODE[tied]
                  end
                  IUPAC.new(word)
                end
         | 
| 42 | 
            -
                
         | 
| 43 | 
            -
                # Builds a consensus string, one IUPAC code per column.
                #
                # The distinct column values are ranked from highest to lowest. The
                # column's infocod value decides how many ranks are kept: above
                # icd2of4 only the best, above icdThc the best two, above icdTlc the
                # best three, otherwise all. Every letter whose value is among the
                # kept ranks takes part in the IUPAC::CODE lookup.
                #
                # beautiful - when true, codes standing for more than two letters are
                #             written in lower case.
                #
                # checkerr is called first with a block that is true when
                # @words_count is unset.
                #
                # Returns a new String.
                def consensus_string(beautiful = false)
                  checkerr("words count is undefined") { !@words_count }
                  two_of_four, high_cutoff, low_cutoff = icd2of4, icdThc, icdTlc
                  information = infocod
                  letters = ['A', 'C', 'G', 'T']

                  result = ""
                  (0...size).each do |pos|
                    ranked = letters.map { |letter| @matrix[letter][pos] }.uniq.sort.reverse

                    keep = if information[pos] > two_of_four
                             1
                           elsif information[pos] > high_cutoff
                             2
                           elsif information[pos] > low_cutoff
                             3
                           end
                    ranked = ranked.first(keep) if keep

                    chosen = letters.select { |letter| ranked.include?(@matrix[letter][pos]) }.join

                    symbol = IUPAC::CODE[chosen]
                    symbol = symbol.downcase if beautiful && chosen.size > 2

                    result += symbol
                  end
                  String.new(result)
                end
         | 
| 68 | 
            -
                
         | 
| 69 | 
            -
                # Consensus of the matrix as an IUPAC object.
                #
                # This used to be a line-for-line copy of consensus_string (without
                # the lower-casing option); it now delegates to it so the
                # column-selection rules live in one place. The words-count check is
                # performed inside consensus_string.
                #
                # Returns an IUPAC wrapping the upper-case consensus string.
                def consensus
                  IUPAC.new(consensus_string)
                end
         | 
| 91 | 
            -
                
         | 
| 92 | 
            -
                # Scans +s+ from left to right and reports the first window of @size
                # letters whose score reaches +score_g+.
                #
                # s           - sequence to scan.
                # score_g     - minimal accepted score.
                # use2strands - when true the better of the window score and the score
                #               of its revcomp! copy is compared; otherwise only the
                #               window itself.
                #
                # Returns the 0-based start index, or nil when nothing matches
                # (including when +s+ is shorter than the matrix).
                def find_hit(s, score_g, use2strands = true)
                  last_start = s.size - @size
                  0.upto(last_start) do |start|
                    forward = s[start, @size]
                    reverse = s[start, @size].revcomp!
                    forward_score = score(forward)
                    reverse_score = score(reverse)
                    best = use2strands ? [forward_score, reverse_score].max : forward_score
                    return start if best >= score_g
                  end
                  nil
                end
         | 
| 101 | 
            -
                
         | 
| 102 | 
            -
                # Collects the start index of every window of @size letters in +s+
                # whose score reaches +score_g+.
                #
                # s           - sequence to scan.
                # score_g     - minimal accepted score.
                # use2strands - when true the better of the window score and the score
                #               of its revcomp! copy is compared; otherwise only the
                #               window itself.
                #
                # Returns an Array of 0-based start indices in ascending order.
                def find_hits(s, score_g, use2strands = true)
                  hits = []
                  (0..(s.size - @size)).each do |start|
                    forward = s[start, @size]
                    reverse = s[start, @size].revcomp!
                    forward_score = score(forward)
                    reverse_score = score(reverse)
                    best = use2strands ? [forward_score, reverse_score].max : forward_score
                    hits << start if best >= score_g
                  end
                  hits
                end
         | 
| 110 | 
            -
                
         | 
| 111 | 
            -
                # Collects detailed hit records for every window of @size letters in
                # +s+ whose score reaches +score_g+. Windows are upper-cased for
                # scoring only; the stored sequence keeps its original case.
                #
                # s           - sequence to scan.
                # score_g     - minimal accepted score.
                # use2strands - when true (default) the revcomp! copy of each window
                #               is scored and reported as well. The flag used to be
                #               ignored, so reverse-strand hits were reported even
                #               when the caller asked for a single strand.
                #
                # Returns an Array of [score, sequence, reverse_strand?, start_index]
                # records; for one window the forward record precedes the reverse one.
                def collect_hits(s, score_g, use2strands = true)
                  result = []
                  (0..(s.size - @size)).each do |i|
                    seq = s[i, @size]
                    score_p = score(seq.upcase)
                    result << [score_p, seq, false, i] if score_p >= score_g

                    next unless use2strands

                    seq_rc = s[i, @size].revcomp!
                    score_rc = score(seq_rc.upcase)
                    result << [score_rc, seq_rc, true, i] if score_rc >= score_g
                  end
                  result
                end
         | 
| 121 | 
            -
                
         | 
| 122 | 
            -
                # Highest score found among all windows of @size letters in +s+.
                #
                # s           - sequence to scan; checkerr is invoked with a block
                #               that is true when it is shorter than the matrix.
                # use2strands - when true the revcomp! copy of every window competes
                #               as well.
                #
                # Returns the best score; the running maximum starts at -Float::MAX.
                def best_hit(s, use2strands = true)
                  checkerr("too short sequence") { s.size < @size }
                  best = -Float::MAX
                  (0..(s.size - @size)).each do |start|
                    forward = s[start, @size]
                    reverse = s[start, @size].revcomp!
                    forward_score = score(forward)
                    reverse_score = score(reverse)
                    candidates = use2strands ? [best, forward_score, reverse_score] : [best, forward_score]
                    best = candidates.max
                  end
                  best
                end
         | 
| 131 | 
            -
                
         | 
| 132 | 
            -
                # Strict equality of the A, C, G and T rows with those of +pm+,
                # compared row by row with Array#eql?.
                #
                # pm - object exposing #matrix.
                #
                # Returns true when all four rows are eql?, false otherwise.
                def eql?(pm)
                  ['A', 'C', 'G', 'T'].all? { |letter| @matrix[letter].eql?(pm.matrix[letter]) }
                end
         | 
| 137 | 
            -
                
         | 
| 138 | 
            -
                # Tolerant equality: every A, C, G, T cell must differ from the matching
                # cell of +pm+ by less than 10**-11.
                #
                # The method starts with a checkerr call whose block is always true.
                # NOTE(review): this looks like a deliberate "do not use" guard -
                # confirm what checkerr does before relying on the comparison below.
                #
                # pm - object exposing #matrix.
                def flexeql?(pm)
                  checkerr("for what?") { true }
                  ['A', 'C', 'G', 'T'].all? do |letter|
                    (0...@size).all? do |position|
                      (@matrix[letter][position] - pm.matrix[letter][position]).abs < 10**-11
                    end
                  end
                end
         | 
| 148 | 
            -
                
         | 
| 149 | 
            -
                # Creates a matrix of +size+ columns.
                #
                # size        - number of columns.
                # matrix      - optional Hash of rows keyed by 'A', 'C', 'G', 'T'; when
                #               omitted a blank matrix from PM.new_matrix is used.
                # words_count - optional positive word count. When missing or not
                #               positive it is inferred from the sum of the first
                #               column, rounded, and kept only if it is at least 2.
                #
                # checkerr is invoked when a supplied matrix has an 'A' row whose
                # length differs from +size+.
                #
                # Fixes: the size-mismatch message used to interpolate
                # matrix['A'].size before checking for nil, so PM.new(size) without a
                # matrix raised NoMethodError; and the words-count inference summed
                # the nil cells of a blank (or zero-width) matrix. Both cases now
                # leave @words_count unset instead of crashing.
                def initialize(size, matrix = nil, words_count = nil)
                  unless matrix.nil?
                    actual = matrix['A'].size
                    checkerr("matrix['A'].size != size, #{actual} != #{size}") { size != actual }
                  end

                  @size = size
                  @matrix = matrix.nil? ? PM.new_matrix(size) : matrix

                  if !words_count || words_count <= 0
                    if ['A', 'C', 'G', 'T'].any? { |letter| @matrix[letter][0].nil? }
                      # No data in the first column: nothing to infer from.
                      @words_count = nil
                    else
                      inferred = col_sum(0).round
                      @words_count = inferred >= 2 ? inferred : nil
                    end
                  else
                    @words_count = words_count
                  end
                end
         | 
| 160 | 
            -
                
         | 
| 161 | 
            -
                # Sum of one column over a set of letters.
                #
                # index  - 0-based column (default 0).
                # letset - letters (row keys) to include; defaults to A, C, G, T.
                #
                # Returns the total, starting from Integer 0.
                def col_sum(index = 0, letset = ['A','C','G','T'])
                  total = 0
                  letset.each { |letter| total += @matrix[letter][index] }
                  total
                end
         | 
| 164 | 
            -
                
         | 
| 165 | 
            -
                # Class-level column sum for a raw matrix Hash.
                #
                # matrix - Hash of rows keyed by 'A', 'C', 'G', 'T'.
                # index  - 0-based column (default 0).
                #
                # Returns A + C + G + T of that column, added in this order.
                def self.col_sum(matrix, index = 0)
                  ['A', 'C', 'G', 'T'].map { |letter| matrix[letter][index] }.inject { |sum, value| sum + value }
                end
         | 
| 168 | 
            -
                  
         | 
| 169 | 
            -
                # Converts the matrix in place: every cell becomes
                #   log((cell + probs[letter] * pseudocount) /
                #       ((words_count + pseudocount) * probs[letter]))
                #
                # words_count - optional positive word count; replaces @words_count
                #               when given.
                # probs       - lookup from row key to background weight (defaults to
                #               Randoom::DEF_PROBS).
                # pseudocount - pseudocount added to the counts (default 1).
                #
                # An undefined words count is now reported through checkerr, like in
                # get_ppm, instead of surfacing as NoMethodError on nil inside the loop.
                #
                # Returns self.
                def to_pwm!(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1)
                  @words_count = words_count if words_count && words_count > 0
                  checkerr("undefined words count") { !@words_count }

                  total = @words_count + pseudocount
                  @matrix.each_key do |letter|
                    background = probs[letter]
                    (0...@size).each do |pos|
                      smoothed = @matrix[letter][pos] + (background * pseudocount)
                      @matrix[letter][pos] = Math.log(smoothed / (total * background))
                    end
                  end

                  self
                end
         | 
| 188 | 
            -
                
         | 
| 189 | 
            -
                # Non-destructive variant of to_pwm!: converts a copy made with dup and
                # leaves the receiver untouched.
                #
                # Parameters are passed to to_pwm! unchanged.
                #
                # Returns whatever to_pwm! returns for the copy.
                def get_pwm(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1)
                  copy = dup
                  copy.to_pwm!(words_count, probs, pseudocount)
                end
         | 
| 192 | 
            -
                alias to_pwm get_pwm
         | 
| 193 | 
            -
                
         | 
| 194 | 
            -
                # Builds a PPM by dividing every cell (as Float) by the word count.
                #
                # words_count - divisor; @words_count is used when omitted. checkerr is
                #               invoked with a block that is true when the divisor is
                #               missing or not positive.
                #
                # The target matrix comes from PM.new_matrix_iupac when the receiver
                # has an 'N' row, otherwise from PM.new_matrix.
                #
                # Returns PPM.new(@size, frequencies, words_count).
                def get_ppm(words_count = nil)
                  words_count ||= @words_count
                  checkerr("undefined words count") { !words_count || words_count <= 0 }
                  frequencies = @matrix['N'] ? PM.new_matrix_iupac(@size) : PM.new_matrix(@size)
                  @matrix.each do |letter, counts|
                    @size.times do |pos|
                      frequencies[letter][pos] = counts[pos].to_f / words_count
                    end
                  end
                  PPM.new(@size, frequencies, words_count)
                end
         | 
| 205 | 
            -
                alias to_ppm get_ppm
         | 
| 206 | 
            -
                
         | 
| 207 | 
            -
                # Scores +word+ by adding, for every position, the matrix cell of the
                # letter found there.
                #
                # word - String to score. Two checkerr calls precede the summation:
                #        one with a block that is true when the word length differs
                #        from @size, one with a block that is true when the word holds
                #        characters outside the alphabet (ACGT, or the IUPAC letters
                #        as well when the matrix has an 'N' row).
                #
                # Returns the sum, starting from Integer 0.
                def score(word)
                  checkerr("word size != pwm.size") { @size != word.size }
                  checkerr("word #{word} has strange characters") do
                    alphabet = @matrix.key?('N') ? 'ACGTRYKMSWBDHVN' : 'ACGT'
                    word.tr(alphabet, '').size > 0
                  end
                  total = 0
                  (0...@size).each { |pos| total += @matrix[word[pos, 1]][pos] }
                  total
                end
         | 
| 216 | 
            -
                
         | 
| 217 | 
            -
                # Highest reachable score: the sum of the per-column maxima over
                # A, C, G and T.
                #
                # Returns the sum, starting from Integer 0.
                def best_score
                  total = 0
                  (0...size).each do |pos|
                    total += ['A', 'C', 'G', 'T'].map { |letter| @matrix[letter][pos] }.max
                  end
                  total
                end
         | 
| 222 | 
            -
                
         | 
| 223 | 
            -
                # Lowest reachable score: the sum of the per-column minima over
                # A, C, G and T.
                #
                # Returns the sum, starting from Integer 0.
                def worst_score
                  total = 0
                  (0...size).each do |pos|
                    total += ['A', 'C', 'G', 'T'].map { |letter| @matrix[letter][pos] }.min
                  end
                  total
                end
         | 
| 228 | 
            -
                
         | 
| 229 | 
            -
                # Copies the matrix: every row Array is duplicated, so changing the
                # copy's rows does not touch the receiver's.
                #
                # Returns PM.new(@size, copied_rows, @words_count).
                # NOTE(review): the copy is always built with PM.new, also when the
                # receiver is an instance of a subclass - confirm this is intended.
                def dup
                  copied = {}
                  @matrix.each { |letter, row| copied[letter] = row.dup }
                  PM.new(@size, copied, @words_count)
                end
         | 
| 234 | 
            -
                
         | 
| 235 | 
            -
                # Builds a count matrix from aligned words.
                #
                # words    - non-empty list of Strings; the first word defines the
                #            matrix width. Letters are upper-cased before counting.
                #            A, C, G, T add 1 to their row; N adds 0.25 to each of
                #            the four rows. Any other letter is reported via checkerr.
                # iupacomp - when true, iupacomp! is applied to the result.
                #
                # Robustness: an empty word list and words shorter than the first one
                # are now reported through checkerr instead of failing with
                # NoMethodError on nil. Extra letters of longer words are still
                # ignored, as before.
                #
                # Returns PM.new(width, counts, words.size).
                def self.new_pcm(words, iupacomp = false)
                  checkerr("empty words list") { words.empty? }
                  size = words[0].size
                  counts = PM.new_matrix(size)
                  counts.each_value { |row| row.fill(0) }

                  words.each do |word|
                    checkerr("word #{word} is shorter than #{size} letters") { word.size < size }
                    size.times do |i|
                      letter = word[i, 1].upcase
                      checkerr("unknown letter #{letter}") { !['A', 'C', 'G', 'T', 'N'].include?(letter) }
                      if letter != 'N'
                        counts[letter][i] += 1
                      else
                        # An unknown base is spread evenly over the four letters.
                        ['A', 'C', 'G', 'T'].each { |l| counts[l][i] += 0.25 }
                      end
                    end
                  end

                  newpcm = PM.new(size, counts, words.size)
                  newpcm.iupacomp! if iupacomp
                  newpcm
                end
         | 
| 254 | 
            -
                
         | 
| 255 | 
            -
                # Builds a count matrix from +words+ with PM.new_pcm and converts it in
                # place with to_pwm! (default arguments).
                #
                # Returns the matrix object created by PM.new_pcm.
                def self.new_pwm(words)
                  PM.new_pcm(words).tap { |pcm| pcm.to_pwm! }
                end
         | 
| 260 | 
            -
                
         | 
| 261 | 
            -
                # Loads a matrix from a whitespace-separated text file. Both layouts
                # are supported: one position per line (four numbers per line) and one
                # letter per line (four lines, in A, C, G, T order).
                #
                # filename - path of the file to read.
                #
                # Lines that are blank or contain anything Float() rejects (for
                # example a name header) are skipped. Exactly four numeric lines are
                # treated as the letter-per-line layout and transposed.
                # NOTE(review): a 4-position motif stored one position per line also
                # has four lines and is therefore transposed too.
                #
                # Fixes: blank lines used to be stored as empty rows, which corrupted
                # the size and the transpose; the file is read with File.read because
                # IO.read can run a command when the name starts with "|"; only
                # ArgumentError (what Float() raises for bad text) is swallowed.
                #
                # Returns a PPM when every column sum rounds to 1, otherwise a PM.
                def self.load(filename)
                  rows = []
                  File.read(filename).each_line do |line|
                    fields = line.split
                    next if fields.empty?

                    begin
                      rows << fields.collect { |field| Float(field) }
                    rescue ArgumentError
                      next
                    end
                  end

                  rows = rows.transpose if rows.size == 4
                  matrix = PM.new_matrix(rows.size)
                  rows.each_with_index do |row, i|
                    ['A', 'C', 'G', 'T'].each_with_index { |letter, j| matrix[letter][i] = row[j] }
                  end

                  ppm_mode = (0...rows.size).all? { |i| PM.col_sum(matrix, i).round == 1 }

                  ppm_mode ? PPM.new(rows.size, matrix) : PM.new(rows.size, matrix)
                end
         | 
| 282 | 
            -
                
         | 
| 283 | 
            -
                # Writes the matrix to +filename+; the format is chosen from
                # File.ext_wo_name(filename):
                #   "pwm" - one line per letter (A, C, G, T), values separated by spaces.
                #   "pat" - File.name_wo_ext(filename) on the first line, then one line
                #           per position with the A, C, G, T values.
                #   "xml" and anything else are reported through checkerr.
                #
                # Every value is followed by a space and every line ends with $/.
                #
                # Fix: the format is validated before the file is opened, so an
                # unsupported extension no longer creates or truncates the target.
                def save(filename)
                  format = File.ext_wo_name(filename)
                  case format
                  when "pwm", "pat"
                    # supported, written below
                  when "xml"
                    return checkerr("small-BiSMark is not supported at this moment")
                  else
                    return checkerr("unknown motif file format specified")
                  end

                  File.open(filename, "w") do |out_f|
                    if format == "pwm"
                      ['A', 'C', 'G', 'T'].each do |letter|
                        @matrix[letter].each { |e| out_f << "#{e} " }
                        out_f << $/
                      end
                    else
                      out_f.puts File.name_wo_ext(filename)
                      (0...@size).each do |i|
                        ['A', 'C', 'G', 'T'].each { |letter| out_f << "#{@matrix[letter][i]} " }
                        out_f << $/
                      end
                    end
                  end
                end
         | 
| 308 | 
            -
                
         | 
| 309 | 
            -
                # Shifts every cell up, in place, by the absolute value of the smallest
                # cell of the whole matrix. With a negative minimum this leaves no
                # negative cells; with a positive minimum the cells are still raised by
                # that amount.
                #
                # Returns self.
                def positiv!
                  offset = @matrix.values.map { |row| row.min }.min.abs
                  @matrix.each_value { |row| row.map! { |value| value + offset } }
                  self
                end
         | 
| 314 | 
            -
                
         | 
| 315 | 
            -
                # Turns the matrix into its reverse complement in place: rows of
                # complementary letters are swapped and every row is reversed.
                #
                # Fix: only A/T and C/G used to be swapped, so after iupacomp! the
                # IUPAC rows were reversed but stayed under the wrong code (the old
                # 'R' = A/G row describes T/C, i.e. 'Y', on the other strand). The
                # non-palindromic IUPAC pairs R/Y, K/M, B/V and D/H are now swapped
                # too when both rows are present; S, W and N are their own complements.
                #
                # Returns self.
                def revcomp!
                  @matrix['A'], @matrix['T'] = @matrix['T'], @matrix['A']
                  @matrix['C'], @matrix['G'] = @matrix['G'], @matrix['C']

                  [['R', 'Y'], ['K', 'M'], ['B', 'V'], ['D', 'H']].each do |code, complement|
                    next unless @matrix.key?(code) && @matrix.key?(complement)

                    @matrix[code], @matrix[complement] = @matrix[complement], @matrix[code]
                  end

                  @matrix.each_value { |row| row.reverse! }
                  self
                end
         | 
| 321 | 
            -
                
         | 
| 322 | 
            -
                # Appends this matrix to +b+ as a child element.
                #
                # b - parent element; must respond to add_element(name, attributes).
                #
                # The child is named "PWM" when the first cell of the A row is a
                # Float and "PCM" otherwise. It carries a "length" attribute and,
                # when @words_count is positive, a "words-count" attribute. Each
                # position becomes a "pm-column" element with a 1-based "position"
                # attribute and one child per letter (a, c, g, t) holding the cell
                # value as text.
                def to_bismark(b)
                  tag = @matrix['A'][0].is_a?(Float) ? "PWM" : "PCM"
                  attributes = {"length" => @size}
                  attributes["words-count"] = @words_count if @words_count && @words_count > 0
                  root = b.add_element(tag, attributes)
                  (0...@matrix['A'].size).each do |i|
                    column = root.add_element("pm-column", {"position" => i + 1})
                    ['A', 'C', 'G', 'T'].each do |letter|
                      column.add_element(letter.downcase).add_text(@matrix[letter][i].to_s)
                    end
                  end
                end
         | 
| 334 | 
            -
                
         | 
| 335 | 
            -
                # Builds a matrix from a small-BiSMark element.
                #
                # b        - element exposing name, attributes and elements; its
                #            "pm-column" children carry a 1-based "position" attribute
                #            and one child per letter (a, c, g, t).
                # iupacomp - when true, iupacomp! is applied to PPM and PCM results;
                #            for a PWM it raises.
                #
                # Cell text is parsed with to_f for PPM, PWM and WPCM elements and with
                # to_i otherwise. A "words-count" attribute, when present, is passed on
                # as a Float.
                #
                # Fix: the PCM branch used to run `@words_count = col_sum(matrix)`.
                # Inside a class method that sets an instance variable on the class
                # itself, which nothing reads, and it failed on a PCM without columns.
                # It was removed; the value passed to PM.new is unchanged.
                #
                # Returns a PPM for "PPM" elements, otherwise a PM.
                # Raises RuntimeError for a "PWM" element when +iupacomp+ is true.
                def self.from_bismark(b, iupacomp = false)
                  checkerr("empty small-BiSMark file?") { !b }
                  float_m = (b.name == "PPM" || b.name == "PWM" || b.name == "WPCM")
                  words_count = b.attributes["words-count"] ? b.attributes["words-count"].to_f : nil

                  matrix = {"A" => [], "C" => [], "G" => [], "T" => []}
                  b.elements.each("pm-column") do |pmc|
                    position = pmc.attributes["position"].to_i
                    ['A', 'C', 'G', 'T'].each do |l|
                      text = pmc.elements[l.downcase].get_text.to_s
                      matrix[l][position - 1] = float_m ? text.to_f : text.to_i
                    end
                  end

                  if b.name == "PPM"
                    newppm = PPM.new(matrix['A'].size, matrix, words_count)
                    newppm.iupacomp! if iupacomp
                    return newppm
                  end

                  if b.name == "PCM"
                    newpcm = PM.new(matrix['A'].size, matrix, words_count)
                    newpcm.iupacomp! if iupacomp
                    return newpcm
                  end

                  raise "cannot force IUPAC compatible PWM" if b.name == "PWM" && iupacomp

                  PM.new(matrix['A'].size, matrix, words_count)
                end
         | 
| 362 | 
            -
                
         | 
| 363 | 
            -
                # One [IUPAC::CODE[key], key split into single characters] pair for
                # every key of IUPAC::CODE other than 'A', 'C', 'G', 'T'.
                # NOTE(review): presumably keys are letter sets such as "AG" and the
                # values are one-letter codes - confirm against the IUPAC class.
                IUPAC_LS = (IUPAC::CODE.keys - ['A','C','G','T']).collect { |iul| [IUPAC::CODE[iul], iul.split(//)] }
         | 
| 364 | 
            -
                # Adds one row per IUPAC_LS entry to the matrix, in place. Each cell of
                # such a row is the mean of the column over the letters the code
                # stands for.
                #
                # When @words_count is unset it is first set to the largest column sum
                # (for unbalanced matrices).
                #
                # Fix: the mean is computed in floating point. With integer count
                # matrices the former integer division truncated it (3 / 2 gave 1).
                # Float matrices yield the same values as before.
                #
                # Returns self.
                def iupacomp!
                  @words_count = (0...@size).collect { |i| col_sum(i) }.max unless @words_count

                  IUPAC_LS.each do |code, letters|
                    @matrix[code] = (0...@size).collect { |i| col_sum(i, letters).to_f / letters.size }
                  end

                  self
                end
         | 
| 374 | 
            -
                
         | 
| 375 | 
            -
                # Mean score plus three standard deviations under a background model.
                #
                # bckgr - lookup indexed by 'A', 'C', 'G', 'T' (defaults to
                #         Randoom::DEF_PROBS).
                #
                # The mean and variance used to be re-implemented here, duplicating
                # score_mean and score_variance expression for expression; the method
                # now delegates to them.
                #
                # Returns score_mean(bckgr) + 3 * sqrt(score_variance(bckgr)).
                def m3sd(bckgr = Randoom::DEF_PROBS)
                  sigma = Math.sqrt(score_variance(bckgr))
                  score_mean(bckgr) + 3 * sigma
                end
         | 
| 386 | 
            -
                
         | 
| 387 | 
            -
                # Recomputes @words_count as the largest column sum over all positions.
                # Leaves everything untouched (and returns nil) when @words_count is unset.
                def fixwc
                  if @words_count
                    @words_count = (0...@size).map { |col| col_sum(col) }.max
                  end
                end
         | 
| 391 | 
            -
                
         | 
| 392 | 
            -
                protected
         | 
| 393 | 
            -
                # Builds an empty matrix: a Hash holding one Array of +size+ nil cells
                # for each of the keys 'A', 'C', 'G' and 'T'.
                def PM.new_matrix(size)
                  fresh = {}
                  ['A', 'C', 'G', 'T'].each { |letter| fresh[letter] = Array.new(size) }
                  fresh
                end
         | 
| 400 | 
            -
                
         | 
| 401 | 
            -
                # Builds an empty matrix with one Array of +size+ nil cells for each of
                # the fifteen keys A, C, G, T, R, Y, K, M, S, W, B, D, H, V and N.
                def PM.new_matrix_iupac(size)
                  fresh = {}
                  %w[A C G T R Y K M S W B D H V N].each { |letter| fresh[letter] = Array.new(size) }
                  fresh
                end
         | 
| 420 | 
            -
                
         | 
| 421 | 
            -
              end
         | 
| 422 | 
            -
             | 
| 423 | 
            -
              # PM subclass whose word scores are products of matrix cells.
              # NOTE(review): presumably a positional probability matrix; the name is
              # not expanded anywhere in this file - confirm.
              class PPM < PM

                #DEPRECATED, use iupacomp! instead
                #def make_N_comp!
                #  @matrix['N'] = (0...size).collect { 0.25 }
                #  return self
                #end

                # size        - number of columns.
                # matrix      - Hash of letter => Array of cells; an empty matrix from
                #               PM.new_matrix is used when nil.
                # words_count - stored as given (may be nil).
                # checkerr is triggered when a matrix is given whose 'A' row length
                # differs from +size+.
                def initialize(size, matrix = nil, words_count = nil)
                  checkerr("matrix['A'].size != size") { matrix != nil && size != matrix['A'].size }
                  @size = size
                  @matrix = matrix == nil ? PM.new_matrix(size) : matrix
                  @words_count = words_count
                end

                # Adds one row per IUPAC_LS code to @matrix; each cell is the sum of the
                # member letters' cells in that column divided by the member count.
                # @words_count defaults to 4.0 when unset. Returns self.
                def iupacomp!
                  @words_count = 4.0 unless @words_count

                  IUPAC_LS.each { |iul_ls|
                    @matrix[iul_ls[0]] = (0...@size).collect { |i| col_sum(i, iul_ls[1]) / iul_ls[1].size }
                  }

                  return self
                end

                # Returns the product over all positions of the cell selected by the
                # word's letter at that position.
                # checkerr is triggered when the word length differs from @size or the
                # word holds letters outside the accepted alphabet (the 15-letter
                # alphabet when the matrix has an 'N' row, 'ACGT' otherwise).
                def score(word)
                  checkerr("word size != ppm.size") { @size != word.size }
                  checkerr("word #{word} has strange characters") {
                    @matrix.keys.include?('N') ? word.tr('ACGTRYKMSWBDHVN', '').size > 0 : word.tr('ACGT', '').size > 0
                  }
                  return (0...@size).inject(1) { |mul, i|
                    mul * @matrix[word[i,1]][i]
                  }
                end

                # Product over all positions of the largest 'A'/'C'/'G'/'T' cell.
                def best_score
                  return (0...size).inject(1) { |mul, i|
                    mul * ['A', 'C', 'G', 'T'].collect { |l| @matrix[l][i] }.max
                  }
                end

                # Product over all positions of the smallest 'A'/'C'/'G'/'T' cell.
                def worst_score
                  # The fold starts from 1, the multiplicative identity; a seed of 0
                  # would force the product to 0 for every matrix.
                  return (0...size).inject(1) { |mul, i|
                    mul * ['A', 'C', 'G', 'T'].collect { |l| @matrix[l][i] }.min
                  }
                end

                # Appends a "PPM" element to +b+ carrying "length" (and "words-count"
                # when set), with one "pm-column" child per position (1-based
                # "position" attribute) holding a, c, g, t children with the cell text.
                def to_bismark(b)
                  attributes = {"length" => @size}
                  attributes["words-count"] = @words_count if @words_count
                  pe = b.add_element("PPM", attributes)
                  (0...@matrix['A'].size).each { |i|
                    pm_c = pe.add_element("pm-column", {"position" => i+1})
                    ['A', 'C', 'G', 'T'].each { |l|
                      pm_c.add_element(l.downcase).add_text(@matrix[l][i].to_s)
                    }
                  }
                end

                # Extends +probs+ in place with one entry per IUPAC_LS code: the mean of
                # the member letters' entries. Returns +probs+.
                def PPM.probs2IUPAC!(probs)
                  IUPAC_LS.each { |iul_ls|
                    probs[iul_ls[0]] = iul_ls[1].inject(0) { |sum, l| sum + probs[l] } / iul_ls[1].size
                  }
                  return probs
                end

                # Returns a new PM whose cells are
                #   log((cell * words_count + probs[letter] * pseudocount) /
                #       ((words_count + pseudocount) * probs[letter])).
                # words_count falls back to @words_count when nil or 0; checkerr is
                # triggered when no count is available. +probs+ is copied and extended
                # with IUPAC entries before use, so the caller's object is not modified.
                def get_pwm(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1.0)

                  probs = PPM.probs2IUPAC!(probs.dup)

                  words_count = @words_count if !words_count || words_count == 0
                  checkerr("undefined words count") { !words_count }

                  pwm = @matrix['N'] ? PM.new_matrix_iupac(@size) : PM.new_matrix(@size)

                  @matrix.each_key do |letter|
                    (0...@size).each { |pos|

                      pwm[letter][pos] = Math::log( (@matrix[letter][pos] * words_count + (probs[letter] * pseudocount) ) / ( (words_count + pseudocount) * probs[letter]) )

                    }
                  end
                  return PM.new(@size, pwm, words_count)
                  #pcm = get_pcm(words_count)
                  #pcm.iupacomp! if @matrix['N']
                  #return pcm.to_pwm!(words_count, probs, pseudocount)
                end
                alias to_pwm get_pwm

                # Returns a new PM whose cells are log(cell / probs[letter]), i.e. the
                # conversion without any pseudocount.
                def get_pwm0pc(probs = Randoom::DEF_PROBS)
                  new_matrix = {}
                  @matrix.each_key { |letter| new_matrix[letter] = @matrix[letter].dup }
                  newpm = PM.new(@size, new_matrix, nil)

                  # newpm keeps a reference to new_matrix, so the cells written below
                  # are the ones it exposes.
                  new_matrix.each_key do |letter|
                    (0...@size).each { |pos|
                      new_matrix[letter][pos] = Math::log(@matrix[letter][pos] / probs[letter])
                    }
                  end

                  return newpm
                end

                # In-place conversion is refused for this class; always raises.
                def to_pwm!
                  raise "cannot force PPM class to PWM, use to_pwm instead"
                end

                # Returns a new PM whose 'A'/'C'/'G'/'T' cells are this matrix's cells
                # multiplied by words_count (falling back to @words_count), after
                # iupacomp! has been applied to it. checkerr is triggered when no
                # count is available.
                def get_pcm(words_count = nil)
                  words_count = @words_count unless words_count
                  checkerr("undefined words count") { !words_count }
                  counts = PM.new_matrix(@size)
                  (0...size).each { |i|
                    ['A', 'C', 'G', 'T'].each { |l|
                      counts[l][i] = @matrix[l][i] * words_count
                    }
                  }
                  newpcm = PM.new(size, counts, words_count).iupacomp!
                  return newpcm
                end
                alias to_pcm get_pcm

                # Builds a PPM from +iupac+: at every position each letter listed in
                # IUPAC::REVCODE for that symbol gets 1.0 / (number of listed letters),
                # all other letters get 0.0. The result has words_count 4.0 and has
                # iupacomp! applied.
                def PPM.from_IUPAC(iupac)
                  matrix = {"A" => [], "C" => [], "G" => [], "T" => []}

                  (0...iupac.size).each { |i|
                    matrix.each_key { |k| matrix[k] << 0.0 }
                    letters = IUPAC::REVCODE[iupac[i]]
                    (0...letters.size).each { |j|
                      matrix[letters[j]][-1] = 1.0/letters.size
                    }
                  }

                  newppm = PPM.new(iupac.size, matrix, 4.0)
                  newppm.iupacomp!

                  newppm
                end

              end
         | 
| 562 | 
            -
            end
         | 
| 1 | 
            +
            module Ytilib
         | 
| 2 | 
            +
              class PM
         | 
| 3 | 
            +
             | 
| 4 | 
            +
                attr_reader :matrix, :size
         | 
| 5 | 
            +
                attr_accessor :words_count
         | 
| 6 | 
            +
                
         | 
| 7 | 
            +
                alias length size
         | 
| 8 | 
            +
                
         | 
| 9 | 
            +
                # Returns the expected word score under the background letter
                # distribution +bckgr+ (letter => probability): the sum over all
                # positions of the bckgr-weighted 'A'/'C'/'G'/'T' cells.
                def score_mean(bckgr = Randoom::DEF_PROBS)
                  total = 0.0
                  (0...@size).each do |pos|
                    total += ['A','C','G','T'].inject(0.0) { |acc, l| acc + @matrix[l][pos] * bckgr[l] }
                  end
                  total
                end
         | 
| 12 | 
            +
                
         | 
| 13 | 
            +
                # Returns the variance of the word score under the background letter
                # distribution +bckgr+: the sum over positions of
                # (weighted mean of squared cells) - (weighted mean of cells)**2.
                def score_variance(bckgr = Randoom::DEF_PROBS)
                  letters = ['A','C','G','T']
                  total = 0.0
                  (0...@size).each do |pos|
                    second_moment = letters.inject(0.0) { |acc, l| acc + @matrix[l][pos]**2 * bckgr[l] }
                    first_moment = letters.inject(0.0) { |acc, l| acc + @matrix[l][pos] * bckgr[l] }
                    total += second_moment - first_moment**2
                  end
                  total
                end
         | 
| 19 | 
            +
                
         | 
| 20 | 
            +
                # Returns (1 - Math.erf2(z / sqrt(2))) / 2 for
                # z = (threshold - mean) / sqrt(variance).
                # mean and variance default to score_mean and score_variance.
                # NOTE(review): Math.erf2 is not part of Ruby's standard Math module and
                # is presumably defined elsewhere in the project; if it is the error
                # function, the result is the upper-tail probability of a normal
                # approximation - confirm.
                def p_value(threshold, mean = nil, variance = nil)
                  mean ||= score_mean
                  variance ||= score_variance
                  z = (threshold - mean) / Math.sqrt(variance)
                  (1 - Math.erf2(z / Math.sqrt(2))) / 2.0
                end
         | 
| 26 | 
            +
                
         | 
| 27 | 
            +
                # Returns a String with one character per position: the letter among
                # 'A', 'C', 'G', 'T' holding the unique column maximum, or "N" when the
                # maximum is not held by exactly one letter.
                def best_word
                  letters = ['A', 'C', 'G', 'T']
                  word = ""
                  size.times do |pos|
                    top = letters.map { |l| @matrix[l][pos] }.max
                    winners = letters.select { |l| @matrix[l][pos] == top }
                    word << (winners.size == 1 ? winners.first : "N")
                  end
                  word
                end
         | 
| 34 | 
            +
                
         | 
| 35 | 
            +
                # Returns an IUPAC object built from one code per position: the
                # IUPAC::CODE entry for the string of all letters (in A, C, G, T order)
                # whose cell equals the column maximum.
                def strict_consensus
                  letters = ['A', 'C', 'G', 'T']
                  word = ""
                  size.times do |pos|
                    top = letters.map { |l| @matrix[l][pos] }.max
                    tied = letters.select { |l| @matrix[l][pos] == top }.join
                    word += IUPAC::CODE[tied]
                  end
                  IUPAC.new(word)
                end
         | 
| 42 | 
            +
                
         | 
| 43 | 
            +
                # Returns a plain String consensus, one IUPAC code per position.
                # The distinct cell values of a column are ranked from high to low; the
                # column's infocod value decides how many ranks are kept: one above
                # icd2of4, two above icdThc, three above icdTlc, all otherwise. The
                # letters whose cell is among the kept values are mapped through
                # IUPAC::CODE. With +beautiful+, codes standing for more than two
                # letters are lower-cased.
                # checkerr is triggered when @words_count is unset.
                def consensus_string(beautiful = false)
                  checkerr("words count is undefined") { !@words_count }
                  i2o4, thc, tlc = icd2of4, icdThc, icdTlc
                  icd = infocod

                  letters = ['A', 'C', 'G', 'T']
                  word = ""
                  size.times do |pos|
                    ranked = letters.map { |l| @matrix[l][pos] }.uniq.sort.reverse

                    if icd[pos] > i2o4
                      ranked = [ranked.first]
                    elsif icd[pos] > thc
                      ranked = ranked[0..1]
                    elsif icd[pos] > tlc
                      ranked = ranked[0..2]
                    end

                    chosen = letters.select { |l| ranked.include?(@matrix[l][pos]) }.join

                    code = IUPAC::CODE[chosen]
                    code = code.downcase if beautiful && chosen.size > 2

                    word += code
                  end
                  String.new(word)
                end
         | 
| 68 | 
            +
                
         | 
| 69 | 
            +
                # Returns an IUPAC object holding one code per position.
                # For each column the distinct cell values are ranked from high to low
                # and truncated according to the column's infocod value (top one above
                # icd2of4, top two above icdThc, top three above icdTlc, untouched
                # otherwise); the letters whose cell is among the remaining values are
                # looked up in IUPAC::CODE.
                # checkerr is triggered when @words_count is unset.
                def consensus
                  checkerr("words count is undefined") { !@words_count }
                  i2o4, thc, tlc = icd2of4, icdThc, icdTlc
                  icd = infocod

                  letters = ['A', 'C', 'G', 'T']
                  codes = ""
                  (0...size).each do |col|
                    values = letters.map { |l| @matrix[l][col] }.uniq.sort.reverse

                    if icd[col] > i2o4
                      values = [values.first]
                    elsif icd[col] > thc
                      values = values[0..1]
                    elsif icd[col] > tlc
                      values = values[0..2]
                    end

                    kept = letters.select { |l| values.include?(@matrix[l][col]) }.join
                    codes += IUPAC::CODE[kept]
                  end
                  IUPAC.new(codes)
                end
         | 
| 91 | 
            +
                
         | 
| 92 | 
            +
                # Returns the first start index in +s+ whose @size-long window scores at
                # least +score_g+, or nil when there is none. With +use2strands+ the
                # better of the window's score and its revcomp! score is compared.
                def find_hit(s, score_g, use2strands = true)
                  (0..(s.size - @size)).find do |start|
                    window = s[start, @size]
                    window_rc = s[start, @size].revcomp!
                    forward = score(window)
                    reverse = score(window_rc)
                    best = use2strands ? [forward, reverse].max : forward
                    best >= score_g
                  end
                end
         | 
| 101 | 
            +
                
         | 
| 102 | 
            +
                # Returns every start index in +s+ whose @size-long window scores at
                # least +score_g+. With +use2strands+ the better of the window's score
                # and its revcomp! score is compared.
                def find_hits(s, score_g, use2strands = true)
                  (0..(s.size - @size)).select do |start|
                    window = s[start, @size]
                    window_rc = s[start, @size].revcomp!
                    forward = score(window)
                    reverse = score(window_rc)
                    best = use2strands ? [forward, reverse].max : forward
                    best >= score_g
                  end
                end
         | 
| 110 | 
            +
                
         | 
| 111 | 
            +
                # Collects every window of +s+ (length @size) whose score reaches
                # +score_g+.
                #
                # s           - sequence to scan; windows are upcased for scoring only,
                #               the stored sequence keeps its original case.
                # score_g     - minimal score for a hit.
                # use2strands - when true (default) the revcomp! of each window is
                #               scored and reported as well; when false only the given
                #               strand is considered.
                #
                # Returns an Array of [score, sequence, is_revcomp, start_index] in scan
                # order, the direct hit of a window preceding its reverse hit.
                def collect_hits(s, score_g, use2strands = true)
                  result = []
                  (0..(s.size - @size)).each do |i|
                    seq = s[i, @size]
                    score_p = score(seq.upcase)
                    result << [score_p, seq, false, i] if score_p >= score_g

                    # The reverse strand is examined only on request, in line with
                    # find_hit, find_hits and best_hit.
                    next unless use2strands

                    seq_rc = s[i, @size].revcomp!
                    score_rc = score(seq_rc.upcase)
                    result << [score_rc, seq_rc, true, i] if score_rc >= score_g
                  end
                  result
                end
         | 
| 121 | 
            +
                
         | 
| 122 | 
            +
                # Returns the highest score reached by any @size-long window of +s+
                # (also considering each window's revcomp! when +use2strands+ is true).
                # checkerr is triggered when +s+ is shorter than @size.
                def best_hit(s, use2strands = true)
                  checkerr("too short sequence") { s.size < @size }

                  best = -Float::MAX
                  (0..(s.size - @size)).each do |start|
                    window = s[start, @size]
                    window_rc = s[start, @size].revcomp!
                    forward = score(window)
                    reverse = score(window_rc)
                    candidates = use2strands ? [best, forward, reverse] : [best, forward]
                    best = candidates.max
                  end
                  best
                end
         | 
| 131 | 
            +
                
         | 
| 132 | 
            +
                # True when the 'A', 'C', 'G' and 'T' rows of this matrix are each eql?
                # to the corresponding row of +pm+.matrix.
                def eql?(pm)
                  ['A','C','G','T'].all? { |letter| @matrix[letter].eql?(pm.matrix[letter]) }
                end
         | 
| 137 | 
            +
                
         | 
| 138 | 
            +
                # Tolerant comparison: true when every 'A'/'C'/'G'/'T' cell differs from
                # the matching cell of +pm+.matrix by less than 10**-11.
                # NOTE(review): the leading checkerr is called with a block that is
                # always true, which presumably disables this method - confirm against
                # checkerr's definition.
                def flexeql?(pm)
                  checkerr("for what?") { true }
                  ['A','C','G','T'].all? do |letter|
                    (0...@size).all? do |position|
                      (@matrix[letter][position] - pm.matrix[letter][position]).abs < 10**-11
                    end
                  end
                end
         | 
| 148 | 
            +
                
         | 
| 149 | 
            +
                # size        - number of columns.
                # matrix      - Hash of letter => Array of cells; when nil an empty
                #               matrix from PM.new_matrix(size) is used.
                # words_count - used as given when positive. Otherwise it is inferred
                #               from the first column of a supplied matrix: the rounded
                #               column sum when that is at least 2, nil if smaller.
                #               Without a supplied matrix, or with zero columns, it
                #               stays nil.
                #
                # checkerr is triggered when a matrix is given whose 'A' row length
                # differs from +size+.
                def initialize(size, matrix = nil, words_count = nil)
                  # The message interpolates matrix['A'].size, so it may only be built
                  # once a matrix is known to be present.
                  unless matrix.nil?
                    row_size = matrix['A'].size
                    checkerr("matrix['A'].size != size, #{row_size} != #{size}") { size != row_size }
                  end

                  @size = size
                  @matrix = matrix.nil? ? PM.new_matrix(size) : matrix

                  if !words_count || words_count <= 0
                    # A freshly allocated matrix holds only nil cells and a zero-width
                    # matrix has no first column; neither can be summed.
                    if matrix.nil? || size <= 0
                      @words_count = nil
                    else
                      inferred = col_sum(0).round
                      @words_count = inferred >= 2 ? inferred : nil
                    end
                  else
                    @words_count = words_count
                  end
                end
         | 
| 160 | 
            +
                
         | 
| 161 | 
            +
                # Returns the sum of the cells in column +index+ over the letters listed
                # in +letset+ (all four of 'A', 'C', 'G', 'T' by default); 0 for an
                # empty +letset+.
                def col_sum(index = 0, letset = ['A','C','G','T'])
                  total = 0
                  letset.each { |letter| total += @matrix[letter][index] }
                  total
                end
         | 
| 164 | 
            +
                
         | 
| 165 | 
            +
                # Returns the sum of the 'A', 'C', 'G' and 'T' cells of column +index+
                # in the given +matrix+ Hash.
                def PM.col_sum(matrix, index = 0)
                  cells = ['A', 'C', 'G', 'T'].map { |letter| matrix[letter][index] }
                  cells.inject { |sum, cell| sum + cell }
                end
         | 
| 168 | 
            +
                  
         | 
| 169 | 
            +
                # Replaces every cell, in place, by
                #   log((cell + probs[letter] * pseudocount) /
                #       ((@words_count + pseudocount) * probs[letter])).
                # A positive +words_count+ argument overwrites @words_count first.
                # Returns self.
                def to_pwm!(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1)
                  @words_count = words_count if words_count && words_count > 0

                  @matrix.each do |letter, row|
                    (0...@size).each do |pos|
                      row[pos] = Math::log( (row[pos] + (probs[letter] * pseudocount)) / ( (@words_count + pseudocount) * probs[letter]) )
                    end
                  end

                  self
                end
         | 
| 188 | 
            +
                
         | 
| 189 | 
            +
                # Non-destructive counterpart of to_pwm!: converts a copy obtained from
                # dup and returns it, leaving this matrix untouched.
                def get_pwm(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1)
                  copy = dup
                  copy.to_pwm!(words_count, probs, pseudocount)
                end
                alias to_pwm get_pwm
         | 
| 193 | 
            +
                
         | 
| 194 | 
            +
                # Returns a new PPM whose cells are this matrix's cells (as floats)
                # divided by words_count, which falls back to @words_count.
                # The result has the 15-row layout when this matrix has an 'N' row and
                # the 4-row layout otherwise.
                # checkerr is triggered when the count is missing or not positive.
                def get_ppm(words_count = nil)
                  words_count ||= @words_count
                  checkerr("undefined words count") { !words_count || words_count <= 0 }

                  ppm = @matrix['N'] ? PM.new_matrix_iupac(@size) : PM.new_matrix(@size)
                  @matrix.each do |letter, row|
                    (0...@size).each { |i| ppm[letter][i] = row[i].to_f / words_count }
                  end

                  PPM.new(@size, ppm, words_count)
                end
                alias to_ppm get_ppm
         | 
| 206 | 
            +
                
         | 
| 207 | 
            +
                # Returns the sum over all positions of the cell selected by the word's
                # letter at that position.
                # checkerr is triggered when the word length differs from @size or the
                # word holds letters outside the accepted alphabet (the 15-letter
                # alphabet when the matrix has an 'N' row, 'ACGT' otherwise).
                def score(word)
                  checkerr("word size != pwm.size") { @size != word.size }
                  checkerr("word #{word} has strange characters") {
                    if @matrix.key?('N')
                      word.tr('ACGTRYKMSWBDHVN', '').size > 0
                    else
                      word.tr('ACGT', '').size > 0
                    end
                  }

                  total = 0
                  (0...@size).each { |i| total += @matrix[word[i,1]][i] }
                  total
                end
         | 
| 216 | 
            +
                
         | 
| 217 | 
            +
                # Returns the sum over all positions of the largest 'A'/'C'/'G'/'T' cell.
                def best_score
                  total = 0
                  size.times do |pos|
                    total += ['A', 'C', 'G', 'T'].map { |l| @matrix[l][pos] }.max
                  end
                  total
                end
         | 
| 222 | 
            +
                
         | 
| 223 | 
            +
                # Returns the sum over all positions of the smallest 'A'/'C'/'G'/'T' cell.
                def worst_score
                  total = 0
                  size.times do |pos|
                    total += @matrix.values_at('A', 'C', 'G', 'T').map { |row| row[pos] }.min
                  end
                  total
                end
         | 
| 228 | 
            +
                
         | 
| 229 | 
            +
                # Returns a new PM with the same size and words count whose rows are
                # copies of this matrix's rows, so changing one matrix's rows does not
                # affect the other. The copy is always built with PM.new.
                def dup
                  copy = {}
                  @matrix.each { |letter, row| copy[letter] = row.dup }
                  PM.new(@size, copy, @words_count)
                end
         | 
| 234 | 
            +
                
         | 
| 235 | 
            +
                # Builds a count matrix from a list of aligned words.
                #
                # words    - Array of Strings; the length of the first word fixes the
                #            matrix size. Letters are upcased before counting.
                # iupacomp - when true, iupacomp! is called on the result.
                #
                # A, C, G and T add 1 to their own row at the position; N adds 0.25 to
                # each of the four rows. Any other letter is reported through checkerr.
                # The words count of the result is words.size.
                def PM.new_pcm(words, iupacomp = false)
                  size = words[0].size
                  counts = PM.new_matrix(size)
                  counts.each_value { |row| row.fill(0) }
                  words.each do |word|
                    size.times do |pos|
                      letter = word[pos, 1].upcase
                      checkerr("unknown letter #{letter}") { !['A', 'C', 'G', 'T', 'N'].include?(letter) }
                      if letter == 'N'
                        ['A', 'C', 'G', 'T'].each { |base| counts[base][pos] += 0.25 }
                      else
                        counts[letter][pos] += 1
                      end
                    end
                  end
                  pcm = PM.new(size, counts, words.size)
                  pcm.iupacomp! if iupacomp
                  pcm
                end
         | 
| 254 | 
            +
                
         | 
| 255 | 
            +
                # Builds a count matrix from +words+ with PM.new_pcm, converts it in
                # place with to_pwm! and returns that same object.
                def PM.new_pwm(words)
                  PM.new_pcm(words).tap { |pm| pm.to_pwm! }
                end
         | 
| 260 | 
            +
                
         | 
| 261 | 
            +
                # Loads a matrix from a whitespace-separated text file.
                #
                # Supports the pat & pwm formats (letter-column and letter-row format).
                # Every line whose fields all parse as numbers becomes one row; lines
                # with any non-numeric field (e.g. a name header) and blank lines are
                # ignored. If exactly four rows were read they are taken to be the
                # A, C, G, T rows and the table is transposed, so a letter-column file
                # with exactly four positions is transposed as well.
                #
                # filename - path of the file to read.
                #
                # Returns a PPM when every column sum rounds to 1, otherwise a PM.
                def PM.load(filename)
                  rows = []
                  # File.read instead of IO.read: IO.read treats a leading "|" in the
                  # path as a command to run.
                  File.read(filename).each_line do |line|
                    fields = line.split
                    # An empty field list converts without error, so blank lines
                    # must be skipped explicitly or they become empty rows.
                    next if fields.empty?
                    begin
                      rows << fields.collect { |field| Float(field) }
                    rescue ArgumentError
                      next # not a numeric line (header, comment, ...)
                    end
                  end
                  rows = rows.transpose if rows.size == 4
                  matrix = PM.new_matrix(rows.size)
                  rows.each_with_index do |row, i|
                    ['A', 'C', 'G', 'T'].each_with_index { |l, j| matrix[l][i] = row[j] }
                  end

                  # Every column is checked (no short-circuit), as before.
                  ppm_mode = (0...rows.size).inject(true) { |all_one, i| all_one & (col_sum(matrix, i).round == 1) }

                  ppm_mode ? PPM.new(rows.size, matrix) : PM.new(rows.size, matrix)
                end
         | 
| 282 | 
            +
                
         | 
| 283 | 
            +
                # Writes the A/C/G/T rows of the matrix to +filename+. The format is
                # chosen from the value File.ext_wo_name returns for the file name
                # (presumably its extension -- the helper is defined elsewhere):
                #
                #   "pwm" - one line per letter (A, C, G, T), entries separated by spaces
                #   "pat" - a first line with File.name_wo_ext(filename), then one line
                #           per position holding the A, C, G, T entries
                #   "xml" - not supported, reported through checkerr
                #   other - reported through checkerr
                #
                # The file is opened for writing before the format is examined.
                def save(filename)
                  File.open(filename, "w") do |out_f|
                    format = File.ext_wo_name(filename)
                    if format == "pwm"
                      ['A', 'C', 'G', 'T'].each do |letter|
                        out_f << @matrix[letter].map { |e| "#{e} " }.join << $/
                      end
                    elsif format == "pat"
                      out_f.puts File.name_wo_ext(filename)
                      (0...@size).each do |i|
                        out_f << ['A', 'C', 'G', 'T'].map { |letter| "#{@matrix[letter][i]} " }.join << $/
                      end
                    elsif format == "xml"
                      checkerr("small-BiSMark is not supported at this moment")
                    else
                      checkerr("unknown motif file format specified")
                    end
                  end
                end
         | 
| 308 | 
            +
                
         | 
| 309 | 
            +
                # Adds the absolute value of the smallest matrix entry to every entry,
                # in place, and returns self. With a negative minimum the smallest
                # entry becomes zero; a matrix whose entries are all positive is
                # shifted upwards by its minimum as well.
                def positiv!
                  offset = @matrix.values.map { |row| row.min }.min.abs
                  @matrix.each_value { |row| row.map! { |value| value + offset } }
                  self
                end
         | 
| 314 | 
            +
                
         | 
| 315 | 
            +
                # Turns the matrix into its reverse complement, in place, and returns
                # self.
                #
                # The A/T and C/G rows are exchanged and every row is reversed. When
                # the matrix also carries IUPAC rows, each ambiguity code is exchanged
                # with the code of its complementary letter set (R<->Y, K<->M, B<->V,
                # D<->H); S, W and N are their own complements and are only reversed.
                # Without that exchange an IUPAC row would no longer match the
                # A/C/G/T rows it was derived from.
                def revcomp!
                  @matrix['A'], @matrix['T'] = @matrix['T'], @matrix['A']
                  @matrix['C'], @matrix['G'] = @matrix['G'], @matrix['C']
                  [%w[R Y], %w[K M], %w[B V], %w[D H]].each do |code, partner|
                    # IUPAC rows are optional; swap only pairs that are fully present.
                    next unless @matrix.key?(code) && @matrix.key?(partner)
                    @matrix[code], @matrix[partner] = @matrix[partner], @matrix[code]
                  end
                  @matrix.each_value { |row| row.reverse! }
                  self
                end
         | 
| 321 | 
            +
                
         | 
| 322 | 
            +
                # Appends this matrix to the small-BiSMark element +b+.
                #
                # The new child element is named "PWM" when the first A entry is a
                # Float and "PCM" otherwise. It carries a "length" attribute and, when
                # the words count is set and positive, a "words-count" attribute. One
                # "pm-column" child is added per position (numbered from 1), holding
                # the a/c/g/t values as text.
                #
                # b - parent element; must respond to add_element.
                def to_bismark(b)
                  tag = @matrix['A'][0].is_a?(Float) ? "PWM" : "PCM"
                  attributes = {"length" => @size}
                  attributes["words-count"] = @words_count if @words_count && @words_count > 0
                  root = b.add_element(tag, attributes)
                  (0...@matrix['A'].size).each do |i|
                    column = root.add_element("pm-column", {"position" => i+1})
                    ['A', 'C', 'G', 'T'].each do |l|
                      column.add_element(l.downcase).add_text(@matrix[l][i].to_s)
                    end
                  end
                end
         | 
| 334 | 
            +
                
         | 
| 335 | 
            +
                # Builds a matrix from a small-BiSMark element.
                #
                # b        - element named "PPM", "PCM", "PWM" or "WPCM" holding
                #            "pm-column" children; each column has a 1-based
                #            "position" attribute and a/c/g/t child elements.
                # iupacomp - when true, iupacomp! is called on a PPM or PCM result;
                #            for a PWM this raises.
                #
                # Values are read as Floats for PPM/PWM/WPCM and as Integers otherwise.
                # The words count comes from the "words-count" attribute. For a PCM
                # without that attribute it falls back to the largest column sum, the
                # same rule iupacomp! and fixwc use. (Previously the fallback was
                # assigned to an instance variable of the class itself and never
                # reached the returned object.)
                #
                # Returns a PPM for "PPM" elements and a PM otherwise.
                def PM.from_bismark(b, iupacomp = false)
                  checkerr("empty small-BiSMark file?") { !b }
                  float_m = (b.name == "PPM" || b.name == "PWM" || b.name == "WPCM")
                  words_count = b.attributes["words-count"] ? b.attributes["words-count"].to_f : nil

                  matrix = {"A" => [], "C" => [], "G" => [], "T" => []}
                  b.elements.each("pm-column") do |pmc|
                    position = pmc.attributes["position"].to_i
                    ['A', 'C', 'G', 'T'].each do |l|
                      text = pmc.elements[l.downcase].get_text.to_s
                      matrix[l][position-1] = float_m ? text.to_f : text.to_i
                    end
                  end
                  size = matrix['A'].size

                  if b.name == "PPM"
                    newppm = PPM.new(size, matrix, words_count)
                    newppm.iupacomp! if iupacomp
                    return newppm
                  end
                  if b.name == "PCM"
                    # Derive the words count from the counts only when the file
                    # does not state it.
                    words_count ||= (0...size).collect { |i| col_sum(matrix, i) }.max
                    newpcm = PM.new(size, matrix, words_count)
                    newpcm.iupacomp! if iupacomp
                    return newpcm
                  end
                  if b.name == "PWM" && iupacomp
                    raise "cannot force IUPAC compatible PWM"
                  end
                  PM.new(size, matrix, words_count)
                end
         | 
| 362 | 
            +
                
         | 
| 363 | 
            +
                # One [code, letters] pair for every key of IUPAC::CODE other than the
                # plain A, C, G, T: +code+ is the value stored under that key and
                # +letters+ is the key split into single characters. Used below to add
                # a row per code, averaged over its letters.
                IUPAC_LS = (IUPAC::CODE.keys - %w[A C G T]).map { |letters| [IUPAC::CODE[letters], letters.split(//)] }
         | 
| 364 | 
            +
                # Adds one row per IUPAC ambiguity code to the matrix, in place, and
                # returns self. Each new entry is the column sum over the letters the
                # code stands for, divided by the number of those letters.
                #
                # If the words count is not set it is first taken to be the largest
                # column sum (for unbalanced matrices -- Genomatix has some).
                #
                # NOTE(review): the division uses whatever numeric type col_sum
                # returns; with Integer counts it truncates -- confirm this is intended.
                def iupacomp!
                  @words_count ||= (0...@size).map { |i| col_sum(i) }.max
                  # @words_count = @words_count.round < 2.0 ? nil : @words_count.round

                  IUPAC_LS.each do |code, letters|
                    @matrix[code] = (0...@size).map { |i| col_sum(i, letters) / letters.size }
                  end

                  self
                end
         | 
| 374 | 
            +
                
         | 
| 375 | 
            +
                # Returns mean + 3 * sigma of the matrix score when each position's
                # letter is drawn independently from the background distribution.
                #
                # bckgr - Hash mapping 'A', 'C', 'G', 'T' to their probabilities.
                #
                # For every position the first and second moments of the entry are
                # computed under +bckgr+. The means and the per-position variances
                # (second moment minus squared mean) are summed over positions.
                def m3sd(bckgr = Randoom::DEF_PROBS)
                  letters = ['A', 'C', 'G', 'T']
                  mean = 0.0
                  variance = 0.0
                  (0...@size).each do |i|
                    first = letters.inject(0.0) { |sum, l| sum + @matrix[l][i] * bckgr[l] }
                    second = letters.inject(0.0) { |sum, l| sum + @matrix[l][i]**2 * bckgr[l] }
                    mean += first
                    variance += second - first**2
                  end
                  # A variance cannot be negative; rounding can push a (near-)constant
                  # matrix slightly below zero, which would make Math.sqrt raise.
                  variance = 0.0 if variance < 0

                  mean + 3 * Math.sqrt(variance)
                end
         | 
| 386 | 
            +
                
         | 
| 387 | 
            +
                # Re-derives the words count as the largest column sum, but only when
                # a words count is already set; otherwise does nothing and returns nil.
                def fixwc
                  @words_count = (0...@size).map { |i| col_sum(i) }.max if @words_count
                end
         | 
| 391 | 
            +
                
         | 
| 392 | 
            +
                protected
         | 
| 393 | 
            +
                # Returns a fresh Hash with the keys 'A', 'C', 'G', 'T' (in that
                # order), each holding its own Array of +size+ nil entries.
                #
                # Defined with an explicit receiver, so a bare `protected` before it
                # does not restrict it; it is called as PM.new_matrix elsewhere.
                def PM.new_matrix(size)
                  matrix = {}
                  %w[A C G T].each { |letter| matrix[letter] = Array.new(size) }
                  matrix
                end
         | 
| 400 | 
            +
                
         | 
| 401 | 
            +
                # Returns a fresh Hash with one key per IUPAC nucleotide letter -- A, C,
                # G, T followed by R, Y, K, M, S, W, B, D, H, V, N -- each holding its
                # own Array of +size+ nil entries.
                def PM.new_matrix_iupac(size)
                  matrix = {}
                  %w[A C G T R Y K M S W B D H V N].each { |letter| matrix[letter] = Array.new(size) }
                  matrix
                end
         | 
| 420 | 
            +
                
         | 
| 421 | 
            +
              end
         | 
| 422 | 
            +
             | 
| 423 | 
            +
              # Positional probability matrix. Entries are treated as probabilities:
              # a word's score is the product of its per-position entries, and the
              # matrix can be converted to a log-odds weight matrix (get_pwm,
              # get_pwm0pc) or to a count matrix (get_pcm).
              class PPM < PM

                #DEPRECATED, use iupacomp! instead
                #def make_N_comp!
                #  @matrix['N'] = (0...size).collect { 0.25 }
                #  return self
                #end

                # size        - number of positions.
                # matrix      - Hash of letter => Array of entries; when nil an empty
                #               A/C/G/T matrix of +size+ positions is created.
                # words_count - number of words the probabilities were derived from,
                #               or nil when unknown.
                #
                # A mismatch between +size+ and the A row length is reported through
                # checkerr.
                def initialize(size, matrix = nil, words_count = nil)
                  checkerr("matrix['A'].size != size") { !matrix.nil? && size != matrix['A'].size }
                  @size = size
                  @matrix = matrix.nil? ? PM.new_matrix(size) : matrix
                  @words_count = words_count
                end

                # Adds one row per IUPAC ambiguity code, each entry being the column
                # sum over the code's letters divided by the number of those letters.
                # The words count defaults to 4.0 when it is not set. Returns self.
                def iupacomp!
                  @words_count ||= 4.0

                  IUPAC_LS.each do |code, letters|
                    @matrix[code] = (0...@size).collect { |i| col_sum(i, letters) / letters.size }
                  end

                  self
                end

                # Probability of +word+: the product, over all positions, of the entry
                # for the letter found at that position. The word must have the matrix
                # size and use only A/C/G/T (or any IUPAC letter when the matrix has
                # an 'N' row); violations are reported through checkerr.
                def score(word)
                  checkerr("word size != ppm.size") { @size != word.size }
                  checkerr("word #{word} has strange characters") {
                    alphabet = @matrix.key?('N') ? 'ACGTRYKMSWBDHVN' : 'ACGT'
                    !word.tr(alphabet, '').empty?
                  }
                  (0...@size).inject(1) { |product, i| product * @matrix[word[i, 1]][i] }
                end

                # Largest probability any A/C/G/T word can have: the product of the
                # per-column maxima.
                def best_score
                  (0...size).inject(1) do |product, i|
                    product * ['A', 'C', 'G', 'T'].collect { |l| @matrix[l][i] }.max
                  end
                end

                # Smallest probability any A/C/G/T word can have: the product of the
                # per-column minima. The product starts from 1, as in best_score;
                # starting from 0 made the result 0 for every matrix.
                def worst_score
                  (0...size).inject(1) do |product, i|
                    product * ['A', 'C', 'G', 'T'].collect { |l| @matrix[l][i] }.min
                  end
                end

                # Appends this matrix to the small-BiSMark element +b+ as a "PPM"
                # element with a "length" attribute, a "words-count" attribute when
                # the words count is set, and one "pm-column" child per position
                # (numbered from 1) holding the a/c/g/t values as text.
                def to_bismark(b)
                  attributes = {"length" => @size}
                  attributes["words-count"] = @words_count if @words_count
                  pe = b.add_element("PPM", attributes)
                  (0...@matrix['A'].size).each do |i|
                    pm_c = pe.add_element("pm-column", {"position" => i+1})
                    ['A', 'C', 'G', 'T'].each do |l|
                      pm_c.add_element(l.downcase).add_text(@matrix[l][i].to_s)
                    end
                  end
                end

                # Extends +probs+ in place with one entry per IUPAC ambiguity code,
                # set to the average of the probabilities of the code's letters.
                # Returns +probs+.
                def PPM.probs2IUPAC!(probs)
                  IUPAC_LS.each do |code, letters|
                    probs[code] = letters.inject(0) { |sum, l| sum + probs[l] } / letters.size
                  end
                  probs
                end

                # Returns a new PM of log-odds weights computed with a pseudocount:
                #
                #   log((p * words_count + bg * pseudocount) / ((words_count + pseudocount) * bg))
                #
                # where p is the matrix entry and bg the background probability of
                # the letter.
                #
                # words_count - number of words; nil or 0 means "use the stored words
                #               count". A missing count is reported through checkerr.
                # probs       - background probabilities; a copy is extended with
                #               IUPAC codes, the argument itself is not modified.
                # pseudocount - weight of the background term.
                def get_pwm(words_count = nil, probs = Randoom::DEF_PROBS, pseudocount = 1.0)
                  probs = PPM.probs2IUPAC!(probs.dup)

                  words_count = @words_count if !words_count || words_count == 0
                  checkerr("undefined words count") { !words_count }

                  pwm = @matrix['N'] ? PM.new_matrix_iupac(@size) : PM.new_matrix(@size)

                  @matrix.each_key do |letter|
                    (0...@size).each do |pos|
                      numerator = @matrix[letter][pos] * words_count + (probs[letter] * pseudocount)
                      denominator = (words_count + pseudocount) * probs[letter]
                      pwm[letter][pos] = Math::log(numerator / denominator)
                    end
                  end
                  PM.new(@size, pwm, words_count)
                end
                alias to_pwm get_pwm

                # Returns a new PM of log-odds weights without any pseudocount: each
                # entry is log(p / bg). The result has no words count.
                #
                # probs - background probabilities, used as given.
                def get_pwm0pc(probs = Randoom::DEF_PROBS)
                  new_matrix = {}
                  @matrix.each_key { |letter| new_matrix[letter] = @matrix[letter].dup }
                  # The PM is created first and the rows it was given are then
                  # overwritten, exactly as before.
                  newpm = PM.new(@size, new_matrix, nil)

                  new_matrix.each_key do |letter|
                    (0...@size).each do |pos|
                      new_matrix[letter][pos] = Math::log(@matrix[letter][pos] / probs[letter])
                    end
                  end

                  newpm
                end

                # In-place conversion is not available for a PPM; always raises.
                def to_pwm!
                  raise "cannot force PPM class to PWM, use to_pwm instead"
                end

                # Returns a new count matrix whose A/C/G/T entries are the
                # probabilities multiplied by the words count, with IUPAC rows added
                # through iupacomp!.
                #
                # words_count - number of words; nil means "use the stored words
                #               count". A missing count is reported through checkerr.
                def get_pcm(words_count = nil)
                  words_count = @words_count unless words_count
                  checkerr("undefined words count") { !words_count }
                  counts = PM.new_matrix(@size)
                  (0...size).each do |i|
                    ['A', 'C', 'G', 'T'].each do |l|
                      counts[l][i] = @matrix[l][i] * words_count
                    end
                  end
                  PM.new(size, counts, words_count).iupacomp!
                end
                alias to_pcm get_pcm

                # Builds a PPM from an IUPAC consensus string: at every position the
                # letters listed in IUPAC::REVCODE for the symbol share the
                # probability equally and all other letters get 0.0. The result has a
                # words count of 4.0 and IUPAC rows added through iupacomp!.
                def PPM.from_IUPAC(iupac)
                  matrix = {"A" => [], "C" => [], "G" => [], "T" => []}

                  (0...iupac.size).each do |i|
                    matrix.each_value { |row| row << 0.0 }
                    letters = IUPAC::REVCODE[iupac[i]]
                    (0...letters.size).each do |j|
                      matrix[letters[j]][-1] = 1.0/letters.size
                    end
                  end

                  newppm = PPM.new(iupac.size, matrix, 4.0)
                  newppm.iupacomp!

                  newppm
                end

              end
         | 
| 562 | 
            +
            end
         |