bioinform 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/bioinform/data_models/pcm.rb +12 -0
- data/lib/bioinform/data_models/pm.rb +38 -54
- data/lib/bioinform/data_models/pwm.rb +7 -7
- data/lib/bioinform/data_models.rb +2 -0
- data/lib/bioinform/parsers/parser.rb +40 -0
- data/lib/bioinform/{data_models/parsers → parsers}/string_fantom_parser.rb +1 -2
- data/lib/bioinform/{data_models/parsers → parsers}/string_parser.rb +7 -7
- data/lib/bioinform/parsers.rb +3 -0
- data/lib/bioinform/support/partial_sums.rb +2 -0
- data/lib/bioinform/version.rb +1 -1
- data/lib/bioinform.rb +0 -1
- data/spec/data_models/pcm_spec.rb +27 -0
- data/spec/data_models/pm_spec.rb +62 -144
- data/spec/data_models/pwm_spec.rb +3 -7
- data/spec/parsers/parser_spec.rb +58 -0
- data/spec/parsers/string_fantom_parser_spec.rb +28 -0
- data/spec/parsers/string_parser_spec.rb +46 -0
- data/spec/spec_helper.rb +13 -21
- data/spec/support/multiline_squish_spec.rb +12 -4
- data/spec/support/partial_sums_spec.rb +3 -0
- metadata +14 -18
- data/lib/bioinform/data_models/parser.rb +0 -38
- data/lib/bioinform/data_models/parsers/array_parser.rb +0 -17
- data/lib/bioinform/data_models/parsers/hash_parser.rb +0 -19
- data/lib/bioinform/data_models/parsers.rb +0 -6
- data/spec/data_models/parser_spec.rb +0 -46
- data/spec/data_models/parsers/array_parser_spec.rb +0 -53
- data/spec/data_models/parsers/hash_parser_spec.rb +0 -60
- data/spec/data_models/parsers/string_fantom_parser_spec.rb +0 -38
- data/spec/data_models/parsers/string_parser_spec.rb +0 -156
@@ -2,6 +2,18 @@ require 'bioinform/support'
|
|
2
2
|
require 'bioinform/data_models/pm'
|
3
3
|
module Bioinform
|
4
4
|
class PCM < PM
|
5
|
+
def count
|
6
|
+
matrix.first.inject(&:+)
|
7
|
+
end
|
5
8
|
|
9
|
+
def to_pwm(pseudocount = Math.log(count))
|
10
|
+
mat = each_position.map do |pos|
|
11
|
+
pos.each_index.map do |ind|
|
12
|
+
Math.log((pos[ind] + probability[ind] * pseudocount) / (probability[ind]*(count + pseudocount)) )
|
13
|
+
end
|
14
|
+
end
|
15
|
+
PWM.new(mat)
|
16
|
+
end
|
17
|
+
|
6
18
|
end
|
7
19
|
end
|
@@ -1,87 +1,75 @@
|
|
1
1
|
require 'bioinform/support'
|
2
|
+
require 'bioinform/parsers'
|
2
3
|
|
3
4
|
module Bioinform
|
4
5
|
IndexByLetter = {'A' => 0, 'C' => 1, 'G' => 2, 'T' => 3}
|
5
6
|
LetterByIndex = {0 => 'A', 1 => 'C', 2 => 'G', 3 => 'T'}
|
6
7
|
|
7
8
|
class PM
|
8
|
-
attr_reader :matrix
|
9
|
-
attr_accessor :name
|
9
|
+
attr_reader :matrix
|
10
|
+
attr_accessor :background, :name
|
10
11
|
|
11
|
-
def
|
12
|
+
def choose_parser(input)
|
13
|
+
input.is_a?(String) ? StringParser : Parser
|
14
|
+
[Parser, StringParser, StringFantomParser].find do |parser|
|
15
|
+
self.class.new(input, parser) rescue nil
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def initialize(input, parser = nil)
|
20
|
+
parser ||= choose_parser(input)
|
21
|
+
result = parser.new(input).parse
|
22
|
+
@matrix = result[:matrix]
|
23
|
+
@name = result[:name]
|
12
24
|
@background = [1, 1, 1, 1]
|
13
|
-
|
14
|
-
@parser = parser
|
15
|
-
return unless @input
|
16
|
-
parser_init
|
17
|
-
matrix_init
|
25
|
+
raise 'matrix not valid' unless valid?
|
18
26
|
end
|
19
27
|
|
20
28
|
def ==(other)
|
21
29
|
@matrix == other.matrix && @background == other.background
|
22
30
|
end
|
23
31
|
|
24
|
-
def parser_init
|
25
|
-
if @parser
|
26
|
-
raise ArgumentError, 'Input cannot be parsed by specified parser' unless @parser.new(@input).can_parse?
|
27
|
-
else
|
28
|
-
@parser = Parser.subclasses.find{|parser_class| parser_class.new(@input).can_parse? }
|
29
|
-
raise ArgumentError, 'No one parser can parse specified input' unless @parser
|
30
|
-
end
|
31
|
-
end
|
32
|
-
|
33
|
-
def matrix_init
|
34
|
-
parse_result = @parser.new(@input).parse
|
35
|
-
raise ArgumentError, 'Used parser result has no `matrix` key' unless parse_result.has_key? :matrix
|
36
|
-
|
37
|
-
configure_from_hash(parse_result)
|
38
|
-
end
|
39
|
-
|
40
32
|
def valid?
|
41
|
-
@matrix.is_a?(Array) &&
|
33
|
+
@matrix.is_a?(Array) &&
|
42
34
|
@matrix.all?(&:is_a?.(Array)) &&
|
43
|
-
@matrix.all?
|
44
|
-
@matrix.all?
|
35
|
+
@matrix.all?{|pos| pos.size == 4} &&
|
36
|
+
@matrix.all?(&:all?.(&:is_a?.(Numeric)))
|
45
37
|
rescue
|
46
38
|
false
|
47
39
|
end
|
48
40
|
|
49
|
-
def
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
raise ArgumentError, 'Matrix has invalid format:' unless valid?
|
56
|
-
rescue
|
57
|
-
@matrix = old_matrix
|
58
|
-
raise
|
41
|
+
def each_position
|
42
|
+
if block_given?
|
43
|
+
matrix.each{|pos| yield pos}
|
44
|
+
else
|
45
|
+
Enumerator.new(self, :each_position)
|
46
|
+
end
|
59
47
|
end
|
60
|
-
|
61
|
-
def length
|
62
|
-
@matrix.length
|
48
|
+
|
49
|
+
def length
|
50
|
+
@matrix.length
|
63
51
|
end
|
64
52
|
alias_method :size, :length
|
65
53
|
|
66
54
|
def to_s(with_name = true)
|
67
|
-
|
55
|
+
matrix_str = each_position.map(&:join.("\t")).join("\n")
|
68
56
|
if with_name && @name
|
69
|
-
"#{@name}\n#{
|
57
|
+
"#{@name}\n#{matrix_str}"
|
70
58
|
else
|
71
|
-
|
59
|
+
matrix_str
|
72
60
|
end
|
73
61
|
end
|
74
62
|
|
75
63
|
def pretty_string(with_name = true)
|
76
64
|
header = %w{A C G T}.map{|el| el.rjust(4).ljust(7)}.join + "\n"
|
77
|
-
matrix_rows =
|
65
|
+
matrix_rows = each_position.map do |position|
|
78
66
|
position.map{|el| el.round(3).to_s.rjust(6)}.join(' ')
|
79
67
|
end
|
80
|
-
|
68
|
+
matrix_str = matrix_rows.join("\n")
|
81
69
|
if with_name && @name
|
82
|
-
@name + "\n" + header +
|
70
|
+
@name + "\n" + header + matrix_str
|
83
71
|
else
|
84
|
-
header +
|
72
|
+
header + matrix_str
|
85
73
|
end
|
86
74
|
end
|
87
75
|
|
@@ -133,17 +121,13 @@ module Bioinform
|
|
133
121
|
self
|
134
122
|
end
|
135
123
|
|
136
|
-
def background_sum
|
137
|
-
@background.inject(0.0, &:+)
|
138
|
-
end
|
139
|
-
|
140
124
|
def vocabulary_volume
|
141
|
-
|
125
|
+
background.inject(&:+) ** length
|
142
126
|
end
|
143
127
|
|
144
128
|
def probability
|
145
|
-
sum =
|
146
|
-
|
129
|
+
sum = background.inject(0.0, &:+)
|
130
|
+
background.map{|element| element.to_f / sum}
|
147
131
|
end
|
148
132
|
|
149
133
|
|
@@ -3,12 +3,12 @@ require 'bioinform/data_models/pm'
|
|
3
3
|
module Bioinform
|
4
4
|
class PWM < PM
|
5
5
|
def score_mean
|
6
|
-
|
6
|
+
each_position.inject(0){ |mean, position| mean + position.each_index.inject(0){|sum, letter| sum + position[letter] * probability[letter]} }
|
7
7
|
end
|
8
8
|
def score_variance
|
9
|
-
|
10
|
-
variance + position.each_index.inject(0
|
11
|
-
position.each_index.inject(0
|
9
|
+
each_position.inject(0) do |variance, position|
|
10
|
+
variance + position.each_index.inject(0) { |sum,letter| sum + position[letter]**2 * probability[letter] } -
|
11
|
+
position.each_index.inject(0) { |sum,letter| sum + position[letter] * probability[letter] }**2
|
12
12
|
end
|
13
13
|
end
|
14
14
|
|
@@ -20,9 +20,9 @@ module Bioinform
|
|
20
20
|
|
21
21
|
def score(word)
|
22
22
|
word = word.upcase
|
23
|
-
raise ArgumentError unless word.length == length
|
24
|
-
raise ArgumentError unless word.each_char.all?{|letter| %w{A C G T}.include? letter}
|
25
|
-
word.each_char.map.with_index{|letter, pos| matrix[pos][IndexByLetter[letter]] }.inject(
|
23
|
+
raise ArgumentError, 'word in PWM#score(word) should have the same length as matrix' unless word.length == length
|
24
|
+
raise ArgumentError, 'word in PWM#score(word) should have only ACGT-letters' unless word.each_char.all?{|letter| %w{A C G T}.include? letter}
|
25
|
+
word.each_char.map.with_index{|letter, pos| matrix[pos][IndexByLetter[letter]] }.inject(&:+)
|
26
26
|
end
|
27
27
|
end
|
28
28
|
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'bioinform/support'
|
2
|
+
|
3
|
+
module Bioinform
|
4
|
+
class Parser
|
5
|
+
attr_reader :input, :matrix
|
6
|
+
|
7
|
+
def initialize(input)
|
8
|
+
@input = input
|
9
|
+
end
|
10
|
+
|
11
|
+
def parse
|
12
|
+
inp = input
|
13
|
+
transpose = inp.is_a?(Hash)
|
14
|
+
inp = ClassMethods.try_convert_to_array(inp)
|
15
|
+
inp.map!{|x| ClassMethods.try_convert_to_array(x)}
|
16
|
+
transpose = true if (not inp.all?{|x| x.size == 4}) && inp.size == 4 && inp.same_by?(&:size)
|
17
|
+
@matrix = transpose ? inp.transpose : inp
|
18
|
+
result
|
19
|
+
rescue
|
20
|
+
{}
|
21
|
+
end
|
22
|
+
|
23
|
+
def result(options={})
|
24
|
+
raise 'Parsing Error' unless matrix.is_a?(Array) && matrix.all?(&:is_a?.(Array)) && matrix.all?{|pos| pos.size == 4} && matrix.all?(&:all?.(&:is_a?.(Numeric)))
|
25
|
+
options.merge(matrix: @matrix)
|
26
|
+
end
|
27
|
+
|
28
|
+
class ClassMethods
|
29
|
+
def self.array_from_acgt_hash(hsh)
|
30
|
+
hsh = hsh.collect_hash{|key,value| [key.to_s.upcase, value] }
|
31
|
+
raise 'some of hash keys A,C,G,T are missing or hash has excess keys' unless hsh.keys.sort == %w[A C G T]
|
32
|
+
%w[A C G T].collect{|letter| hsh[letter] }
|
33
|
+
end
|
34
|
+
def self.try_convert_to_array(inp)
|
35
|
+
return inp if inp.is_a? Array
|
36
|
+
array_from_acgt_hash(inp)
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
@@ -1,8 +1,7 @@
|
|
1
1
|
require 'bioinform/support'
|
2
|
-
require 'bioinform/
|
3
|
-
require 'bioinform/data_models/parsers/array_parser'
|
2
|
+
require 'bioinform/parsers/parser'
|
4
3
|
|
5
|
-
module Bioinform
|
4
|
+
module Bioinform
|
6
5
|
class StringParser < Parser
|
7
6
|
def number_pat
|
8
7
|
'[+-]?\d+(\.\d+)?([eE][+-]?\d{1,3})?'
|
@@ -28,18 +27,19 @@ module Bioinform
|
|
28
27
|
matrix.split("\n").map{|line| line.split.map(&:to_f)}
|
29
28
|
end
|
30
29
|
|
31
|
-
def
|
30
|
+
def parse
|
32
31
|
case input
|
33
|
-
when String
|
32
|
+
when String
|
34
33
|
match = input.multiline_squish.match(pattern)
|
35
34
|
raise ArgumentError unless match
|
36
35
|
matrix = matrix_preprocess( match[:matrix] )
|
37
36
|
raise ArgumentError unless matrix
|
38
|
-
|
39
|
-
match[:name] ? result.merge(name: match[:name]) : result
|
37
|
+
Parser.new(matrix).parse.merge(name: match[:name])
|
40
38
|
else
|
41
39
|
raise ArgumentError
|
42
40
|
end
|
41
|
+
rescue
|
42
|
+
{}
|
43
43
|
end
|
44
44
|
end
|
45
45
|
end
|
data/lib/bioinform/version.rb
CHANGED
data/lib/bioinform.rb
CHANGED
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'bioinform/data_models/pcm'
|
3
|
+
|
4
|
+
module Bioinform
|
5
|
+
describe PCM do
|
6
|
+
describe '#count' do
|
7
|
+
it 'should be equal to sum of elements at position' do
|
8
|
+
PCM.new([[1, 2, 3, 1],[4, 1, 1, 2]]).count.should == 7
|
9
|
+
PCM.new([[1, 2.3, 3.2, 1],[4.4, 1.1, 1, 2]]).count.should == 7.5
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
describe '#to_pwm' do
|
14
|
+
it 'should return PWM' do
|
15
|
+
PCM.new([[1, 2, 3, 1],[4, 1, 1, 2]]).to_pwm.should be_kind_of(PWM)
|
16
|
+
end
|
17
|
+
it 'should make transformation: el --> log( (el + p_i*pseudocount) / (p_i*(count + pseudocount)) )' do
|
18
|
+
PCM.new([[1, 2, 3, 1],[4, 1, 1, 2]]).to_pwm(1).matrix.map{|line|line.map{|el| el.round(3)}}.should == [[-0.47, 0.118,0.486,-0.47],[0.754,-0.47,-0.47,0.118]]
|
19
|
+
PCM.new([[1, 2, 3, 1],[4, 1, 1, 2]]).to_pwm(10).matrix.map{|line|line.map{|el| el.round(3)}}.should == [[-0.194, 0.057,0.258,-0.194],[0.425,-0.194,-0.194,0.057]]
|
20
|
+
end
|
21
|
+
it 'should use default pseudocount equal to log(count)' do
|
22
|
+
PCM.new([[1, 2, 3, 1],[4, 1, 1, 2]]).to_pwm.should == PCM.new([[1, 2, 3, 1],[4, 1, 1, 2]]).to_pwm(Math.log(7))
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
27
|
+
end
|