bioinform 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/bioinform/data_models/pcm.rb +12 -0
- data/lib/bioinform/data_models/pm.rb +38 -54
- data/lib/bioinform/data_models/pwm.rb +7 -7
- data/lib/bioinform/data_models.rb +2 -0
- data/lib/bioinform/parsers/parser.rb +40 -0
- data/lib/bioinform/{data_models/parsers → parsers}/string_fantom_parser.rb +1 -2
- data/lib/bioinform/{data_models/parsers → parsers}/string_parser.rb +7 -7
- data/lib/bioinform/parsers.rb +3 -0
- data/lib/bioinform/support/partial_sums.rb +2 -0
- data/lib/bioinform/version.rb +1 -1
- data/lib/bioinform.rb +0 -1
- data/spec/data_models/pcm_spec.rb +27 -0
- data/spec/data_models/pm_spec.rb +62 -144
- data/spec/data_models/pwm_spec.rb +3 -7
- data/spec/parsers/parser_spec.rb +58 -0
- data/spec/parsers/string_fantom_parser_spec.rb +28 -0
- data/spec/parsers/string_parser_spec.rb +46 -0
- data/spec/spec_helper.rb +13 -21
- data/spec/support/multiline_squish_spec.rb +12 -4
- data/spec/support/partial_sums_spec.rb +3 -0
- metadata +14 -18
- data/lib/bioinform/data_models/parser.rb +0 -38
- data/lib/bioinform/data_models/parsers/array_parser.rb +0 -17
- data/lib/bioinform/data_models/parsers/hash_parser.rb +0 -19
- data/lib/bioinform/data_models/parsers.rb +0 -6
- data/spec/data_models/parser_spec.rb +0 -46
- data/spec/data_models/parsers/array_parser_spec.rb +0 -53
- data/spec/data_models/parsers/hash_parser_spec.rb +0 -60
- data/spec/data_models/parsers/string_fantom_parser_spec.rb +0 -38
- data/spec/data_models/parsers/string_parser_spec.rb +0 -156
@@ -2,6 +2,18 @@ require 'bioinform/support'
|
|
2
2
|
require 'bioinform/data_models/pm'
|
3
3
|
module Bioinform
|
4
4
|
class PCM < PM
|
5
|
+
def count
|
6
|
+
matrix.first.inject(&:+)
|
7
|
+
end
|
5
8
|
|
9
|
+
def to_pwm(pseudocount = Math.log(count))
|
10
|
+
mat = each_position.map do |pos|
|
11
|
+
pos.each_index.map do |ind|
|
12
|
+
Math.log((pos[ind] + probability[ind] * pseudocount) / (probability[ind]*(count + pseudocount)) )
|
13
|
+
end
|
14
|
+
end
|
15
|
+
PWM.new(mat)
|
16
|
+
end
|
17
|
+
|
6
18
|
end
|
7
19
|
end
|
@@ -1,87 +1,75 @@
|
|
1
1
|
require 'bioinform/support'
|
2
|
+
require 'bioinform/parsers'
|
2
3
|
|
3
4
|
module Bioinform
|
4
5
|
IndexByLetter = {'A' => 0, 'C' => 1, 'G' => 2, 'T' => 3}
|
5
6
|
LetterByIndex = {0 => 'A', 1 => 'C', 2 => 'G', 3 => 'T'}
|
6
7
|
|
7
8
|
class PM
|
8
|
-
attr_reader :matrix
|
9
|
-
attr_accessor :name
|
9
|
+
attr_reader :matrix
|
10
|
+
attr_accessor :background, :name
|
10
11
|
|
11
|
-
def
|
12
|
+
def choose_parser(input)
|
13
|
+
input.is_a?(String) ? StringParser : Parser
|
14
|
+
[Parser, StringParser, StringFantomParser].find do |parser|
|
15
|
+
self.class.new(input, parser) rescue nil
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def initialize(input, parser = nil)
|
20
|
+
parser ||= choose_parser(input)
|
21
|
+
result = parser.new(input).parse
|
22
|
+
@matrix = result[:matrix]
|
23
|
+
@name = result[:name]
|
12
24
|
@background = [1, 1, 1, 1]
|
13
|
-
|
14
|
-
@parser = parser
|
15
|
-
return unless @input
|
16
|
-
parser_init
|
17
|
-
matrix_init
|
25
|
+
raise 'matrix not valid' unless valid?
|
18
26
|
end
|
19
27
|
|
20
28
|
def ==(other)
|
21
29
|
@matrix == other.matrix && @background == other.background
|
22
30
|
end
|
23
31
|
|
24
|
-
def parser_init
|
25
|
-
if @parser
|
26
|
-
raise ArgumentError, 'Input cannot be parsed by specified parser' unless @parser.new(@input).can_parse?
|
27
|
-
else
|
28
|
-
@parser = Parser.subclasses.find{|parser_class| parser_class.new(@input).can_parse? }
|
29
|
-
raise ArgumentError, 'No one parser can parse specified input' unless @parser
|
30
|
-
end
|
31
|
-
end
|
32
|
-
|
33
|
-
def matrix_init
|
34
|
-
parse_result = @parser.new(@input).parse
|
35
|
-
raise ArgumentError, 'Used parser result has no `matrix` key' unless parse_result.has_key? :matrix
|
36
|
-
|
37
|
-
configure_from_hash(parse_result)
|
38
|
-
end
|
39
|
-
|
40
32
|
def valid?
|
41
|
-
@matrix.is_a?(Array) &&
|
33
|
+
@matrix.is_a?(Array) &&
|
42
34
|
@matrix.all?(&:is_a?.(Array)) &&
|
43
|
-
@matrix.all?
|
44
|
-
@matrix.all?
|
35
|
+
@matrix.all?{|pos| pos.size == 4} &&
|
36
|
+
@matrix.all?(&:all?.(&:is_a?.(Numeric)))
|
45
37
|
rescue
|
46
38
|
false
|
47
39
|
end
|
48
40
|
|
49
|
-
def
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
raise ArgumentError, 'Matrix has invalid format:' unless valid?
|
56
|
-
rescue
|
57
|
-
@matrix = old_matrix
|
58
|
-
raise
|
41
|
+
def each_position
|
42
|
+
if block_given?
|
43
|
+
matrix.each{|pos| yield pos}
|
44
|
+
else
|
45
|
+
Enumerator.new(self, :each_position)
|
46
|
+
end
|
59
47
|
end
|
60
|
-
|
61
|
-
def length
|
62
|
-
@matrix.length
|
48
|
+
|
49
|
+
def length
|
50
|
+
@matrix.length
|
63
51
|
end
|
64
52
|
alias_method :size, :length
|
65
53
|
|
66
54
|
def to_s(with_name = true)
|
67
|
-
|
55
|
+
matrix_str = each_position.map(&:join.("\t")).join("\n")
|
68
56
|
if with_name && @name
|
69
|
-
"#{@name}\n#{
|
57
|
+
"#{@name}\n#{matrix_str}"
|
70
58
|
else
|
71
|
-
|
59
|
+
matrix_str
|
72
60
|
end
|
73
61
|
end
|
74
62
|
|
75
63
|
def pretty_string(with_name = true)
|
76
64
|
header = %w{A C G T}.map{|el| el.rjust(4).ljust(7)}.join + "\n"
|
77
|
-
matrix_rows =
|
65
|
+
matrix_rows = each_position.map do |position|
|
78
66
|
position.map{|el| el.round(3).to_s.rjust(6)}.join(' ')
|
79
67
|
end
|
80
|
-
|
68
|
+
matrix_str = matrix_rows.join("\n")
|
81
69
|
if with_name && @name
|
82
|
-
@name + "\n" + header +
|
70
|
+
@name + "\n" + header + matrix_str
|
83
71
|
else
|
84
|
-
header +
|
72
|
+
header + matrix_str
|
85
73
|
end
|
86
74
|
end
|
87
75
|
|
@@ -133,17 +121,13 @@ module Bioinform
|
|
133
121
|
self
|
134
122
|
end
|
135
123
|
|
136
|
-
def background_sum
|
137
|
-
@background.inject(0.0, &:+)
|
138
|
-
end
|
139
|
-
|
140
124
|
def vocabulary_volume
|
141
|
-
|
125
|
+
background.inject(&:+) ** length
|
142
126
|
end
|
143
127
|
|
144
128
|
def probability
|
145
|
-
sum =
|
146
|
-
|
129
|
+
sum = background.inject(0.0, &:+)
|
130
|
+
background.map{|element| element.to_f / sum}
|
147
131
|
end
|
148
132
|
|
149
133
|
|
@@ -3,12 +3,12 @@ require 'bioinform/data_models/pm'
|
|
3
3
|
module Bioinform
|
4
4
|
class PWM < PM
|
5
5
|
def score_mean
|
6
|
-
|
6
|
+
each_position.inject(0){ |mean, position| mean + position.each_index.inject(0){|sum, letter| sum + position[letter] * probability[letter]} }
|
7
7
|
end
|
8
8
|
def score_variance
|
9
|
-
|
10
|
-
variance + position.each_index.inject(0
|
11
|
-
position.each_index.inject(0
|
9
|
+
each_position.inject(0) do |variance, position|
|
10
|
+
variance + position.each_index.inject(0) { |sum,letter| sum + position[letter]**2 * probability[letter] } -
|
11
|
+
position.each_index.inject(0) { |sum,letter| sum + position[letter] * probability[letter] }**2
|
12
12
|
end
|
13
13
|
end
|
14
14
|
|
@@ -20,9 +20,9 @@ module Bioinform
|
|
20
20
|
|
21
21
|
def score(word)
|
22
22
|
word = word.upcase
|
23
|
-
raise ArgumentError unless word.length == length
|
24
|
-
raise ArgumentError unless word.each_char.all?{|letter| %w{A C G T}.include? letter}
|
25
|
-
word.each_char.map.with_index{|letter, pos| matrix[pos][IndexByLetter[letter]] }.inject(
|
23
|
+
raise ArgumentError, 'word in PWM#score(word) should have the same length as matrix' unless word.length == length
|
24
|
+
raise ArgumentError, 'word in PWM#score(word) should have only ACGT-letters' unless word.each_char.all?{|letter| %w{A C G T}.include? letter}
|
25
|
+
word.each_char.map.with_index{|letter, pos| matrix[pos][IndexByLetter[letter]] }.inject(&:+)
|
26
26
|
end
|
27
27
|
end
|
28
28
|
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'bioinform/support'
|
2
|
+
|
3
|
+
module Bioinform
|
4
|
+
class Parser
|
5
|
+
attr_reader :input, :matrix
|
6
|
+
|
7
|
+
def initialize(input)
|
8
|
+
@input = input
|
9
|
+
end
|
10
|
+
|
11
|
+
def parse
|
12
|
+
inp = input
|
13
|
+
transpose = inp.is_a?(Hash)
|
14
|
+
inp = ClassMethods.try_convert_to_array(inp)
|
15
|
+
inp.map!{|x| ClassMethods.try_convert_to_array(x)}
|
16
|
+
transpose = true if (not inp.all?{|x| x.size == 4}) && inp.size == 4 && inp.same_by?(&:size)
|
17
|
+
@matrix = transpose ? inp.transpose : inp
|
18
|
+
result
|
19
|
+
rescue
|
20
|
+
{}
|
21
|
+
end
|
22
|
+
|
23
|
+
def result(options={})
|
24
|
+
raise 'Parsing Error' unless matrix.is_a?(Array) && matrix.all?(&:is_a?.(Array)) && matrix.all?{|pos| pos.size == 4} && matrix.all?(&:all?.(&:is_a?.(Numeric)))
|
25
|
+
options.merge(matrix: @matrix)
|
26
|
+
end
|
27
|
+
|
28
|
+
class ClassMethods
|
29
|
+
def self.array_from_acgt_hash(hsh)
|
30
|
+
hsh = hsh.collect_hash{|key,value| [key.to_s.upcase, value] }
|
31
|
+
raise 'some of hash keys A,C,G,T are missing or hash has excess keys' unless hsh.keys.sort == %w[A C G T]
|
32
|
+
%w[A C G T].collect{|letter| hsh[letter] }
|
33
|
+
end
|
34
|
+
def self.try_convert_to_array(inp)
|
35
|
+
return inp if inp.is_a? Array
|
36
|
+
array_from_acgt_hash(inp)
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
@@ -1,8 +1,7 @@
|
|
1
1
|
require 'bioinform/support'
|
2
|
-
require 'bioinform/
|
3
|
-
require 'bioinform/data_models/parsers/array_parser'
|
2
|
+
require 'bioinform/parsers/parser'
|
4
3
|
|
5
|
-
module Bioinform
|
4
|
+
module Bioinform
|
6
5
|
class StringParser < Parser
|
7
6
|
def number_pat
|
8
7
|
'[+-]?\d+(\.\d+)?([eE][+-]?\d{1,3})?'
|
@@ -28,18 +27,19 @@ module Bioinform
|
|
28
27
|
matrix.split("\n").map{|line| line.split.map(&:to_f)}
|
29
28
|
end
|
30
29
|
|
31
|
-
def
|
30
|
+
def parse
|
32
31
|
case input
|
33
|
-
when String
|
32
|
+
when String
|
34
33
|
match = input.multiline_squish.match(pattern)
|
35
34
|
raise ArgumentError unless match
|
36
35
|
matrix = matrix_preprocess( match[:matrix] )
|
37
36
|
raise ArgumentError unless matrix
|
38
|
-
|
39
|
-
match[:name] ? result.merge(name: match[:name]) : result
|
37
|
+
Parser.new(matrix).parse.merge(name: match[:name])
|
40
38
|
else
|
41
39
|
raise ArgumentError
|
42
40
|
end
|
41
|
+
rescue
|
42
|
+
{}
|
43
43
|
end
|
44
44
|
end
|
45
45
|
end
|
data/lib/bioinform/version.rb
CHANGED
data/lib/bioinform.rb
CHANGED
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'bioinform/data_models/pcm'
|
3
|
+
|
4
|
+
module Bioinform
|
5
|
+
describe PCM do
|
6
|
+
describe '#count' do
|
7
|
+
it 'should be equal to sum of elements at position' do
|
8
|
+
PCM.new([[1, 2, 3, 1],[4, 1, 1, 2]]).count.should == 7
|
9
|
+
PCM.new([[1, 2.3, 3.2, 1],[4.4, 1.1, 1, 2]]).count.should == 7.5
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
describe '#to_pwm' do
|
14
|
+
it 'should return PWM' do
|
15
|
+
PCM.new([[1, 2, 3, 1],[4, 1, 1, 2]]).to_pwm.should be_kind_of(PWM)
|
16
|
+
end
|
17
|
+
it 'should make transformation: el --> log( (el + p_i*pseudocount) / (p_i*(count + pseudocount)) )' do
|
18
|
+
PCM.new([[1, 2, 3, 1],[4, 1, 1, 2]]).to_pwm(1).matrix.map{|line|line.map{|el| el.round(3)}}.should == [[-0.47, 0.118,0.486,-0.47],[0.754,-0.47,-0.47,0.118]]
|
19
|
+
PCM.new([[1, 2, 3, 1],[4, 1, 1, 2]]).to_pwm(10).matrix.map{|line|line.map{|el| el.round(3)}}.should == [[-0.194, 0.057,0.258,-0.194],[0.425,-0.194,-0.194,0.057]]
|
20
|
+
end
|
21
|
+
it 'should use default pseudocount equal to log(count)' do
|
22
|
+
PCM.new([[1, 2, 3, 1],[4, 1, 1, 2]]).to_pwm.should == PCM.new([[1, 2, 3, 1],[4, 1, 1, 2]]).to_pwm(Math.log(7))
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
27
|
+
end
|