bioinform 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/bioinform/data_models/pcm.rb +8 -1
- data/lib/bioinform/data_models/pm.rb +20 -15
- data/lib/bioinform/data_models/ppm.rb +3 -1
- data/lib/bioinform/data_models/pwm.rb +14 -2
- data/lib/bioinform/parsers/parser.rb +67 -28
- data/lib/bioinform/parsers/string_fantom_parser.rb +3 -9
- data/lib/bioinform/parsers/string_parser.rb +64 -24
- data/lib/bioinform/parsers/trivial_parser.rb +17 -0
- data/lib/bioinform/parsers.rb +1 -0
- data/lib/bioinform/support/advanced_scan.rb +8 -0
- data/lib/bioinform/support/multiline_squish.rb +1 -1
- data/lib/bioinform/support.rb +3 -1
- data/lib/bioinform/version.rb +1 -1
- data/spec/data_models/pcm_spec.rb +24 -6
- data/spec/data_models/pm_spec.rb +15 -10
- data/spec/data_models/ppm_spec.rb +8 -0
- data/spec/parsers/parser_spec.rb +89 -0
- data/spec/parsers/string_fantom_parser_spec.rb +16 -14
- data/spec/parsers/string_parser_spec.rb +46 -0
- data/spec/parsers/trivial_parser_spec.rb +22 -0
- data/spec/spec_helper.rb +16 -10
- data/spec/support/advanced_scan_spec.rb +32 -0
- data/spec/support/multiline_squish_spec.rb +6 -0
- metadata +10 -2
@@ -1,5 +1,7 @@
|
|
1
1
|
require 'bioinform/support'
|
2
2
|
require 'bioinform/data_models/pm'
|
3
|
+
require 'bioinform/data_models/ppm'
|
4
|
+
require 'bioinform/data_models/pwm'
|
3
5
|
module Bioinform
|
4
6
|
class PCM < PM
|
5
7
|
def count
|
@@ -12,7 +14,12 @@ module Bioinform
|
|
12
14
|
Math.log((pos[ind] + probability[ind] * pseudocount) / (probability[ind]*(count + pseudocount)) )
|
13
15
|
end
|
14
16
|
end
|
15
|
-
PWM.new(mat)
|
17
|
+
PWM.new(matrix: mat, name: name)
|
18
|
+
end
|
19
|
+
|
20
|
+
def to_ppm
|
21
|
+
mat = each_position.map{|pos| pos.map{|el| el.to_f / count }}
|
22
|
+
PPM.new(matrix: mat, name: name)
|
16
23
|
end
|
17
24
|
|
18
25
|
end
|
@@ -2,23 +2,23 @@ require 'bioinform/support'
|
|
2
2
|
require 'bioinform/parsers'
|
3
3
|
|
4
4
|
module Bioinform
|
5
|
-
IndexByLetter = {'A' => 0, 'C' => 1, 'G' => 2, 'T' => 3}
|
6
|
-
LetterByIndex = {0 =>
|
7
|
-
|
5
|
+
IndexByLetter = {'A' => 0, 'C' => 1, 'G' => 2, 'T' => 3, A: 0, C: 1, G: 2, T: 3}
|
6
|
+
LetterByIndex = {0 => :A, 1 => :C, 2 => :G, 3 => :T}
|
7
|
+
|
8
8
|
class PM
|
9
9
|
attr_reader :matrix
|
10
10
|
attr_accessor :background, :name
|
11
11
|
|
12
|
-
def choose_parser(input)
|
13
|
-
|
14
|
-
|
15
|
-
self.class.new(input, parser) rescue nil
|
12
|
+
def self.choose_parser(input)
|
13
|
+
[TrivialParser, Parser, StringParser, StringFantomParser].find do |parser|
|
14
|
+
self.new(input, parser) rescue nil
|
16
15
|
end
|
17
16
|
end
|
18
17
|
|
19
18
|
def initialize(input, parser = nil)
|
20
|
-
parser ||= choose_parser(input)
|
21
|
-
|
19
|
+
parser ||= self.class.choose_parser(input)
|
20
|
+
raise 'No one parser can process input' unless parser
|
21
|
+
result = parser.new(input).parse
|
22
22
|
@matrix = result[:matrix]
|
23
23
|
@name = result[:name]
|
24
24
|
@background = [1, 1, 1, 1]
|
@@ -29,15 +29,20 @@ module Bioinform
|
|
29
29
|
@matrix == other.matrix && @background == other.background
|
30
30
|
end
|
31
31
|
|
32
|
-
def
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
32
|
+
def self.valid_matrix?(matrix)
|
33
|
+
matrix.is_a?(Array) &&
|
34
|
+
! matrix.empty? &&
|
35
|
+
matrix.all?(&:is_a?.(Array)) &&
|
36
|
+
matrix.all?{|pos| pos.size == 4} &&
|
37
|
+
matrix.all?(&:all?.(&:is_a?.(Numeric)))
|
37
38
|
rescue
|
38
39
|
false
|
39
40
|
end
|
40
41
|
|
42
|
+
def valid?
|
43
|
+
self.class.valid_matrix?(@matrix)
|
44
|
+
end
|
45
|
+
|
41
46
|
def each_position
|
42
47
|
if block_given?
|
43
48
|
matrix.each{|pos| yield pos}
|
@@ -54,7 +59,7 @@ module Bioinform
|
|
54
59
|
def to_s(with_name = true)
|
55
60
|
matrix_str = each_position.map(&:join.("\t")).join("\n")
|
56
61
|
if with_name && @name
|
57
|
-
|
62
|
+
@name + "\n" + matrix_str
|
58
63
|
else
|
59
64
|
matrix_str
|
60
65
|
end
|
@@ -21,8 +21,20 @@ module Bioinform
|
|
21
21
|
def score(word)
|
22
22
|
word = word.upcase
|
23
23
|
raise ArgumentError, 'word in PWM#score(word) should have the same length as matrix' unless word.length == length
|
24
|
-
raise ArgumentError, 'word in PWM#score(word) should have only ACGT-letters' unless word.each_char.all?{|letter| %w{A C G T}.include? letter}
|
25
|
-
|
24
|
+
#raise ArgumentError, 'word in PWM#score(word) should have only ACGT-letters' unless word.each_char.all?{|letter| %w{A C G T}.include? letter}
|
25
|
+
(0...length).map do |pos|
|
26
|
+
begin
|
27
|
+
# TODO: add support for N and other IUPAC ambiguity letters
|
28
|
+
letter = word[pos]
|
29
|
+
matrix[pos][IndexByLetter[letter]]
|
30
|
+
rescue
|
31
|
+
raise ArgumentError, 'word in PWM#score(word) should have only ACGT-letters'
|
32
|
+
end
|
33
|
+
end.inject(&:+)
|
34
|
+
end
|
35
|
+
|
36
|
+
def to_pwm
|
37
|
+
self
|
26
38
|
end
|
27
39
|
end
|
28
40
|
end
|
@@ -1,40 +1,79 @@
|
|
1
1
|
require 'bioinform/support'
|
2
|
+
require 'bioinform/data_models/pm'
|
2
3
|
|
3
4
|
module Bioinform
|
4
5
|
class Parser
|
5
|
-
attr_reader :input
|
6
|
+
attr_reader :input
|
6
7
|
|
7
|
-
def initialize(input)
|
8
|
-
|
8
|
+
def initialize(*input)
|
9
|
+
if input.size == 1 # [ [1,2,3,4] ], [ [[1,2,3,4],[5,6,7,8]] ]
|
10
|
+
if input.first.is_a?(Array) && input.first.all?{|el| el.is_a? Numeric} # [ [1,2,3,4] ]
|
11
|
+
@input = input
|
12
|
+
else # [ [[1,2,3,4],[5,6,7,8]] ]
|
13
|
+
@input = input.first
|
14
|
+
end
|
15
|
+
else #[ [1,2,3,4], [5,6,7,8] ], [ ]
|
16
|
+
@input = input
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
def parse!
|
21
|
+
matrix = self.class.transform_input(input)
|
22
|
+
raise 'Parsing error' unless self.class.valid_matrix?(matrix)
|
23
|
+
{matrix: matrix}
|
9
24
|
end
|
10
25
|
|
11
26
|
def parse
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
27
|
+
parse! rescue nil
|
28
|
+
end
|
29
|
+
|
30
|
+
def self.parse!(*input)
|
31
|
+
self.new(*input).parse!
|
32
|
+
end
|
33
|
+
def self.parse(*input)
|
34
|
+
self.new(*input).parse
|
35
|
+
end
|
36
|
+
|
37
|
+
def self.valid_matrix?(matrix)
|
38
|
+
PM.valid_matrix?(matrix)
|
39
|
+
end
|
40
|
+
|
41
|
+
# {A: 1, C: 2, G: 3, T: 4} --> [1,2,3,4]
|
42
|
+
# {A: [1,2], C: [3,4], G: [5,6], T: [7,8]} --> [[1,3,5,7],[2,4,6,8]] ( == [[1,2], [3,4], [5,6], [7,8]].transpose)
|
43
|
+
def self.array_from_acgt_hash(hsh)
|
44
|
+
hsh = normalize_hash_keys(hsh)
|
45
|
+
raise 'some of hash keys A,C,G,T are missing or hash has excess keys' unless hsh.keys.sort == [:A,:C,:G,:T]
|
46
|
+
result = [:A,:C,:G,:T].collect{|letter| hsh[letter] }
|
47
|
+
result.all?{|el| el.is_a?(Array)} ? result.transpose : result
|
48
|
+
end
|
49
|
+
|
50
|
+
# {a: 1, C: 2, 'g' => 3, 'T' => 4} --> {A: 1, C: 2, G: 3, T: 4}
|
51
|
+
def self.normalize_hash_keys(hsh)
|
52
|
+
hsh.collect_hash{|key,value| [key.to_s.upcase.to_sym, value] }
|
53
|
+
end
|
54
|
+
|
55
|
+
# [[1,2,3,4], [2,3,4,5]] --> [[1,2,3,4], [2,3,4,5]]
|
56
|
+
# [{A:1, C:2, G:3, T:4}, {A:2, C:3, G:4, T:5}] --> [{A:1, C:2, G:3, T:4}, {A:2, C:3, G:4, T:5}]
|
57
|
+
# {:A => [1,2,3], :c => [2,3,4], 'g' => [3,4,5], 'T' => [4,5,6]} --> [[1,2,3],[2,3,4],[3,4,5],[4,5,6]].transpose
|
58
|
+
def self.try_convert_to_array(input)
|
59
|
+
case input
|
60
|
+
when Array then input
|
61
|
+
when Hash then array_from_acgt_hash(input)
|
62
|
+
else raise TypeError, 'input of Bioinform::Parser::array_from_acgt_hash should be Array or Hash'
|
37
63
|
end
|
38
64
|
end
|
65
|
+
|
66
|
+
def self.transform_input(input)
|
67
|
+
result = try_convert_to_array(input).map{|el| try_convert_to_array(el)}
|
68
|
+
need_tranpose?(result) ? result.transpose : result
|
69
|
+
end
|
70
|
+
|
71
|
+
# Tells whether the first index of the input matrix runs over positions (no transpose needed -- false) or over letters (transpose needed -- true)
|
72
|
+
# [[1,3,5,7], [2,4,6,8]] --> false
|
73
|
+
# [[1,2],[3,4],[5,6],[7,8]] --> true
|
74
|
+
def self.need_tranpose?(input)
|
75
|
+
(input.size == 4) && input.any?{|x| x.size != 4}
|
76
|
+
end
|
77
|
+
|
39
78
|
end
|
40
79
|
end
|
@@ -3,18 +3,12 @@ require 'bioinform/parsers/string_parser'
|
|
3
3
|
|
4
4
|
module Bioinform
|
5
5
|
class StringFantomParser < StringParser
|
6
|
-
def row_pat
|
7
|
-
'[\w\d]+ ' + "(#{number_pat} )*#{number_pat}"
|
8
|
-
end
|
9
|
-
def name_pat
|
10
|
-
'NA (?<name>[\w.+:-]+)'
|
11
|
-
end
|
12
6
|
def header_pat
|
13
|
-
|
7
|
+
/NA (?<name>[\w.+:-]+)\n[\w\d]+ A C G T\n/
|
14
8
|
end
|
15
9
|
|
16
|
-
def
|
17
|
-
|
10
|
+
def row_pat
|
11
|
+
/[\w\d]+ (?<row>(#{number_pat} )*#{number_pat})\n?/
|
18
12
|
end
|
19
13
|
end
|
20
14
|
end
|
@@ -1,45 +1,85 @@
|
|
1
|
+
require 'strscan'
|
1
2
|
require 'bioinform/support'
|
2
3
|
require 'bioinform/parsers/parser'
|
3
4
|
|
4
5
|
module Bioinform
|
5
6
|
class StringParser < Parser
|
7
|
+
attr_reader :scanner
|
8
|
+
def initialize(input)
|
9
|
+
raise ArgumentError unless input.is_a?(String)
|
10
|
+
super
|
11
|
+
@scanner = StringScanner.new(input.multiline_squish)
|
12
|
+
end
|
13
|
+
|
6
14
|
def number_pat
|
7
|
-
|
15
|
+
/[+-]?\d+(\.\d+)?([eE][+-]?\d{1,3})?/
|
8
16
|
end
|
17
|
+
|
18
|
+
def header_pat
|
19
|
+
/>?\s*(?<name>\S+)\n/
|
20
|
+
end
|
21
|
+
|
9
22
|
def row_pat
|
10
|
-
|
23
|
+
/(?<row>(#{number_pat} )*#{number_pat})\n?/
|
11
24
|
end
|
12
|
-
|
13
|
-
|
25
|
+
|
26
|
+
def scan_row
|
27
|
+
match = scanner.advanced_scan(row_pat)
|
28
|
+
match && match[:row]
|
14
29
|
end
|
15
|
-
|
16
|
-
|
30
|
+
|
31
|
+
def split_row(row_string)
|
32
|
+
row_string.split.map(&:to_f)
|
17
33
|
end
|
18
|
-
|
19
|
-
|
34
|
+
|
35
|
+
def scan_any_spaces
|
36
|
+
scanner.scan(/\s+/)
|
20
37
|
end
|
21
|
-
|
22
|
-
|
38
|
+
|
39
|
+
def parse_name
|
40
|
+
match = scanner.advanced_scan(header_pat)
|
41
|
+
match && match[:name]
|
23
42
|
end
|
24
43
|
|
25
|
-
|
26
|
-
|
27
|
-
|
44
|
+
def parse_matrix
|
45
|
+
matrix = []
|
46
|
+
while row_string = scan_row
|
47
|
+
matrix << split_row(row_string)
|
48
|
+
end
|
49
|
+
matrix
|
28
50
|
end
|
29
51
|
|
30
|
-
def parse
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
52
|
+
def parse!
|
53
|
+
scan_any_spaces
|
54
|
+
name = parse_name
|
55
|
+
matrix = parse_matrix
|
56
|
+
Parser.parse!(matrix).merge(name: name)
|
57
|
+
end
|
58
|
+
|
59
|
+
def scanner_reset
|
60
|
+
scanner.reset
|
61
|
+
end
|
62
|
+
|
63
|
+
def each
|
64
|
+
if block_given?
|
65
|
+
scanner_reset
|
66
|
+
while result = parse
|
67
|
+
yield result
|
68
|
+
end
|
38
69
|
else
|
39
|
-
|
70
|
+
Enumerator.new(self, :each)
|
40
71
|
end
|
41
|
-
rescue
|
42
|
-
{}
|
43
72
|
end
|
73
|
+
include Enumerable
|
74
|
+
|
75
|
+
alias_method :split, :to_a
|
76
|
+
def self.split(input)
|
77
|
+
self.new(input).split
|
78
|
+
end
|
79
|
+
|
80
|
+
def self.split_on_motifs(input, pm_klass = PM)
|
81
|
+
split(input).map{|el| pm_klass.new(el)}
|
82
|
+
end
|
83
|
+
|
44
84
|
end
|
45
85
|
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'bioinform/support'
|
2
|
+
require 'bioinform/parsers/parser'
|
3
|
+
|
4
|
+
module Bioinform
|
5
|
+
# TrivialParser can be used to parse hashes returned by #parse method of other parsers:
|
6
|
+
# PM.new({matrix:[[1,2,3,4],[5,6,7,8]], name: 'Name'}, TrivialParser)
|
7
|
+
# PM.new(StringParser.new("1 2 3 4\n5 6 7 8").parse)
|
8
|
+
# StringParser.new("First\n1 2 3 4\n5 6 7 8\nSecond\n0 0 0 0").map{|inp| PM.new(inp, TrivialParser)}
|
9
|
+
class TrivialParser < Parser
|
10
|
+
def initialize(input)
|
11
|
+
@input = input
|
12
|
+
end
|
13
|
+
def parse!
|
14
|
+
input
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
data/lib/bioinform/parsers.rb
CHANGED
data/lib/bioinform/support.rb
CHANGED
data/lib/bioinform/version.rb
CHANGED
@@ -5,23 +5,41 @@ module Bioinform
|
|
5
5
|
describe PCM do
|
6
6
|
describe '#count' do
|
7
7
|
it 'should be equal to sum of elements at position' do
|
8
|
-
PCM.new([[1, 2, 3, 1],[4,
|
9
|
-
PCM.new([[1, 2.3, 3.2, 1],[4.4,
|
8
|
+
PCM.new([[1, 2, 3, 1],[4, 0, 1, 2]]).count.should == 7
|
9
|
+
PCM.new([[1, 2.3, 3.2, 1],[4.4, 0.1, 1, 2]]).count.should == 7.5
|
10
10
|
end
|
11
11
|
end
|
12
12
|
|
13
13
|
describe '#to_pwm' do
|
14
14
|
it 'should return PWM' do
|
15
|
-
PCM.new([[1, 2, 3, 1],[4,
|
15
|
+
PCM.new([[1, 2, 3, 1],[4, 0, 1, 2]]).to_pwm.should be_kind_of(PWM)
|
16
16
|
end
|
17
17
|
it 'should make transformation: el --> log( (el + p_i*pseudocount) / (p_i*(count + pseudocount)) )' do
|
18
|
-
PCM.new([[1, 2, 3, 1],[4,
|
19
|
-
PCM.new([[1, 2, 3, 1],[4,
|
18
|
+
PCM.new([[1, 2, 3, 1],[4, 0, 1, 2]]).to_pwm(1).matrix.map{|line|line.map{|el| el.round(3)}}.should == [[-0.47, 0.118, 0.486, -0.47],[0.754, -2.079, -0.47, 0.118]]
|
19
|
+
PCM.new([[1, 2, 3, 1],[4, 0, 1, 2]]).to_pwm(10).matrix.map{|line|line.map{|el| el.round(3)}}.should == [[-0.194, 0.057, 0.258, -0.194],[0.425, -0.531, -0.194, 0.057]]
|
20
20
|
end
|
21
21
|
it 'should use default pseudocount equal to log(count)' do
|
22
|
-
PCM.new([[1, 2, 3, 1],[4,
|
22
|
+
PCM.new([[1, 2, 3, 1],[4, 0, 1, 2]]).to_pwm.should == PCM.new([[1, 2, 3, 1],[4, 0, 1, 2]]).to_pwm(Math.log(7))
|
23
|
+
end
|
24
|
+
it 'should preserve name' do
|
25
|
+
PCM.new(matrix: [[1, 2, 3, 1],[4, 0, 1, 2]], name: nil).to_pwm.name.should be_nil
|
26
|
+
PCM.new(matrix: [[1, 2, 3, 1],[4, 0, 1, 2]], name: 'Stub name').to_pwm.name.should == 'Stub name'
|
23
27
|
end
|
24
28
|
end
|
25
29
|
|
30
|
+
describe '#to_ppm' do
|
31
|
+
it 'should return PPM' do
|
32
|
+
PCM.new([[1, 2, 3, 1],[4, 0, 1, 2]]).to_ppm.should be_kind_of(PPM)
|
33
|
+
end
|
34
|
+
it 'should make transformation el --> el / count' do
|
35
|
+
PCM.new([[1, 2, 3, 1],[4, 0, 1, 2]]).to_ppm.should == PPM.new([[1.0/7, 2.0/7, 3.0/7, 1.0/7],[4.0/7, 0.0/7, 1.0/7, 2.0/7]])
|
36
|
+
end
|
37
|
+
it 'should preserve name' do
|
38
|
+
PCM.new(matrix: [[1, 2, 3, 1],[4, 0, 1, 2]], name: nil).to_ppm.name.should be_nil
|
39
|
+
PCM.new(matrix: [[1, 2, 3, 1],[4, 0, 1, 2]], name: 'Stub name').to_ppm.name.should == 'Stub name'
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
|
26
44
|
end
|
27
45
|
end
|
data/spec/data_models/pm_spec.rb
CHANGED
@@ -3,18 +3,17 @@ require 'bioinform/data_models/pm'
|
|
3
3
|
|
4
4
|
module Bioinform
|
5
5
|
describe PM do
|
6
|
-
|
6
|
+
|
7
|
+
describe '::valid_matrix?' do
|
7
8
|
it 'should be true iff an argument is an array of arrays of 4 numerics in a column' do
|
8
|
-
|
9
|
-
PM.
|
10
|
-
PM.
|
11
|
-
PM.
|
12
|
-
PM.
|
13
|
-
PM.
|
14
|
-
PM.new([[0,0,0,0]]).instance_eval{@matrix = [[1,2,'3','4'],[1,'4','5',6.5]]; self }.valid?.should be_false
|
15
|
-
|
9
|
+
PM.valid_matrix?( [[1,2,3,4],[1,4,5,6.5]] ).should be_true
|
10
|
+
PM.valid_matrix?( {A: [1,1], C: [2,4], G: [3,5], T: [4, 6.5]} ).should be_false
|
11
|
+
PM.valid_matrix?( [{A:1,C:2,G:3,T:4},{A:1,C:4,G:5,T: 6.5}] ).should be_false
|
12
|
+
PM.valid_matrix?( [[1,2,3,4],[1,4,6.5]] ).should be_false
|
13
|
+
PM.valid_matrix?( [[1,2,3],[1,4,6.5]] ).should be_false
|
14
|
+
PM.valid_matrix?( [[1,2,'3','4'],[1,'4','5',6.5]] ).should be_false
|
16
15
|
end
|
17
|
-
end
|
16
|
+
end
|
18
17
|
|
19
18
|
describe '#to_s' do
|
20
19
|
before :each do
|
@@ -249,6 +248,12 @@ module Bioinform
|
|
249
248
|
@pm.best_suffix(2).should == (-1.0)
|
250
249
|
@pm.best_suffix(3).should == (0.0)
|
251
250
|
end
|
251
|
+
it 'should give right results after left(right)_augment, discrete, reverse_complement etc' do
|
252
|
+
pm = PM.new([[1, 2, 3, 4], [10,10.5,11,11.5]])
|
253
|
+
pm.best_suffix(1).should == 11.5
|
254
|
+
pm.left_augment!(1)
|
255
|
+
pm.best_suffix(1).should == 15.5
|
256
|
+
end
|
252
257
|
end
|
253
258
|
describe '#worst_suffix' do
|
254
259
|
it 'should return minimal score of suffices from i-th position inclusively i.e. [i..end]' do
|
data/spec/parsers/parser_spec.rb
CHANGED
@@ -3,6 +3,86 @@ require 'bioinform/parsers/parser'
|
|
3
3
|
|
4
4
|
module Bioinform
|
5
5
|
describe Parser do
|
6
|
+
context '#initialize' do
|
7
|
+
it 'should accept an array correctly' do
|
8
|
+
Parser.new([[1,2,3,4],[5,6,7,8]]).parse[:matrix].should == [[1,2,3,4],[5,6,7,8]]
|
9
|
+
end
|
10
|
+
it 'should treat several arguments as an array composed of them' do
|
11
|
+
Parser.new([1,2,3,4],[5,6,7,8]).parse.should == Parser.new([[1,2,3,4],[5,6,7,8]]).parse
|
12
|
+
end
|
13
|
+
it 'should treat one Array of numbers as an Array(with 1 element) of Arrays' do
|
14
|
+
Parser.new([1,2,3,4]).parse.should == Parser.new([[1,2,3,4]]).parse
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
context '::parse!' do
|
19
|
+
it 'should behave like Parser.new(input).parse!' do
|
20
|
+
Parser.parse!([1,2,3,4],[5,6,7,8]).should == Parser.new([1,2,3,4],[5,6,7,8]).parse!
|
21
|
+
expect{ Parser.parse!([1,2,3],[4,5,6]) }.to raise_error
|
22
|
+
end
|
23
|
+
end
|
24
|
+
context '::parse' do
|
25
|
+
it 'should behave like Parser.new(input).parse!' do
|
26
|
+
Parser.parse([1,2,3,4],[5,6,7,8]).should == Parser.new([1,2,3,4],[5,6,7,8]).parse
|
27
|
+
Parser.parse([1,2,3],[4,5,6]).should be_nil
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
context '::normalize_hash_keys' do
|
32
|
+
it 'should convert both symbolic and string keys, in both upcase and downcase to symbolic upcases' do
|
33
|
+
Parser.normalize_hash_keys( {a: 1, C: 2, 'g' => 3, 'T' => 4} ).should == {A: 1, C: 2, G: 3, T: 4}
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
context '::need_transpose?' do
|
38
|
+
it 'should point whether matrix have positions(need not be transposed -- false) or letters(true) as first index' do
|
39
|
+
Parser.need_tranpose?([[1,3,5,7], [2,4,6,8]]).should be_false
|
40
|
+
Parser.need_tranpose?([[1,2],[3,4],[5,6],[7,8]]).should be_true
|
41
|
+
end
|
42
|
+
end
|
43
|
+
context '::array_from_acgt_hash' do
|
44
|
+
it 'should convert hash of arrays to a transposed array of arrays' do
|
45
|
+
input = {A: [1,2,3], C: [2,3,4], G: [3,4,5], T: [4,5,6]}
|
46
|
+
Parser.array_from_acgt_hash(input).should == [[1,2,3], [2,3,4], [3,4,5], [4,5,6]].transpose
|
47
|
+
end
|
48
|
+
it 'should convert hash of numbers to an array of numbers' do
|
49
|
+
input = {A: 1, C: 2, G: 3, T: 4}
|
50
|
+
Parser.array_from_acgt_hash(input).should == [1,2,3,4]
|
51
|
+
end
|
52
|
+
it 'should process both symbolic and string keys, in both upcase and downcase' do
|
53
|
+
input_normal_keys = {A: 1, C: 2, G: 3, T: 4}
|
54
|
+
input_different_keys = {:A => 1, :c => 2, 'g' => 3, 'T' => 4}
|
55
|
+
Parser.array_from_acgt_hash(input_different_keys).should == Parser.array_from_acgt_hash(input_normal_keys)
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
context '::try_convert_to_array' do
|
60
|
+
it 'should not change array' do
|
61
|
+
inputs = []
|
62
|
+
inputs << [[1,2,3,4], [2,3,4,5], [3,4,5,6]]
|
63
|
+
inputs << [{A:1, C:2, G:3, T:4}, {A:2, C:3, G:4, T:5}, {A:3, C:4, G:5, T:6}]
|
64
|
+
inputs.each do |input|
|
65
|
+
Parser.try_convert_to_array( input ).should == input
|
66
|
+
end
|
67
|
+
end
|
68
|
+
it 'should convert ACGT-Hashes to an array of positions (not letters)' do
|
69
|
+
Parser.try_convert_to_array( {:A => [1,2,3], :c => [2,3,4], 'g' => [3,4,5], 'T' => [4,5,6]} ).should == [[1,2,3],[2,3,4],[3,4,5],[4,5,6]].transpose
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
context '#parse' do
|
74
|
+
it 'should give the same result as #parse!' do
|
75
|
+
parser = Parser.new('stub parser')
|
76
|
+
parser.stub(:parse!).and_return('stub result')
|
77
|
+
parser.parse.should == 'stub result'
|
78
|
+
end
|
79
|
+
it 'should return nil if #parse! raised an exception' do
|
80
|
+
parser = Parser.new('stub parser')
|
81
|
+
parser.stub(:parse!).and_raise
|
82
|
+
parser.parse.should be_nil
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
6
86
|
good_cases = {
|
7
87
|
'Array Nx4' => {input: [[0,1,2,3],[10,11,12,13]],
|
8
88
|
matrix: [[0,1,2,3],[10,11,12,13]] },
|
@@ -30,6 +110,10 @@ module Bioinform
|
|
30
110
|
}
|
31
111
|
|
32
112
|
bad_cases = {
|
113
|
+
'Nil object on input' => {input: nil},
|
114
|
+
|
115
|
+
'Empty array on input' => {input: []},
|
116
|
+
|
33
117
|
'Different sizes of row arrays' => {input: [[1,2,3,4],[5,6,7,8,9]] },
|
34
118
|
|
35
119
|
'Different sizes of column arrays' => {input: [[0,10],[1,11],[2,12],[3]] },
|
@@ -54,5 +138,10 @@ module Bioinform
|
|
54
138
|
}
|
55
139
|
|
56
140
|
parser_specs(Parser, good_cases, bad_cases)
|
141
|
+
context '#parser!' do
|
142
|
+
it "should raise an exception on parsing empty list to parser" do
|
143
|
+
expect{ Parser.new().parse! }.to raise_error
|
144
|
+
end
|
145
|
+
end
|
57
146
|
end
|
58
147
|
end
|
@@ -5,20 +5,22 @@ module Bioinform
|
|
5
5
|
describe StringFantomParser do
|
6
6
|
good_cases = {
|
7
7
|
'string in Fantom-format' => {input: "
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
8
|
+
NA motif_CTNCAG
|
9
|
+
P0 A C G T
|
10
|
+
P1 0 1878368 0 0
|
11
|
+
P2 0 0 0 1878368
|
12
|
+
P3 469592 469592 469592 469592
|
13
|
+
P4 0 1878368 0 0
|
14
|
+
P5 1878368 0 0 0
|
15
|
+
P6 0 0 1878368 0",
|
16
|
+
matrix: [ [0.0, 1878368.0, 0.0, 0.0],
|
17
|
+
[0.0, 0.0, 0.0, 1878368.0],
|
18
|
+
[469592.0, 469592.0, 469592.0, 469592.0],
|
19
|
+
[0.0, 1878368.0, 0.0, 0.0],
|
20
|
+
[1878368.0, 0.0, 0.0, 0.0],
|
21
|
+
[0.0, 0.0, 1878368.0, 0.0]],
|
22
|
+
name: 'motif_CTNCAG'
|
23
|
+
}
|
22
24
|
}
|
23
25
|
|
24
26
|
bad_cases = { }
|
@@ -3,6 +3,52 @@ require 'bioinform/parsers/string_parser'
|
|
3
3
|
|
4
4
|
module Bioinform
|
5
5
|
describe StringParser do
|
6
|
+
|
7
|
+
describe '#each' do
|
8
|
+
it 'should yield consequent results of #parse! while it returns result' do
|
9
|
+
parser = StringParser.new("1 2 3 4\n5 6 7 8\n\n1 2 3 4\n1 2 3 4\nName\n4 3 2 1\n1 1 1 1\n0 0 0 0")
|
10
|
+
expect{|b| parser.each(&b)}.to yield_successive_args({matrix:[[1,2,3,4],[5,6,7,8]], name:nil}, {matrix:[[1,2,3,4],[1,2,3,4]], name:nil}, {matrix:[[4,3,2,1],[1,1,1,1],[0,0,0,0]], name:'Name'} )
|
11
|
+
end
|
12
|
+
it 'should restart parser from the beginning each time' do
|
13
|
+
parser = StringParser.new("1 2 3 4\n5 6 7 8\n\n1 2 3 4\n1 2 3 4\nName\n4 3 2 1\n1 1 1 1\n0 0 0 0")
|
14
|
+
3.times do
|
15
|
+
expect{|b| parser.each(&b)}.to yield_successive_args({matrix:[[1,2,3,4],[5,6,7,8]], name:nil}, {matrix:[[1,2,3,4],[1,2,3,4]], name:nil}, {matrix:[[4,3,2,1],[1,1,1,1],[0,0,0,0]], name:'Name'} )
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
context '::split' do
|
21
|
+
it 'should be able to get a single PM' do
|
22
|
+
StringParser.split("1 2 3 4 \n 5 6 7 8 \n 9 10 11 12").should == [ {matrix: [[1,2,3,4],[5,6,7,8],[9,10,11,12]], name:nil} ]
|
23
|
+
end
|
24
|
+
|
25
|
+
it 'should be able to split several PMs separated with an empty line' do
|
26
|
+
StringParser.split("1 2 3 4 \n 5 6 7 8 \n 9 10 11 12 \n\n 9 10 11 12 \n 1 2 3 4 \n 5 6 7 8").should == [ {matrix:[[1,2,3,4],[5,6,7,8],[9,10,11,12]],name:nil}, {matrix:[[9,10,11,12],[1,2,3,4],[5,6,7,8]],name:nil} ]
|
27
|
+
end
|
28
|
+
|
29
|
+
it 'should be able to split several PMs separated with name' do
|
30
|
+
StringParser.split("1 2 3 4 \n 5 6 7 8 \n 9 10 11 12 \nName\n 9 10 11 12 \n 1 2 3 4 \n 5 6 7 8").should == [ {matrix:[[1,2,3,4],[5,6,7,8],[9,10,11,12]],name:nil}, {matrix:[[9,10,11,12],[1,2,3,4],[5,6,7,8]],name:'Name'} ]
|
31
|
+
|
32
|
+
StringParser.split("1 2 3 4 \n 5 6 7 8 \n 9 10 11 12 \n\nName\n 9 10 11 12 \n 1 2 3 4 \n 5 6 7 8\n\n\n").should == [ {matrix:[[1,2,3,4],[5,6,7,8],[9,10,11,12]],name:nil}, {matrix:[[9,10,11,12],[1,2,3,4],[5,6,7,8]],name:'Name'} ]
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
context '::split_on_motifs' do
|
37
|
+
it 'should be able to split string into PMs' do
|
38
|
+
result = StringParser.split_on_motifs("1 2 3 4 \n 5 6 7 8 \n 9 10 11 12 \nName\n 9 10 11 12 \n 1 2 3 4 \n 5 6 7 8")
|
39
|
+
result.map{|pm| pm.matrix}.should == [ [[1,2,3,4],[5,6,7,8],[9,10,11,12]], [[9,10,11,12],[1,2,3,4],[5,6,7,8]] ]
|
40
|
+
result.map{|pm| pm.name}.should == [nil, 'Name']
|
41
|
+
end
|
42
|
+
it 'should create PMs by default' do
|
43
|
+
result = StringParser.split_on_motifs("1 2 3 4 \n 5 6 7 8 \n 9 10 11 12 \nName\n 9 10 11 12 \n 1 2 3 4 \n 5 6 7 8")
|
44
|
+
result.each{|pm| pm.class.should == PM}
|
45
|
+
end
|
46
|
+
it 'should create PM subclass when it\'s specified' do
|
47
|
+
result = StringParser.split_on_motifs("1 2 3 4 \n 5 6 7 8 \n 9 10 11 12 \nName\n 9 10 11 12 \n 1 2 3 4 \n 5 6 7 8", PWM)
|
48
|
+
result.each{|pm| pm.class.should == PWM}
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
6
52
|
good_cases = {
|
7
53
|
'Nx4 string' => {input: "1 2 3 4\n5 6 7 8",
|
8
54
|
matrix: [[1,2,3,4],[5,6,7,8]] },
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'bioinform/parsers/parser'
|
3
|
+
|
4
|
+
module Bioinform
|
5
|
+
describe TrivialParser do
|
6
|
+
context '#initialize' do
|
7
|
+
it 'should take the only input argument' do
|
8
|
+
TrivialParser.instance_method(:initialize).arity.should == 1
|
9
|
+
end
|
10
|
+
end
|
11
|
+
context '#parser!' do
|
12
|
+
it 'should return input of that was passed to initialize' do
|
13
|
+
TrivialParser.new('stub input').parse!.should == 'stub input'
|
14
|
+
end
|
15
|
+
end
|
16
|
+
it 'can be used to create PM with {matrix: ..., name: ...} form' do
|
17
|
+
pm = PM.new({matrix: [[1,2,3,4],[5,6,7,8]], name: 'Name'}, TrivialParser)
|
18
|
+
pm.matrix.should == [[1,2,3,4],[5,6,7,8]]
|
19
|
+
pm.name.should == 'Name'
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
data/spec/spec_helper.rb
CHANGED
@@ -4,17 +4,23 @@ $LOAD_PATH.unshift File.dirname(__FILE__)
|
|
4
4
|
require 'rspec'
|
5
5
|
|
6
6
|
def parser_specs(parser_klass, good_cases, bad_cases)
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
7
|
+
context '#parse!' do
|
8
|
+
good_cases.each do |case_description, input_and_result|
|
9
|
+
it "should be able to parse #{case_description}" do
|
10
|
+
result = parser_klass.new(input_and_result[:input]).parse
|
11
|
+
result[:matrix].should == input_and_result[:matrix]
|
12
|
+
if input_and_result.has_key?(:name)
|
13
|
+
result[:name].should == input_and_result[:name]
|
14
|
+
else
|
15
|
+
result[:name].should be_nil
|
16
|
+
end
|
17
|
+
end
|
12
18
|
end
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
19
|
+
|
20
|
+
bad_cases.each do |case_description, input|
|
21
|
+
it "should raise an exception on parsing #{case_description}" do
|
22
|
+
expect{ parser_klass.new(input[:input]).parse! }.to raise_error
|
23
|
+
end
|
18
24
|
end
|
19
25
|
end
|
20
26
|
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'bioinform/support/advanced_scan'
|
3
|
+
|
4
|
+
describe StringScanner do
|
5
|
+
context '#advanced_scan' do
|
6
|
+
before do
|
7
|
+
@scanner = StringScanner.new('abcde fghIJKLmnop')
|
8
|
+
end
|
9
|
+
it 'should return nil if text doesn\'t match. Pointer should not move' do
|
10
|
+
@scanner.advanced_scan(/\s\s\s/).should be_nil
|
11
|
+
@scanner.pos.should == 0
|
12
|
+
end
|
13
|
+
it 'should return MatchData if string Matches. Pointer should move' do
|
14
|
+
@scanner.advanced_scan(/\w\w\w/).should be_kind_of MatchData
|
15
|
+
@scanner.pos.should == 3
|
16
|
+
end
|
17
|
+
it 'should return have the same groups as regexp has' do
|
18
|
+
result = @scanner.advanced_scan(/(\w+)(\s+)([a-z]+)([A-Z]+)/)
|
19
|
+
result[0].should == 'abcde fghIJKL'
|
20
|
+
result[1].should == 'abcde'
|
21
|
+
result[2].should == ' '
|
22
|
+
result[3].should == 'fgh'
|
23
|
+
result[4].should == 'IJKL'
|
24
|
+
end
|
25
|
+
it 'should return have the same named groups as regexp has' do
|
26
|
+
result = @scanner.advanced_scan(/(\w+)(\s+)(?<word_downcase>[a-z]+)(?<word_upcase>[A-Z]+)/)
|
27
|
+
result[0].should == 'abcde fghIJKL'
|
28
|
+
result[:word_downcase].should == 'fgh'
|
29
|
+
result[:word_upcase].should == 'IJKL'
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
@@ -15,5 +15,11 @@ describe String do
|
|
15
15
|
it 'should preserve rows pagination' do
|
16
16
|
"abc def ghi\njk lmn".multiline_squish.should == "abc def ghi\njk lmn"
|
17
17
|
end
|
18
|
+
it 'should preserve empty lines in the middle of text' do
|
19
|
+
"abc def\n\nghi\n \t \njk lmn \n\n\n zzz".multiline_squish.should == "abc def\n\nghi\n\njk lmn\n\n\nzzz"
|
20
|
+
end
|
21
|
+
it 'should drop empty lines at begin and at end of string' do
|
22
|
+
"\n \t\n\nabc def\n\nghi\n \t \njk lmn \n\n\n zzz\n\n \t \n".multiline_squish.should == "abc def\n\nghi\n\njk lmn\n\n\nzzz"
|
23
|
+
end
|
18
24
|
end
|
19
25
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bioinform
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-09-01 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: activesupport
|
@@ -67,7 +67,9 @@ files:
|
|
67
67
|
- lib/bioinform/parsers/parser.rb
|
68
68
|
- lib/bioinform/parsers/string_fantom_parser.rb
|
69
69
|
- lib/bioinform/parsers/string_parser.rb
|
70
|
+
- lib/bioinform/parsers/trivial_parser.rb
|
70
71
|
- lib/bioinform/support.rb
|
72
|
+
- lib/bioinform/support/advanced_scan.rb
|
71
73
|
- lib/bioinform/support/array_product.rb
|
72
74
|
- lib/bioinform/support/array_zip.rb
|
73
75
|
- lib/bioinform/support/callable_symbol.rb
|
@@ -83,11 +85,14 @@ files:
|
|
83
85
|
- lib/bioinform/version.rb
|
84
86
|
- spec/data_models/pcm_spec.rb
|
85
87
|
- spec/data_models/pm_spec.rb
|
88
|
+
- spec/data_models/ppm_spec.rb
|
86
89
|
- spec/data_models/pwm_spec.rb
|
87
90
|
- spec/parsers/parser_spec.rb
|
88
91
|
- spec/parsers/string_fantom_parser_spec.rb
|
89
92
|
- spec/parsers/string_parser_spec.rb
|
93
|
+
- spec/parsers/trivial_parser_spec.rb
|
90
94
|
- spec/spec_helper.rb
|
95
|
+
- spec/support/advanced_scan_spec.rb
|
91
96
|
- spec/support/array_product_spec.rb
|
92
97
|
- spec/support/array_zip_spec.rb
|
93
98
|
- spec/support/callable_symbol_spec.rb
|
@@ -128,11 +133,14 @@ summary: Classes for work with different input formats of positional matrices an
|
|
128
133
|
test_files:
|
129
134
|
- spec/data_models/pcm_spec.rb
|
130
135
|
- spec/data_models/pm_spec.rb
|
136
|
+
- spec/data_models/ppm_spec.rb
|
131
137
|
- spec/data_models/pwm_spec.rb
|
132
138
|
- spec/parsers/parser_spec.rb
|
133
139
|
- spec/parsers/string_fantom_parser_spec.rb
|
134
140
|
- spec/parsers/string_parser_spec.rb
|
141
|
+
- spec/parsers/trivial_parser_spec.rb
|
135
142
|
- spec/spec_helper.rb
|
143
|
+
- spec/support/advanced_scan_spec.rb
|
136
144
|
- spec/support/array_product_spec.rb
|
137
145
|
- spec/support/array_zip_spec.rb
|
138
146
|
- spec/support/callable_symbol_spec.rb
|