bio-alignment 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +5 -4
- data/README.md +94 -9
- data/Rakefile +2 -1
- data/VERSION +1 -1
- data/doc/bio-alignment-design.md +75 -11
- data/features/bioruby-feature.rb +17 -0
- data/features/bioruby.feature +6 -1
- data/features/columns-feature.rb +2 -0
- data/features/edit/del_bridges-feature.rb +7 -3
- data/features/edit/del_bridges.feature +1 -2
- data/features/edit/del_non_informative_sequences-feature.rb +26 -0
- data/features/edit/del_non_informative_sequences.feature +19 -0
- data/features/edit/del_short_sequences-feature.rb +21 -0
- data/features/edit/del_short_sequences.feature +25 -0
- data/features/edit/gblocks-feature.rb +2 -2
- data/features/edit/mask_islands-feature.rb +17 -4
- data/features/edit/mask_islands.feature +28 -17
- data/features/edit/mask_serial_mutations-feature.rb +8 -6
- data/features/edit/mask_serial_mutations.feature +11 -11
- data/features/tree-feature.rb +66 -0
- data/features/tree.feature +45 -0
- data/lib/bio-alignment.rb +4 -1
- data/lib/bio-alignment/alignment.rb +58 -3
- data/lib/bio-alignment/codonsequence.rb +14 -2
- data/lib/bio-alignment/columns.rb +102 -0
- data/lib/bio-alignment/edit/del_bridges.rb +18 -1
- data/lib/bio-alignment/edit/del_non_informative_sequences.rb +27 -0
- data/lib/bio-alignment/edit/del_short_sequences.rb +28 -0
- data/lib/bio-alignment/edit/edit_columns.rb +22 -0
- data/lib/bio-alignment/edit/edit_rows.rb +49 -0
- data/lib/bio-alignment/edit/mask_islands.rb +115 -0
- data/lib/bio-alignment/edit/mask_serial_mutations.rb +44 -0
- data/lib/bio-alignment/elements.rb +86 -0
- data/lib/bio-alignment/rows.rb +52 -0
- data/lib/bio-alignment/sequence.rb +20 -14
- data/lib/bio-alignment/state.rb +64 -8
- data/lib/bio-alignment/tree.rb +77 -0
- data/spec/bio-alignment_spec.rb +57 -1
- data/spec/spec_helper.rb +3 -3
- metadata +47 -22
- data/lib/bio-alignment/column.rb +0 -47
@@ -0,0 +1,102 @@
|
|
1
|
+
require 'bio-alignment/state'
|
2
|
+
|
3
|
+
module Bio
|
4
|
+
|
5
|
+
module BioAlignment
|
6
|
+
|
7
|
+
# The Columns module provides accessors for the column list
|
8
|
+
# returning Column objects
|
9
|
+
module Columns

  # Return the list of Column objects for this alignment. The list is
  # built on first access and memoized in @columns; each Column fetches
  # its elements from the alignment on demand.
  def columns
    @columns ||= (0...num_columns).map { |col| Column.new(self, col) }
  end

  # Return the number of columns, taken from the length of the first row.
  # An alignment without rows has zero columns.
  def num_columns
    first_row = rows.first
    first_row ? first_row.length : 0
  end

  # Return a new Alignment that only holds the columns for which the
  # block returns a true value. The block receives the Column object.
  # The originating sequences should have methods 'empty_copy' and '<<'.
  def columns_where(&block)
    seqs = rows.map do |seq|
      new_seq = seq.empty_copy
      seq.each_with_index do |e, i|
        new_seq << e if block.call(columns[i])
      end
      new_seq
    end
    Alignment.new(seqs)
  end

  # Return one string with the state of every column; a column without
  # state is shown as '?'
  def columns_to_s
    columns.map { |c| c.state ? c.state.to_s : '?' }.join
  end

  # Replace the memoized column list with clones of its members. When the
  # columns have not been built yet there is nothing to clone, so the list
  # is left to be created lazily by #columns. Returns the previous column
  # list, or nil when there was none.
  def clone_columns!
    return unless @columns
    old_columns = @columns
    @columns = old_columns.map { |old_column| old_column.clone }
    old_columns
  end
end
|
52
|
+
|
53
|
+
# Support the notion of columns in an alignment. A column
|
54
|
+
# can have state by attaching state objects
|
55
|
+
class Column
  include State
  include Enumerable

  # +aln+ is the alignment this column belongs to, +col+ the zero-based
  # column index. No elements are copied; they are read from the
  # alignment whenever they are requested.
  def initialize(aln, col)
    @aln = aln
    @col = col
  end

  # Return the element of this column in row +index+
  def [](index)
    @aln[index][@col]
  end

  # iterator fetches a column on demand, yielding column elements.
  # Without a block an Enumerator is returned.
  def each
    return to_enum(:each) unless block_given?
    @aln.each do |seq|
      yield seq[@col]
    end
  end

  # Number of elements in the column, i.e. the number of rows in the
  # alignment (memoized on first call)
  def length
    @length ||= @aln.rows.size
  end

  # Count the elements for which the block returns a true value. String
  # elements are wrapped in an Element first, so the block can always
  # query the element (e.g. with gap?). Called without a block, or with an
  # argument, the standard Enumerable#count behaviour applies.
  def count(*args, &block)
    return super if block.nil? || !args.empty?
    counter = 0
    each do |e|
      element = e.kind_of?(String) ? Element.new(e) : e
      counter += 1 if block.call(element)
    end
    counter
  end

  # Return the column elements concatenated into one string
  def to_s
    map { |e| e.to_s }.join('')
  end

end
|
98
|
+
|
99
|
+
end
|
100
|
+
|
101
|
+
end
|
102
|
+
|
@@ -1,10 +1,27 @@
|
|
1
|
+
require 'bio-alignment/edit/edit_columns'
|
1
2
|
|
2
3
|
module Bio
|
3
4
|
module BioAlignment
|
4
5
|
|
5
6
|
module DelBridges
  include MarkColumns

  # Return a new alignment with columns marked for deletion, i.e. mark
  # columns that mostly contain gaps or undefined elements. A column is
  # marked when the fraction of such elements is larger than
  # 1.0 - percentage/100.0 (with the default of 30: more than 70%). The
  # alignment returned is a cloned copy
  def mark_bridges(percentage = 30)
    mark_columns { |state, column|
      num = column.count { |e| e.gap? || e.undefined? }
      if (num.to_f / column.length) > 1.0 - percentage / 100.0
        state.delete!
      end
      state
    }
  end

  # Return an alignment with the bridges removed. +percentage+ is handed
  # on to mark_bridges, so a non-default threshold is honoured
  def del_bridges(percentage = 30)
    mark_bridges(percentage).columns_where { |col| !col.state.deleted? }
  end
end
|
10
27
|
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'bio-alignment/edit/edit_rows'
|
2
|
+
|
3
|
+
module Bio
|
4
|
+
module BioAlignment
|
5
|
+
|
6
|
+
module DelNonInformativeSequences
  include MarkRows

  # Return a new alignment with rows marked for deletion, i.e. mark rows
  # that mostly contain undefined elements and gaps. A row is marked when
  # the fraction of such elements is larger than 1.0 - percentage/100.0
  # (with the default of 30: more than 70%). The alignment returned is a
  # cloned copy
  def mark_non_informative_sequences(percentage = 30)
    mark_rows { |state, row|
      num = row.count { |e| e.gap? || e.undefined? }
      if (num.to_f / row.length) > 1.0 - percentage / 100.0
        state.delete!
      end
      state
    }
  end

  # Return an alignment with the non-informative sequences removed.
  # +percentage+ is handed on to mark_non_informative_sequences, so a
  # non-default threshold is honoured
  def del_non_informative_sequences(percentage = 30)
    mark_non_informative_sequences(percentage).rows_where { |row| !row.state.deleted? }
  end
end
|
26
|
+
end
|
27
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'bio-alignment/edit/edit_rows'
|
2
|
+
|
3
|
+
module Bio
|
4
|
+
module BioAlignment
|
5
|
+
|
6
|
+
module DelShortSequences
  include MarkRows

  # Return a new alignment with rows marked for deletion, i.e. mark
  # rows that mostly contain gaps. A row is marked when the fraction of
  # gaps is larger than 1.0 - percentage/100.0 (with the default of 30:
  # more than 70%). The alignment returned is a cloned copy
  def mark_short_sequences(percentage = 30)
    mark_rows { |state, row|
      num = row.count { |e| e.gap? }
      if (num.to_f / row.length) > 1.0 - percentage / 100.0
        state.delete!
      end
      state
    }
  end

  # Return an alignment with the short sequences removed. +percentage+
  # is handed on to mark_short_sequences, so a non-default threshold is
  # honoured
  def del_short_sequences(percentage = 30)
    mark_short_sequences(percentage).rows_where { |row| !row.state.deleted? }
  end
end
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Bio
|
2
|
+
module BioAlignment
|
3
|
+
|
4
|
+
module MarkColumns
  # Work on a clone of this alignment: every column of the clone gets the
  # state returned by the block. The block is called with a working state
  # (a clone of the column's current state, or a fresh ColumnState when
  # the column has none) and the column itself. Returns the clone.
  def mark_columns(&block)
    marked = self.clone
    marked.columns.each do |col|
      working_state = col.state ? col.state.clone : ColumnState.new
      col.state = block.call(working_state, col)
    end
    marked
  end
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module Bio
|
2
|
+
module BioAlignment
|
3
|
+
|
4
|
+
# Helpers for marking rows (sequences): the supplied block returns the new
# state for each row, and the result is a newly cloned alignment
|
6
|
+
module MarkRows

  # Give every row of a copied alignment the state returned by the block.
  # The block is called with the row's state and the row; the copy is
  # returned.
  def mark_rows(&block)
    copy = markrows_clone
    copy.rows.each { |seq| seq.state = block.call(seq.state, seq) }
    copy
  end

  # Mark elements in a copied alignment. Each row is handed to the block
  # as the result of its to_elements call, together with the row number,
  # and is replaced in the copy by whatever the block returns. This way a
  # Sequence alignment is turned into an Elements alignment, in which
  # every element can carry state.
  def mark_row_elements(&block)
    copy = markrows_clone
    copy.rows.each_with_index do |seq, idx|
      copy.rows[idx] = block.call(seq.to_elements, idx)
    end
    copy
  end

  protected

  # Clone the alignment and make sure every row of the clone has a state
  # object of its own: a clone of the existing state, or a new RowState
  def markrows_clone
    copy = self.clone
    copy.rows.each do |seq|
      seq.state = seq.state ? seq.state.clone : RowState.new
    end
    copy
  end

end
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
@@ -0,0 +1,115 @@
|
|
1
|
+
require 'bio-alignment/edit/edit_rows'
|
2
|
+
|
3
|
+
module Bio
|
4
|
+
module BioAlignment
|
5
|
+
|
6
|
+
module MaskIslands
  include MarkRows

  # Element state used while looking for islands: a masked state that
  # also records whether the element is unique in its column
  class IslandElementState < ElementMaskedState
    attr_accessor :unique

    # String form of the parent state followed by 'U' for a unique
    # element, or a space otherwise
    def to_s
      super + (@unique ? 'U' : ' ')
    end
  end

  # Mask all 'islands' in a sequence that show low consensus. An island
  # is a stretch of elements that ends at a run of at least
  # +min_gap_size+ gaps (default 3) or at the end of the row; shorter gap
  # runs (micro deletions of 1 or 2 elements with the default) do not
  # split an island. Low consensus stretches may be loops - it is up to
  # the alignment procedure to get that right.
  #
  # The island consensus is calculated by column: an element counts as
  # unique when it is the only non-gap element with that value in its
  # column. Consensus is 1.0 - unique/island length; when it is below 0.5
  # every element of the island is masked, otherwise the island is
  # retained. Islands of fewer than 2 elements are never masked.
  #
  # NOTE(review): no upper limit on the island size is enforced here,
  # although an earlier version of this comment mentioned a
  # 'max_island_size' of 30 - confirm whether that limit is still wanted.
  #
  # Returns a copied alignment of Elements rows (see mark_row_elements).
  def mark_islands(min_gap_size = 3)
    mark_row_elements { |row, _rownum|
      # first set state and find unique elements (i.e. no consensus)
      row.each_with_index do |e, colnum|
        e.state = IslandElementState.new
        column = columns[colnum]
        e.state.unique = (column.count { |e2| !e2.gap? && e2 == e } == 1)
      end
      # group elements into islands (split on gap runs) and mask
      gap_run = 0
      island = []
      in_island = true # the start of the row counts as inside an island
      row.each do |e|
        if e.gap?
          gap_run += 1
          if in_island && gap_run >= min_gap_size
            # the gap is wide enough to close the current island
            in_island = false
            mark_island(island)
            island = []
          end
        else
          island << e
          in_island = true
          gap_run = 0
        end
      end
      # an island that runs up to the end of the row is still open
      mark_island(island) if in_island
      row # return changed sequence
    }
  end

  private

  # Mask every element of +island+ when less than half of its elements
  # show consensus (i.e. more than half are unique in their column)
  def mark_island(island)
    return if island.length < 2
    unique = island.count { |e| e.state.unique == true }
    consensus = 1.0 - unique.to_f / island.length
    if consensus < 0.5
      island.each do |e|
        e.state.mask!
      end
    end
  end

  # Debugging aid: print the elements of an island with their state
  def print_island(island)
    p island.map { |e2| e2.to_s + ':' + e2.state.to_s }.join(",")
  end
end
|
114
|
+
end
|
115
|
+
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'bio-alignment/edit/edit_rows'
|
2
|
+
|
3
|
+
module Bio
|
4
|
+
module BioAlignment
|
5
|
+
|
6
|
+
module MaskSerialMutations
  include MarkRows

  # Edit a copied alignment and mask elements that are part of a
  # continuous run of unique mutations. An element is unique when it is
  # the only non-gap element with that value in its column. Gaps are
  # skipped when forming runs. A run is only left masked when it is longer
  # than +min_serial+ elements (default 5); shorter runs, including a run
  # that reaches the end of the row, are unmasked again.
  #
  # NOTE(review): the comparison unmasks runs of exactly +min_serial+
  # elements too, so masking needs min_serial+1 in a row - confirm that
  # this is the intended reading of 'at least 5 mutations in a row'.
  #
  # Returns a copied alignment of Elements rows (see mark_row_elements).
  def mark_serial_mutations(min_serial = 5)
    mark_row_elements { |row, _rownum|
      # if an element is unique, mask it
      row.each_with_index do |e, colnum|
        e.state = ElementMaskedState.new
        column = columns[colnum]
        e.state.mask! if column.count { |e2| !e2.gap? && e2 == e } == 1
      end
      # now make sure there are enough in a row, otherwise start
      # unmasking. First group all elements
      group = []
      row.each do |e|
        next if e.gap?
        if e.state.masked?
          group << e
        else
          unmask_short_group(group, min_serial)
          group = []
        end
      end
      # the last run is not followed by an unmasked element, so it has
      # to be checked separately
      unmask_short_group(group, min_serial)
      row # return changed sequence
    }
  end

  private

  # Unmask all elements of +group+ when the group is too small
  def unmask_short_group(group, min_serial)
    return if group.length > min_serial
    group.each do |e|
      e.state.unmask!
    end
  end

end
|
43
|
+
end
|
44
|
+
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
|
2
|
+
module Bio
|
3
|
+
module BioAlignment
|
4
|
+
|
5
|
+
# Simple element that can be queried
|
6
|
+
class Element
  GAP = '-'
  UNDEFINED = 'X'
  include State

  # +c+ is the wrapped value, e.g. a single character
  def initialize(c)
    @c = c
  end

  # True when the wrapped value equals GAP
  def gap?
    @c == GAP
  end

  # True when the wrapped value equals UNDEFINED
  def undefined?
    @c == UNDEFINED
  end

  # Return the wrapped value as it was passed in
  def to_s
    @c
  end

  # Elements compare by their to_s value, so an Element can also be
  # compared with a plain String
  def ==(other)
    to_s == other.to_s
  end
end
|
27
|
+
|
28
|
+
# Elements is a container for Element sequences.
|
29
|
+
#
|
30
|
+
class Elements
  include Enumerable
  include State

  attr_reader :id, :seq

  # Build a sequence of Element objects named +id+. +seq+ can be another
  # Elements (its elements are duplicated), a String (one Element per
  # character), or any other object responding to each (one Element per
  # item). The elements are always kept in a plain Array.
  def initialize(id, seq)
    @id = id
    @seq = []
    if seq.kind_of?(Elements)
      seq.each do |e|
        @seq << e.dup
      end
    elsif seq.kind_of?(String)
      seq.each_char do |c|
        @seq << Element.new(c)
      end
    else
      seq.each do |s|
        @seq << Element.new(s)
      end
    end
  end

  # Return the element at position +index+
  def [](index)
    @seq[index]
  end

  # Number of elements in the sequence
  def length
    @seq.length
  end

  # Yield every element in order. Without a block an Enumerator is
  # returned.
  def each
    return to_enum(:each) unless block_given?
    @seq.each { |e| yield e }
  end

  # Return the elements concatenated into one string
  def to_s
    @seq.map { |e| e.to_s }.join("")
  end

  # Append +element+ to the sequence
  def <<(element)
    @seq << element
  end

  # Return a new Elements with the same id and no elements
  def empty_copy
    Elements.new(@id, "")
  end

  # Return a new Elements with the same id that holds a dup of every
  # element
  def clone
    copy = Elements.new(@id, "")
    @seq.each do |e|
      copy << e.dup
    end
    copy
  end

end
|
84
|
+
end
|
85
|
+
|
86
|
+
end
|