bio-alignment 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +5 -4
- data/README.md +94 -9
- data/Rakefile +2 -1
- data/VERSION +1 -1
- data/doc/bio-alignment-design.md +75 -11
- data/features/bioruby-feature.rb +17 -0
- data/features/bioruby.feature +6 -1
- data/features/columns-feature.rb +2 -0
- data/features/edit/del_bridges-feature.rb +7 -3
- data/features/edit/del_bridges.feature +1 -2
- data/features/edit/del_non_informative_sequences-feature.rb +26 -0
- data/features/edit/del_non_informative_sequences.feature +19 -0
- data/features/edit/del_short_sequences-feature.rb +21 -0
- data/features/edit/del_short_sequences.feature +25 -0
- data/features/edit/gblocks-feature.rb +2 -2
- data/features/edit/mask_islands-feature.rb +17 -4
- data/features/edit/mask_islands.feature +28 -17
- data/features/edit/mask_serial_mutations-feature.rb +8 -6
- data/features/edit/mask_serial_mutations.feature +11 -11
- data/features/tree-feature.rb +66 -0
- data/features/tree.feature +45 -0
- data/lib/bio-alignment.rb +4 -1
- data/lib/bio-alignment/alignment.rb +58 -3
- data/lib/bio-alignment/codonsequence.rb +14 -2
- data/lib/bio-alignment/columns.rb +102 -0
- data/lib/bio-alignment/edit/del_bridges.rb +18 -1
- data/lib/bio-alignment/edit/del_non_informative_sequences.rb +27 -0
- data/lib/bio-alignment/edit/del_short_sequences.rb +28 -0
- data/lib/bio-alignment/edit/edit_columns.rb +22 -0
- data/lib/bio-alignment/edit/edit_rows.rb +49 -0
- data/lib/bio-alignment/edit/mask_islands.rb +115 -0
- data/lib/bio-alignment/edit/mask_serial_mutations.rb +44 -0
- data/lib/bio-alignment/elements.rb +86 -0
- data/lib/bio-alignment/rows.rb +52 -0
- data/lib/bio-alignment/sequence.rb +20 -14
- data/lib/bio-alignment/state.rb +64 -8
- data/lib/bio-alignment/tree.rb +77 -0
- data/spec/bio-alignment_spec.rb +57 -1
- data/spec/spec_helper.rb +3 -3
- metadata +47 -22
- data/lib/bio-alignment/column.rb +0 -47
@@ -0,0 +1,102 @@
|
|
1
|
+
require 'bio-alignment/state'
|
2
|
+
|
3
|
+
module Bio
|
4
|
+
|
5
|
+
module BioAlignment
|
6
|
+
|
7
|
+
# The Columns module provides accessors for the column list of an
# alignment, returning Column objects. The including object has to
# respond to +rows+.
module Columns
  # Return the list of Column objects, one for every column index. The
  # list is built on first use and cached in @columns; the contents of
  # the columns are accessed lazily through Column.
  def columns
    @columns ||= (0...num_columns).map { |col| Column.new(self, col) }
  end

  # Return the number of columns, i.e. the length of the first row. An
  # alignment without rows has zero columns.
  def num_columns
    first_row = rows.first
    first_row ? first_row.length : 0
  end

  # Return a new Alignment containing only the columns for which the
  # block returns true. The block receives the Column object. The
  # originating sequences should have methods 'empty_copy' and '<<'
  def columns_where(&block)
    cols = columns # look the list up once instead of once per element
    seqs = rows.map do |seq|
      new_seq = seq.empty_copy
      seq.each_with_index do |e, i|
        new_seq << e if block.call(cols[i])
      end
      new_seq
    end
    Alignment.new(seqs)
  end

  # Return a string with the state of every column (state.to_s), using
  # '?' for columns that have no state attached
  def columns_to_s
    columns.map { |c| c.state ? c.state.to_s : '?' }.join
  end

  # Replace the cached column list with clones of its members. When the
  # column list has not been built yet there is nothing to clone, and
  # the list will simply be created on the next call to +columns+.
  # NOTE(review): Column#clone is a shallow copy, so the cloned columns
  # presumably keep pointing at the alignment they were created for -
  # confirm this is what the caller (Alignment#clone?) expects.
  def clone_columns!
    return unless @columns
    @columns = @columns.map(&:clone)
  end
end
|
52
|
+
|
53
|
+
# A Column is a view on a single column of an alignment: it stores the
# alignment and the column index and reads the elements on demand. State
# objects can be attached to a column through the State mixin.
class Column
  include State
  include Enumerable

  def initialize(aln, col)
    @aln = aln
    @col = col
  end

  # Return the element of this column in row +index+
  def [](index)
    @aln[index][@col]
  end

  # Walk the rows of the alignment, yielding this column's element of
  # every row
  def each
    @aln.each { |seq| yield seq[@col] }
  end

  # Number of elements in the column, i.e. the number of rows in the
  # alignment (cached after the first call)
  def length
    @length ||= @aln.rows.size
  end

  # Count the elements for which the block returns true. Elements that
  # are plain Strings are wrapped in an Element before they are handed
  # to the block, so the block can always query them.
  def count(&block)
    inject(0) do |total, e|
      candidate = e.kind_of?(String) ? Element.new(e) : e
      block.call(candidate) ? total + 1 : total
    end
  end

  # Concatenate the string form of all elements in the column
  def to_s
    map(&:to_s).join('')
  end
end
|
98
|
+
|
99
|
+
end
|
100
|
+
|
101
|
+
end
|
102
|
+
|
@@ -1,10 +1,27 @@
|
|
1
|
+
require 'bio-alignment/edit/edit_columns'
|
1
2
|
|
2
3
|
module Bio
|
3
4
|
module BioAlignment
|
4
5
|
|
5
6
|
# Mark and remove 'bridges': columns that consist mostly of gaps or
# undefined elements.
module DelBridges
  include MarkColumns

  # Return a new alignment with columns marked for deletion. A column is
  # marked when the fraction of its elements that is a gap or undefined
  # is larger than 1 - percentage/100, i.e. with the default of 30 a
  # column with more than 70% gaps/undefined elements is marked. The
  # alignment returned is the copy made by mark_columns.
  def mark_bridges(percentage = 30)
    mark_columns do |state, column|
      num = column.count { |e| e.gap? || e.undefined? }
      state.delete! if (num.to_f / column.length) > 1.0 - percentage / 100.0
      state
    end
  end

  # Return an alignment with the bridges removed. The +percentage+
  # threshold is handed on to mark_bridges.
  def del_bridges(percentage = 30)
    mark_bridges(percentage).columns_where { |col| !col.state.deleted? }
  end
end
|
9
26
|
end
|
10
27
|
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'bio-alignment/edit/edit_rows'
|
2
|
+
|
3
|
+
module Bio
|
4
|
+
module BioAlignment
|
5
|
+
|
6
|
+
# Mark and remove sequences (rows) that carry little information because
# they consist mostly of gaps and undefined elements.
module DelNonInformativeSequences
  include MarkRows

  # Return a new alignment with rows marked for deletion. A row is marked
  # when the fraction of its elements that is a gap or undefined is
  # larger than 1 - percentage/100, i.e. with the default of 30 a row
  # with more than 70% gaps/undefined elements is marked. The alignment
  # returned is the copy made by mark_rows.
  def mark_non_informative_sequences(percentage = 30)
    mark_rows do |state, row|
      num = row.count { |e| e.gap? || e.undefined? }
      state.delete! if (num.to_f / row.length) > 1.0 - percentage / 100.0
      state
    end
  end

  # Return an alignment with the non-informative sequences removed. The
  # +percentage+ threshold is handed on to
  # mark_non_informative_sequences.
  def del_non_informative_sequences(percentage = 30)
    mark_non_informative_sequences(percentage).rows_where { |row| !row.state.deleted? }
  end
end
|
26
|
+
end
|
27
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'bio-alignment/edit/edit_rows'
|
2
|
+
|
3
|
+
module Bio
|
4
|
+
module BioAlignment
|
5
|
+
|
6
|
+
# Mark and remove short sequences: rows that consist mostly of gaps.
module DelShortSequences
  include MarkRows

  # Return a new alignment with rows marked for deletion. A row is marked
  # when the fraction of its elements that is a gap is larger than
  # 1 - percentage/100, i.e. with the default of 30 a row with more than
  # 70% gaps is marked. Undefined elements are not counted here. The
  # alignment returned is the copy made by mark_rows.
  def mark_short_sequences(percentage = 30)
    mark_rows do |state, row|
      num = row.count { |e| e.gap? }
      state.delete! if (num.to_f / row.length) > 1.0 - percentage / 100.0
      state
    end
  end

  # Return an alignment with the short sequences removed. The
  # +percentage+ threshold is handed on to mark_short_sequences.
  def del_short_sequences(percentage = 30)
    mark_short_sequences(percentage).rows_where { |row| !row.state.deleted? }
  end
end
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Bio
|
2
|
+
module BioAlignment
|
3
|
+
|
4
|
+
# Function for marking columns: the block decides on the new state of
# every column, and a cloned alignment is returned.
module MarkColumns
  # Clone the receiver and walk the columns of the clone. For every
  # column the block is called with a state object and the column: the
  # state is a clone of the column's current state, or a new ColumnState
  # when the column has none. Whatever the block returns becomes the
  # state of the column. Returns the cloned alignment.
  def mark_columns(&block)
    copy = clone
    copy.columns.each do |column|
      fresh_state = column.state ? column.state.clone : ColumnState.new
      column.state = block.call(fresh_state, column)
    end
    copy
  end
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module Bio
|
2
|
+
module BioAlignment
|
3
|
+
|
4
|
+
# Functions for marking rows (sequences): a block returns the new state
# of a row (or the new row itself), and a freshly cloned alignment is
# returned.
module MarkRows
  # Mark each sequence: the block is called with the state of the row and
  # the row itself, and its return value is stored as the new row state.
  # Works on, and returns, a copy made by markrows_clone.
  def mark_rows(&block)
    copy = markrows_clone
    copy.rows.each { |row| row.state = block.call(row.state, row) }
    copy
  end

  # Allow the marking of elements in a copied alignment, making sure each
  # element is a proper Element object that can contain state: the block
  # is called with row.to_elements and the row number, and the sequence
  # it returns replaces the row in the copy. A Sequence alignment will be
  # turned into an Elements alignment.
  def mark_row_elements(&block)
    copy = markrows_clone
    copy.rows.each_with_index do |row, rownum|
      copy.rows[rownum] = block.call(row.to_elements, rownum)
    end
    copy
  end

  protected

  # Clone the alignment and give every row of the clone its own state
  # object: a clone of the existing state, or a new RowState when the
  # row has no state yet
  def markrows_clone
    copy = clone
    copy.rows.each do |row|
      row.state = row.state ? row.state.clone : RowState.new
    end
    copy
  end
end
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
@@ -0,0 +1,115 @@
|
|
1
|
+
require 'bio-alignment/edit/edit_rows'
|
2
|
+
|
3
|
+
module Bio
|
4
|
+
module BioAlignment
|
5
|
+
|
6
|
+
# Mask 'islands': short stretches of elements with low consensus that are
# separated from the rest of the sequence by gaps.
module MaskIslands
  include MarkRows

  # Element state that, on top of what ElementMaskedState provides,
  # records whether the element is unique in its column. The string form
  # is that of the parent state followed by 'U' (unique) or a space.
  class IslandElementState < ElementMaskedState
    attr_accessor :unique

    def to_s
      super + (@unique ? 'U' : ' ')
    end
  end

  # Mask all 'islands' in a sequence with low consensus. Returns the
  # copied alignment produced by mark_row_elements, in which every
  # element carries an IslandElementState.
  #
  # A row is split into islands on runs of at least +min_gap_size+
  # (default 3) gaps; shorter runs of gaps (micro deletions of 1 or 2
  # elements with the default) do not end an island. The island
  # consensus is calculated by column: an element is 'unique' when
  # exactly one non-gap element in its column of the original alignment
  # equals it. An island of two or more elements is masked when fewer
  # than 50% of its elements show consensus (i.e. are not unique),
  # otherwise it is retained.
  #
  # NOTE(review): earlier documentation also mentioned a
  # 'max_island_size' (default 30) above which a stretch is no longer
  # treated as an island; no such limit is applied by this code.
  def mark_islands(min_gap_size = 3)
    mark_row_elements do |row, _rownum|
      # first set state and find unique elements (i.e. no consensus)
      row.each_with_index do |e, colnum|
        e.state = IslandElementState.new
        e.state.unique = (columns[colnum].count { |e2| !e2.gap? && e2 == e } == 1)
      end
      # group elements into islands (split on gap runs) and mask
      gap_run = 0 # number of consecutive gaps seen so far
      island = []
      in_island = true # the start of the row counts as being in an island
      row.each do |e|
        if e.gap?
          gap_run += 1
          if in_island && gap_run >= min_gap_size
            # the gap is wide enough to close the current island
            in_island = false
            mark_island(island)
            island = []
          end
        else
          island << e
          in_island = true
          gap_run = 0
        end
      end
      # the last island runs up to the end of the row
      mark_island(island) if in_island
      row # return changed sequence
    end
  end

  private

  # Mask every element of the island when less than half of its elements
  # show consensus. Islands of fewer than two elements are left alone.
  def mark_island(island)
    return if island.length < 2
    unique = island.count { |e| e.state.unique }
    consensus = 1.0 - unique.to_f / island.length
    island.each { |e| e.state.mask! } if consensus < 0.5
  end

  # Debugging aid: print the elements of an island with their state
  def print_island(island)
    p island.map { |e2| e2.to_s + ':' + e2.state.to_s }.join(",")
  end
end
|
114
|
+
end
|
115
|
+
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'bio-alignment/edit/edit_rows'
|
2
|
+
|
3
|
+
module Bio
|
4
|
+
module BioAlignment
|
5
|
+
|
6
|
+
# Mask serial mutations: runs of consecutive elements that are each
# unique in their column.
module MaskSerialMutations
  include MarkRows

  # Edit a copied alignment and mark elements if they are part of a
  # continuous run of unique mutations in the alignment. Returns the
  # copy produced by mark_row_elements, in which every element carries
  # an ElementMaskedState.
  #
  # An element is first masked when exactly one non-gap element in its
  # column of the original alignment equals it. Gaps are skipped when
  # runs are formed. Runs of masked elements that are not longer than
  # +min_serial+ (default 5) are unmasked again, including a run that
  # reaches the end of the row.
  #
  # NOTE(review): the comparison is 'length <= min_serial', so a run of
  # exactly min_serial elements is unmasked, while the description
  # speaks of 'at least 5 mutations in a row' - confirm which is meant.
  def mark_serial_mutations(min_serial = 5)
    # unmask a run of masked elements when it is too short
    unmask_if_short = lambda do |group|
      group.each { |e| e.state.unmask! } if group.length <= min_serial
    end

    mark_row_elements do |row, _rownum|
      # if an element is unique, mask it
      row.each_with_index do |e, colnum|
        e.state = ElementMaskedState.new
        e.state.mask! if columns[colnum].count { |e2| !e2.gap? && e2 == e } == 1
      end
      # now make sure the masked runs are long enough, otherwise start
      # unmasking. First group all elements
      group = []
      row.each do |e|
        next if e.gap?
        if e.state.masked?
          group << e
        else
          unmask_if_short.call(group)
          group = []
        end
      end
      # a run that reaches the end of the row has not been checked yet
      unmask_if_short.call(group)
      row # return changed sequence
    end
  end
end
|
43
|
+
end
|
44
|
+
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
|
2
|
+
module Bio
|
3
|
+
module BioAlignment
|
4
|
+
|
5
|
+
# Simple element that can be queried. It wraps a single value (e.g. a
# one character String) and can carry state through the State mixin.
class Element
  GAP = '-'.freeze
  UNDEFINED = 'X'.freeze
  include State

  def initialize(c)
    @c = c
  end

  # True when the wrapped value equals GAP
  def gap?
    @c == GAP
  end

  # True when the wrapped value equals UNDEFINED
  def undefined?
    @c == UNDEFINED
  end

  # Return the wrapped value as it was passed in
  def to_s
    @c
  end

  # Elements are equal when their to_s values are equal, which also
  # allows comparing an Element with a plain String
  def ==(other)
    to_s == other.to_s
  end
end
|
27
|
+
|
28
|
+
# Elements is a container for Element sequences. It keeps an id and an
# Array of Element objects, and can carry state through the State mixin.
class Elements
  include Enumerable
  include State

  attr_reader :id, :seq

  # Build the container from +seq+, which can be another Elements (its
  # elements are copied), a String (one Element per character) or any
  # object responding to +each+ (one Element per item).
  def initialize(id, seq)
    @id = id
    @seq = []
    if seq.kind_of?(Elements)
      # take the Array of copied elements, so @seq is always an Array
      # and never a nested Elements container
      @seq = seq.clone.seq
    elsif seq.kind_of?(String)
      seq.each_char { |c| @seq << Element.new(c) }
    else
      seq.each { |s| @seq << Element.new(s) }
    end
  end

  # Return the element at position +index+
  def [](index)
    @seq[index]
  end

  def length
    @seq.length
  end

  # Yield every element in order
  def each
    @seq.each { |e| yield e }
  end

  # Concatenate the string form of all elements
  def to_s
    @seq.map { |e| e.to_s }.join("")
  end

  # Append an element to the sequence
  def <<(element)
    @seq << element
  end

  # Return a new Elements with the same id and no elements
  def empty_copy
    Elements.new(@id, "")
  end

  # Return a new Elements with the same id holding a dup of every
  # element
  def clone
    copy = Elements.new(@id, "")
    @seq.each { |e| copy << e.dup }
    copy
  end
end
|
84
|
+
end
|
85
|
+
|
86
|
+
end
|