bio-alignment 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +5 -4
- data/README.md +94 -9
- data/Rakefile +2 -1
- data/VERSION +1 -1
- data/doc/bio-alignment-design.md +75 -11
- data/features/bioruby-feature.rb +17 -0
- data/features/bioruby.feature +6 -1
- data/features/columns-feature.rb +2 -0
- data/features/edit/del_bridges-feature.rb +7 -3
- data/features/edit/del_bridges.feature +1 -2
- data/features/edit/del_non_informative_sequences-feature.rb +26 -0
- data/features/edit/del_non_informative_sequences.feature +19 -0
- data/features/edit/del_short_sequences-feature.rb +21 -0
- data/features/edit/del_short_sequences.feature +25 -0
- data/features/edit/gblocks-feature.rb +2 -2
- data/features/edit/mask_islands-feature.rb +17 -4
- data/features/edit/mask_islands.feature +28 -17
- data/features/edit/mask_serial_mutations-feature.rb +8 -6
- data/features/edit/mask_serial_mutations.feature +11 -11
- data/features/tree-feature.rb +66 -0
- data/features/tree.feature +45 -0
- data/lib/bio-alignment.rb +4 -1
- data/lib/bio-alignment/alignment.rb +58 -3
- data/lib/bio-alignment/codonsequence.rb +14 -2
- data/lib/bio-alignment/columns.rb +102 -0
- data/lib/bio-alignment/edit/del_bridges.rb +18 -1
- data/lib/bio-alignment/edit/del_non_informative_sequences.rb +27 -0
- data/lib/bio-alignment/edit/del_short_sequences.rb +28 -0
- data/lib/bio-alignment/edit/edit_columns.rb +22 -0
- data/lib/bio-alignment/edit/edit_rows.rb +49 -0
- data/lib/bio-alignment/edit/mask_islands.rb +115 -0
- data/lib/bio-alignment/edit/mask_serial_mutations.rb +44 -0
- data/lib/bio-alignment/elements.rb +86 -0
- data/lib/bio-alignment/rows.rb +52 -0
- data/lib/bio-alignment/sequence.rb +20 -14
- data/lib/bio-alignment/state.rb +64 -8
- data/lib/bio-alignment/tree.rb +77 -0
- data/spec/bio-alignment_spec.rb +57 -1
- data/spec/spec_helper.rb +3 -3
- metadata +47 -22
- data/lib/bio-alignment/column.rb +0 -47
@@ -0,0 +1,25 @@
|
|
1
|
+
Feature: Alignment editing; remove short sequences
|
2
|
+
Remove rows that are too short (short sequences)
|
3
|
+
|
4
|
+
The dropped sequences are tracked by the row state objects
|
5
|
+
|
6
|
+
Scenario: Apply short sequence rule to an amino acid alignment
|
7
|
+
Given I have a bridged alignment
|
8
|
+
"""
|
9
|
+
SSIISNSFSRPTIIFSGCSTACSGK--SEQVCGFR---LSDV
|
10
|
+
SSIISNSFSRPTIIFSGCSTACSGKLTSEQVCGFR---LSDV
|
11
|
+
----------PTIIFSGCSKACSGK-----VCGIFHAVRSFM
|
12
|
+
----------PTIIFSGCSKACSGK--SELVCGFRSFMLSAV
|
13
|
+
-------------IFHAVR-TC-HP-----------------
|
14
|
+
"""
|
15
|
+
When I apply the short sequence rule
|
16
|
+
Then it should have removed one row
|
17
|
+
"""
|
18
|
+
SSIISNSFSRPTIIFSGCSTACSGK--SEQVCGFR---LSDV
|
19
|
+
SSIISNSFSRPTIIFSGCSTACSGKLTSEQVCGFR---LSDV
|
20
|
+
----------PTIIFSGCSKACSGK-----VCGIFHAVRSFM
|
21
|
+
----------PTIIFSGCSKACSGK--SELVCGFRSFMLSAV
|
22
|
+
"""
|
23
|
+
Then I should be able to track removed rows
|
24
|
+
|
25
|
+
|
@@ -1,5 +1,5 @@
|
|
1
1
|
When /^I apply GBlocks$/ do
|
2
|
-
pending # express the regexp above with the code you wish you had
|
2
|
+
# pending # express the regexp above with the code you wish you had
|
3
3
|
end
|
4
4
|
|
5
5
|
Then /^it should return the GBlocks cleaned alignment$/ do
|
@@ -7,7 +7,7 @@ Then /^it should return the GBlocks cleaned alignment$/ do
|
|
7
7
|
end
|
8
8
|
|
9
9
|
Then /^return a list of removed columns$/ do
|
10
|
-
pending # express the regexp above with the code you wish you had
|
10
|
+
# pending # express the regexp above with the code you wish you had
|
11
11
|
end
|
12
12
|
|
13
13
|
|
@@ -1,12 +1,25 @@
|
|
1
1
|
require 'bio-alignment'
|
2
|
+
require 'bio-alignment/edit/mask_islands'
|
2
3
|
|
3
|
-
|
4
|
-
|
4
|
+
|
5
|
+
Given /^I have an alignment with islands$/ do |string|
|
6
|
+
@aln = Alignment.new(string.split(/\n/))
|
5
7
|
end
|
6
8
|
|
7
|
-
|
8
|
-
|
9
|
+
When /^I apply island rule with max_gap_size (\d+)$/ do |arg1|
|
10
|
+
@aln.extend MaskIslands
|
11
|
+
@marked_aln = @aln.mark_islands
|
9
12
|
end
|
10
13
|
|
14
|
+
Then /^it should have masked islands$/ do |string|
|
15
|
+
check_aln = Alignment.new(string.split(/\n/))
|
16
|
+
new_aln = @marked_aln.update_each_element { |e| (e.state.masked? ? Element.new("X"):e)}
|
17
|
+
new_aln.to_s.should == check_aln.to_s
|
18
|
+
end
|
11
19
|
|
20
|
+
Then /^it should also be able to delete islands$/ do |string|
|
21
|
+
check_aln = Alignment.new(string.split(/\n/))
|
22
|
+
new_aln = @marked_aln.update_each_element { |e| (e.state.masked? ? Element.new("-"):e)}
|
23
|
+
new_aln.to_s.should == check_aln.to_s
|
24
|
+
end
|
12
25
|
|
@@ -1,42 +1,53 @@
|
|
1
|
+
@dev
|
1
2
|
Feature: Alignment editing with the Island rule
|
2
|
-
The idea is to drop hypervariable floating sequences,
|
3
|
-
misaligned.
|
3
|
+
The idea is to drop hypervariable 'floating' sequences, or islands, as
|
4
|
+
they are possibly/probably misaligned.
|
4
5
|
|
5
|
-
Drop all 'islands' in a sequence with low
|
6
|
-
|
7
|
-
'
|
8
|
-
|
9
|
-
|
10
|
-
inside
|
11
|
-
those small gaps.
|
6
|
+
Drop all 'islands' in a sequence with low consensus, that show a gap larger
|
7
|
+
than 'min_gap_size' (default 3) on both sides, and are shorter than
|
8
|
+
'max_island_size' (default 30). An island larger than 30 elements is arguably
|
9
|
+
no longer an island, and low consensus stretches may be loops - it is up to
|
10
|
+
the alignment procedure to get that right. We also allow for micro deletions
|
11
|
+
inside an alignment (1 or 2 elements).
|
12
12
|
|
13
|
-
The island consensus is calculated by column.
|
14
|
-
|
15
|
-
|
16
|
-
element's column.
|
13
|
+
The island consensus is calculated by column. If more than 50% of the island
|
14
|
+
shows consensus, the island is retained. Consensus for each element is
|
15
|
+
defined here as the number of matches in the column (default 1).
|
17
16
|
|
18
17
|
Scenario: Apply island rule to an amino acid alignment
|
19
|
-
Given I have an alignment
|
18
|
+
Given I have an alignment with islands
|
20
19
|
"""
|
21
20
|
----SNSFSRPTIIFSGCSTACSGK--SELVCGFRSFMLSDV
|
22
21
|
SSIISNSFSRPTIIFSGCSTACSGK--SEQVCGFR---LSDV
|
23
22
|
SSIISNSFSRPTIIFSGCSTACSGKLTSEQVCGFR---LSDV
|
24
23
|
----PKLFSRPTIIFSGCSTACSGK--SEPVCGFRSFMLSDV
|
25
24
|
----------PTIIFSGCSKACSGKGLSELVCGFRSFMLSDV
|
26
|
-
----------PTIIFSGCSKACSGK-----
|
25
|
+
----------PTIIFSGCSKACSGK-----FRSFRSFRSFRS
|
27
26
|
----------PTIIFSGCSKACSGK-----VCGIFHAVRSFM
|
28
27
|
----------PTIIFSGCSKACSGK--SELVCGFRSFMLSAV
|
29
28
|
-------------IFHAVR-TC-HP-----------------
|
30
29
|
"""
|
31
30
|
When I apply island rule with max_gap_size 4
|
32
|
-
Then it should have
|
31
|
+
Then it should have masked islands
|
33
32
|
"""
|
34
33
|
----SNSFSRPTIIFSGCSTACSGK--SELVCGFRSFMLSDV
|
35
34
|
SSIISNSFSRPTIIFSGCSTACSGK--SEQVCGFR---LSDV
|
36
35
|
SSIISNSFSRPTIIFSGCSTACSGKLTSEQVCGFR---LSDV
|
37
36
|
----PKLFSRPTIIFSGCSTACSGK--SEPVCGFRSFMLSDV
|
38
37
|
----------PTIIFSGCSKACSGKGLSELVCGFRSFMLSDV
|
39
|
-
----------PTIIFSGCSKACSGK-----
|
38
|
+
----------PTIIFSGCSKACSGK-----XXXXXXXXXXXX
|
39
|
+
----------PTIIFSGCSKACSGK-----XXXXXXXXXXXX
|
40
|
+
----------PTIIFSGCSKACSGK--SELVCGFRSFMLSAV
|
41
|
+
-------------XXXXXX-XX-XX-----------------
|
42
|
+
"""
|
43
|
+
Then it should also be able to delete islands
|
44
|
+
"""
|
45
|
+
----SNSFSRPTIIFSGCSTACSGK--SELVCGFRSFMLSDV
|
46
|
+
SSIISNSFSRPTIIFSGCSTACSGK--SEQVCGFR---LSDV
|
47
|
+
SSIISNSFSRPTIIFSGCSTACSGKLTSEQVCGFR---LSDV
|
48
|
+
----PKLFSRPTIIFSGCSTACSGK--SEPVCGFRSFMLSDV
|
49
|
+
----------PTIIFSGCSKACSGKGLSELVCGFRSFMLSDV
|
50
|
+
----------PTIIFSGCSKACSGK-----------------
|
40
51
|
----------PTIIFSGCSKACSGK-----------------
|
41
52
|
----------PTIIFSGCSKACSGK--SELVCGFRSFMLSAV
|
42
53
|
------------------------------------------
|
@@ -1,16 +1,18 @@
|
|
1
1
|
require 'bio-alignment'
|
2
|
+
require 'bio-alignment/edit/mask_serial_mutations'
|
2
3
|
|
3
4
|
Given /^I have an alignment$/ do |string|
|
4
5
|
@aln = Alignment.new(string.split(/\n/))
|
5
|
-
p @aln
|
6
6
|
end
|
7
7
|
|
8
|
-
When /^I apply rule masking with X
|
9
|
-
|
8
|
+
When /^I apply rule masking with X$/ do
|
9
|
+
@aln.extend MaskSerialMutations
|
10
|
+
@marked_aln = @aln.mark_serial_mutations
|
10
11
|
end
|
11
12
|
|
12
|
-
Then /^
|
13
|
-
|
13
|
+
Then /^mask serial mutations should result in$/ do |string|
|
14
|
+
check_aln = Alignment.new(string.split(/\n/))
|
15
|
+
new_aln = @marked_aln.update_each_element { |e| (e.state.masked ? Element.new("X"):e)}
|
16
|
+
new_aln.to_s.should == check_aln.to_s
|
14
17
|
end
|
15
18
|
|
16
|
-
|
@@ -2,34 +2,34 @@ Feature: Alignment editing masking serial mutations
|
|
2
2
|
Edit an alignment removing or masking unique elements column-wise.
|
3
3
|
|
4
4
|
If a sequence has a unique AA in a column it is a single mutation event. If
|
5
|
-
multiple neighbouring AA's are also unique we suspect the sequence
|
6
|
-
outlier. This rule masks, or deletes, stretches of unique
|
7
|
-
unique AA's is defined in 'max_serial_unique' (default 5,
|
8
|
-
unique AA's are allowed).
|
5
|
+
multiple neighbouring AA's are also unique we suspect the (partial) sequence
|
6
|
+
may be an outlier. This rule masks, or deletes, stretches of totally unique
|
7
|
+
AAs. The stretch of unique AA's is defined in 'max_serial_unique' (default 5,
|
8
|
+
so two bordering unique AA's are allowed). Gaps within a series are allowed.
|
9
9
|
|
10
10
|
Scenario: Apply rule to an amino acid alignment
|
11
11
|
Given I have an alignment
|
12
12
|
"""
|
13
13
|
----SNSFSRPTIIFSGCSTACSGK--SELVCGFRSFMLSDV
|
14
14
|
SSIISNSFSRPTIIFSGCSTACSGK--SEQVCGFR---LSDV
|
15
|
-
|
15
|
+
SSIISNSFSRPTIIFSGCSTACSQQKKTSEQVCFR---LSDV
|
16
16
|
----PKLFSRPTIIFSGCSTACSGK--SEPVCGFRSFMLSDV
|
17
17
|
----------PTIIFSGCSKACSGKGLSELVCGFRSFMLSDV
|
18
18
|
----------PTIIFSGCSKACSGK-----FRSFRSFMLSAV
|
19
19
|
----------PTIIFSGCSKACSGK-----VCGIFHAVRSFM
|
20
20
|
----------PTIIFSGCSKACSGK--SELVCGFRSFMLSAV
|
21
|
-
-------------
|
21
|
+
-------------TTTTTT-TT-HP-----------------
|
22
22
|
"""
|
23
|
-
When I apply rule masking with X
|
24
|
-
Then
|
23
|
+
When I apply rule masking with X
|
24
|
+
Then mask serial mutations should result in
|
25
25
|
"""
|
26
26
|
----SNSFSRPTIIFSGCSTACSGK--SELVCGFRSFMLSDV
|
27
27
|
SSIISNSFSRPTIIFSGCSTACSGK--SEQVCGFR---LSDV
|
28
|
-
|
28
|
+
SSIISNSFSRPTIIFSGCSTACSXXXXXXXXXXFR---LSDV
|
29
29
|
----PKLFSRPTIIFSGCSTACSGK--SEPVCGFRSFMLSDV
|
30
30
|
----------PTIIFSGCSKACSGKGLSELVCGFRSFMLSDV
|
31
|
-
----------PTIIFSGCSKACSGK-----
|
32
|
-
----------PTIIFSGCSKACSGK
|
31
|
+
----------PTIIFSGCSKACSGK-----FRSFRSFMLSAV
|
32
|
+
----------PTIIFSGCSKACSGK-----VCGXXXXXXSXX
|
33
33
|
----------PTIIFSGCSKACSGK--SELVCGFRSFMLSAV
|
34
34
|
-------------XXXXXX-XX-XX-----------------
|
35
35
|
"""
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'bio' # for the Newick tree parser
|
2
|
+
require 'bio-alignment'
|
3
|
+
|
4
|
+
Given /^I have a multiple sequence alignment \(MSA\)$/ do |string|
|
5
|
+
list = string.split(/\n/)
|
6
|
+
seqs = list.map { | line | line.split(' ')[1] }
|
7
|
+
ids = list.map { | line | line.split(' ')[0] }
|
8
|
+
@aln = Alignment.new(seqs, ids)
|
9
|
+
print @aln
|
10
|
+
end
|
11
|
+
|
12
|
+
Given /^I have a phylogenetic tree in Newick format$/ do |string|
|
13
|
+
@tree = Bio::Newick.new(string).tree
|
14
|
+
tree = @tree
|
15
|
+
tree.children(tree.root).size.should == 2
|
16
|
+
tree.descendents(tree.root).size.should == 14
|
17
|
+
tree.leaves.size.should == 8
|
18
|
+
leaf = tree.get_node_by_name('seq8')
|
19
|
+
leaf.name.should == "seq8"
|
20
|
+
tree.ancestors(leaf).size.should == 5
|
21
|
+
tree.get_edge(leaf, tree.parent(leaf)).distance.should == 1.1904755
|
22
|
+
tree.get_edge(tree.parent(leaf), tree.parent(tree.parent(leaf))).distance.should == 1.7857151
|
23
|
+
end
|
24
|
+
|
25
|
+
Then /^I should be able to traverse the tree$/ do
|
26
|
+
tree = @aln.attach_tree(@tree)
|
27
|
+
root = @aln.root # get the root of the tree
|
28
|
+
root.leaf?.should == false
|
29
|
+
children = root.children
|
30
|
+
children.map { |n| n.name }.sort.should == ["","seq7"]
|
31
|
+
seq7 = children.last
|
32
|
+
seq7.name.should == 'seq7'
|
33
|
+
seq7.leaf?.should == true
|
34
|
+
seq7.parent.should == root
|
35
|
+
seq4 = tree.find("seq4")
|
36
|
+
seq4.leaf?.should == true
|
37
|
+
seq4.distance(seq7).should == 19.387756600000003 # that is nice!
|
38
|
+
end
|
39
|
+
|
40
|
+
Then /^fetch elements from the MSA from each end node in the tree$/ do
|
41
|
+
# walk the tree
|
42
|
+
tree = @aln.attach_tree(@tree)
|
43
|
+
ids = []
|
44
|
+
column20 = tree.map { | leaf |
|
45
|
+
ids << leaf.name
|
46
|
+
seq = @aln.find(leaf.name)
|
47
|
+
# p seq
|
48
|
+
seq[19]
|
49
|
+
}
|
50
|
+
ids.should == ["seq6", "seq4", "seq8", "seq5", "seq3", "seq2", "seq1", "seq7"]
|
51
|
+
column20.should == ["K", "T", "K", "K", "T", "T", "T", "K"]
|
52
|
+
end
|
53
|
+
|
54
|
+
Then /^calculate the phylogenetic distance between each element$/ do
|
55
|
+
pending # express the regexp above with the code you wish you had
|
56
|
+
end
|
57
|
+
|
58
|
+
Then /^draw the MSA with the tree$/ do | string |
|
59
|
+
# textual drawing, like tabtree, or http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/149701
|
60
|
+
print string
|
61
|
+
pending # express the regexp above with the code you wish you had
|
62
|
+
end
|
63
|
+
|
64
|
+
Then /^draw MSA with the short tree$/ do |string|
|
65
|
+
pending # express the regexp above with the code you wish you had
|
66
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
@tree
|
2
|
+
Feature: Tree support for alignments
|
3
|
+
Alignments are often accompanied by phylogenetic trees.
|
4
|
+
|
5
|
+
Scenario: Get ordered elements from a tree
|
6
|
+
Given I have a multiple sequence alignment (MSA)
|
7
|
+
"""
|
8
|
+
seq1 ----SNSFSRPTIIFSGCSTACSGK--SELVCGFRSFMLSDV
|
9
|
+
seq2 SSIISNSFSRPTIIFSGCSTACSGK--SEQVCGFR---LSDV
|
10
|
+
seq3 SSIISNSFSRPTIIFSGCSTACSGKLTSEQVCGFR---LSDV
|
11
|
+
seq4 ----PKLFSRPTIIFSGCSTACSGK--SEPVCGFRSFMLSDV
|
12
|
+
seq5 ----------PTIIFSGCSKACSGKGLSELVCGFRSFMLSDV
|
13
|
+
seq6 ----------PTIIFSGCSKACSGK-----FRSFRSFMLSAV
|
14
|
+
seq7 ----------PTIIFSGCSKACSGK-----VCGIFHAVRSFM
|
15
|
+
seq8 ----------PTIIFSGCSKACSGK--SELVCGFRSFMLSAV
|
16
|
+
"""
|
17
|
+
And I have a phylogenetic tree in Newick format
|
18
|
+
"""
|
19
|
+
((seq6:5.3571434,(seq4:4.04762,((seq8:1.1904755,seq5:1.1904755):1.7857151,((seq3:0.0,seq2:0.0):1.1904755,seq1:1.1904755):1.7857151):1.0714293):1.3095236):4.336735,seq7:9.693878);
|
20
|
+
"""
|
21
|
+
Then I should be able to traverse the tree
|
22
|
+
And fetch elements from the MSA from each end node in the tree
|
23
|
+
And calculate the phylogenetic distance between each element
|
24
|
+
And draw the MSA with the tree
|
25
|
+
"""
|
26
|
+
,--9.69----------------------------------------- seq7 ----------PTIIFSGCSKACSGK-----VCGIFHAVRSFM
|
27
|
+
| ,--1.19----- seq1 ----SNSFSRPTIIFSGCSTACSGK--SELVCGFRSFMLSDV
|
28
|
+
| ,--1.79--| ,-- seq2 SSIISNSFSRPTIIFSGCSTACSGK--SEQVCGFR---LSDV
|
29
|
+
| ,--1.07--| `--1.19--+-- seq3 SSIISNSFSRPTIIFSGCSTACSGKLTSEQVCGFR---LSDV
|
30
|
+
| | `--1.79--+--1.19----- seq5 ----------PTIIFSGCSKACSGKGLSELVCGFRSFMLSDV
|
31
|
+
| ,--1.31--| `--1.19----- seq8 ----------PTIIFSGCSKACSGK--SELVCGFRSFMLSAV
|
32
|
+
`--4.34--| `--4.05----------------------- seq4 ----PKLFSRPTIIFSGCSTACSGK--SEPVCGFRSFMLSDV
|
33
|
+
`--5.36-------------------------------- seq6 ----------PTIIFSGCSKACSGK-----FRSFRSFMLSAV
|
34
|
+
"""
|
35
|
+
Then draw MSA with the short tree
|
36
|
+
"""
|
37
|
+
,----------------- seq7 ----------PTIIFSGCSKACSGK-----VCGIFHAVRSFM
|
38
|
+
| ,----- seq1 ----SNSFSRPTIIFSGCSTACSGK--SELVCGFRSFMLSDV
|
39
|
+
| ,--| ,-- seq2 SSIISNSFSRPTIIFSGCSTACSGK--SEQVCGFR---LSDV
|
40
|
+
| ,--| `--+-- seq3 SSIISNSFSRPTIIFSGCSTACSGKLTSEQVCGFR---LSDV
|
41
|
+
| | `--+----- seq5 ----------PTIIFSGCSKACSGKGLSELVCGFRSFMLSDV
|
42
|
+
| ,--| `----- seq8 ----------PTIIFSGCSKACSGK--SELVCGFRSFMLSAV
|
43
|
+
`--| `----------- seq4 ----PKLFSRPTIIFSGCSTACSGK--SEPVCGFRSFMLSDV
|
44
|
+
`-------------- seq6 ----------PTIIFSGCSKACSGK-----FRSFRSFMLSAV
|
45
|
+
"""
|
data/lib/bio-alignment.rb
CHANGED
@@ -2,6 +2,9 @@
|
|
2
2
|
# bioruby directory tree.
|
3
3
|
#
|
4
4
|
|
5
|
-
require 'bio-alignment/
|
5
|
+
require 'bio-alignment/state'
|
6
|
+
require 'bio-alignment/elements'
|
6
7
|
require 'bio-alignment/sequence'
|
7
8
|
require 'bio-alignment/codonsequence'
|
9
|
+
require 'bio-alignment/tree'
|
10
|
+
require 'bio-alignment/alignment'
|
@@ -1,7 +1,8 @@
|
|
1
1
|
# Alignment
|
2
2
|
|
3
3
|
require 'bio-alignment/pal2nal'
|
4
|
-
require 'bio-alignment/
|
4
|
+
require 'bio-alignment/columns'
|
5
|
+
require 'bio-alignment/rows'
|
5
6
|
|
6
7
|
module Bio
|
7
8
|
|
@@ -10,6 +11,7 @@ module Bio
|
|
10
11
|
class Alignment
|
11
12
|
include Enumerable
|
12
13
|
include Pal2Nal
|
14
|
+
include Rows
|
13
15
|
include Columns
|
14
16
|
|
15
17
|
attr_accessor :sequences
|
@@ -17,16 +19,22 @@ module Bio
|
|
17
19
|
# Create alignment. seqs can be a list of sequences. If these
|
18
20
|
# are String types, they get converted to the library Sequence
|
19
21
|
# container
|
20
|
-
def initialize seqs = nil
|
22
|
+
def initialize seqs = nil, ids = nil
|
21
23
|
@sequences = []
|
22
24
|
if seqs
|
25
|
+
num = 0
|
23
26
|
seqs.each_with_index do | seq, i |
|
27
|
+
next if seq == nil or seq.to_s.strip == ""
|
28
|
+
id = num
|
29
|
+
id = ids[i] if ids and ids[i]
|
24
30
|
@sequences <<
|
25
31
|
if seq.kind_of?(String)
|
26
|
-
Sequence.new(
|
32
|
+
seq1 = Sequence.new(id,seq.strip)
|
33
|
+
seq1
|
27
34
|
else
|
28
35
|
seq
|
29
36
|
end
|
37
|
+
num += 1
|
30
38
|
end
|
31
39
|
end
|
32
40
|
end
|
@@ -39,8 +47,55 @@ module Bio
|
|
39
47
|
|
40
48
|
def each
|
41
49
|
rows.each { | seq | yield seq }
|
50
|
+
self
|
42
51
|
end
|
43
52
|
|
53
|
+
def each_element
|
54
|
+
each { |seq| seq.each { |e| yield e }}
|
55
|
+
self
|
56
|
+
end
|
57
|
+
|
58
|
+
def find name
|
59
|
+
each do | seq |
|
60
|
+
return seq if seq.id == name
|
61
|
+
end
|
62
|
+
raise "ERROR: Sequence not found by its name #{name}"
|
63
|
+
end
|
64
|
+
|
65
|
+
# copy alignment and allow updating elements
|
66
|
+
def update_each_element
|
67
|
+
aln = self.clone
|
68
|
+
aln.each { |seq| seq.each_with_index { |e,i| seq.seq[i] = yield e }}
|
69
|
+
end
|
70
|
+
|
71
|
+
def to_s
|
72
|
+
res = ""
|
73
|
+
res += "\t" + columns_to_s + "\n" if @columns
|
74
|
+
res += map{ |seq| seq.id.to_s + "\t" + seq.to_s }.join("\n")
|
75
|
+
res
|
76
|
+
end
|
77
|
+
|
78
|
+
# Return a deep cloned alignment. This method clones sequences,
|
79
|
+
# and the state objects
|
80
|
+
def clone
|
81
|
+
aln = super
|
82
|
+
# clone the sequences
|
83
|
+
aln.sequences = []
|
84
|
+
each do | seq |
|
85
|
+
aln.sequences << seq.clone
|
86
|
+
end
|
87
|
+
aln.clone_columns! if @columns
|
88
|
+
aln
|
89
|
+
end
|
90
|
+
|
91
|
+
# extend BioAlignment with Tree functionality - this method adds
|
92
|
+
# a tree and pulls in the functionality of the Tree module. Returns
|
93
|
+
# the tree traverser
|
94
|
+
def attach_tree tree
|
95
|
+
extend Tree
|
96
|
+
@tree = Tree::init(tree)
|
97
|
+
@tree
|
98
|
+
end
|
44
99
|
end
|
45
100
|
end
|
46
101
|
end
|
@@ -6,6 +6,9 @@ module Bio
|
|
6
6
|
|
7
7
|
# Codon element for the matrix, used by CodonSequence.
|
8
8
|
class Codon
|
9
|
+
GAP = '---'
|
10
|
+
UNDEFINED = 'X'
|
11
|
+
|
9
12
|
attr_reader :codon_table
|
10
13
|
|
11
14
|
def initialize codon, codon_table = 1
|
@@ -14,7 +17,7 @@ module Bio
|
|
14
17
|
end
|
15
18
|
|
16
19
|
def gap?
|
17
|
-
@codon ==
|
20
|
+
@codon == GAP
|
18
21
|
end
|
19
22
|
|
20
23
|
def undefined?
|
@@ -36,7 +39,7 @@ module Bio
|
|
36
39
|
if gap?
|
37
40
|
return '-'
|
38
41
|
elsif undefined?
|
39
|
-
return
|
42
|
+
return UNDEFINED
|
40
43
|
else
|
41
44
|
raise 'What?'
|
42
45
|
end
|
@@ -46,6 +49,7 @@ module Bio
|
|
46
49
|
|
47
50
|
private
|
48
51
|
|
52
|
+
# lazy translation of codon to amino acid
|
49
53
|
def translate
|
50
54
|
@aa ||= Bio::CodonTable[@codon_table][@codon]
|
51
55
|
@aa
|
@@ -95,6 +99,14 @@ module Bio
|
|
95
99
|
def to_aa
|
96
100
|
@seq.map { |codon| codon.to_aa }.join('')
|
97
101
|
end
|
102
|
+
|
103
|
+
def empty_copy
|
104
|
+
CodonSequence.new(@id,"")
|
105
|
+
end
|
106
|
+
|
107
|
+
def << codon
|
108
|
+
@seq << codon
|
109
|
+
end
|
98
110
|
|
99
111
|
end
|
100
112
|
|