bio-alignment 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +5 -4
- data/README.md +94 -9
- data/Rakefile +2 -1
- data/VERSION +1 -1
- data/doc/bio-alignment-design.md +75 -11
- data/features/bioruby-feature.rb +17 -0
- data/features/bioruby.feature +6 -1
- data/features/columns-feature.rb +2 -0
- data/features/edit/del_bridges-feature.rb +7 -3
- data/features/edit/del_bridges.feature +1 -2
- data/features/edit/del_non_informative_sequences-feature.rb +26 -0
- data/features/edit/del_non_informative_sequences.feature +19 -0
- data/features/edit/del_short_sequences-feature.rb +21 -0
- data/features/edit/del_short_sequences.feature +25 -0
- data/features/edit/gblocks-feature.rb +2 -2
- data/features/edit/mask_islands-feature.rb +17 -4
- data/features/edit/mask_islands.feature +28 -17
- data/features/edit/mask_serial_mutations-feature.rb +8 -6
- data/features/edit/mask_serial_mutations.feature +11 -11
- data/features/tree-feature.rb +66 -0
- data/features/tree.feature +45 -0
- data/lib/bio-alignment.rb +4 -1
- data/lib/bio-alignment/alignment.rb +58 -3
- data/lib/bio-alignment/codonsequence.rb +14 -2
- data/lib/bio-alignment/columns.rb +102 -0
- data/lib/bio-alignment/edit/del_bridges.rb +18 -1
- data/lib/bio-alignment/edit/del_non_informative_sequences.rb +27 -0
- data/lib/bio-alignment/edit/del_short_sequences.rb +28 -0
- data/lib/bio-alignment/edit/edit_columns.rb +22 -0
- data/lib/bio-alignment/edit/edit_rows.rb +49 -0
- data/lib/bio-alignment/edit/mask_islands.rb +115 -0
- data/lib/bio-alignment/edit/mask_serial_mutations.rb +44 -0
- data/lib/bio-alignment/elements.rb +86 -0
- data/lib/bio-alignment/rows.rb +52 -0
- data/lib/bio-alignment/sequence.rb +20 -14
- data/lib/bio-alignment/state.rb +64 -8
- data/lib/bio-alignment/tree.rb +77 -0
- data/spec/bio-alignment_spec.rb +57 -1
- data/spec/spec_helper.rb +3 -3
- metadata +47 -22
- data/lib/bio-alignment/column.rb +0 -47
@@ -0,0 +1,25 @@
|
|
1
|
+
Feature: Alignment editing; remove short sequences
|
2
|
+
Remove rows that are too short (short sequences)
|
3
|
+
|
4
|
+
The dropped sequences are tracked by the row state objects
|
5
|
+
|
6
|
+
Scenario: Apply short sequence rule to an amino acid alignment
|
7
|
+
Given I have a bridged alignment
|
8
|
+
"""
|
9
|
+
SSIISNSFSRPTIIFSGCSTACSGK--SEQVCGFR---LSDV
|
10
|
+
SSIISNSFSRPTIIFSGCSTACSGKLTSEQVCGFR---LSDV
|
11
|
+
----------PTIIFSGCSKACSGK-----VCGIFHAVRSFM
|
12
|
+
----------PTIIFSGCSKACSGK--SELVCGFRSFMLSAV
|
13
|
+
-------------IFHAVR-TC-HP-----------------
|
14
|
+
"""
|
15
|
+
When I apply the short sequence rule
|
16
|
+
Then it should have removed one row
|
17
|
+
"""
|
18
|
+
SSIISNSFSRPTIIFSGCSTACSGK--SEQVCGFR---LSDV
|
19
|
+
SSIISNSFSRPTIIFSGCSTACSGKLTSEQVCGFR---LSDV
|
20
|
+
----------PTIIFSGCSKACSGK-----VCGIFHAVRSFM
|
21
|
+
----------PTIIFSGCSKACSGK--SELVCGFRSFMLSAV
|
22
|
+
"""
|
23
|
+
Then I should be able to track removed rows
|
24
|
+
|
25
|
+
|
@@ -1,5 +1,5 @@
|
|
1
1
|
When /^I apply GBlocks$/ do
|
2
|
-
pending # express the regexp above with the code you wish you had
|
2
|
+
# pending # express the regexp above with the code you wish you had
|
3
3
|
end
|
4
4
|
|
5
5
|
Then /^it should return the GBlocks cleaned alignment$/ do
|
@@ -7,7 +7,7 @@ Then /^it should return the GBlocks cleaned alignment$/ do
|
|
7
7
|
end
|
8
8
|
|
9
9
|
Then /^return a list of removed columns$/ do
|
10
|
-
pending # express the regexp above with the code you wish you had
|
10
|
+
# pending # express the regexp above with the code you wish you had
|
11
11
|
end
|
12
12
|
|
13
13
|
|
@@ -1,12 +1,25 @@
|
|
1
1
|
require 'bio-alignment'
|
2
|
+
require 'bio-alignment/edit/mask_islands'
|
2
3
|
|
3
|
-
|
4
|
-
|
4
|
+
|
5
|
+
Given /^I have an alignment with islands$/ do |string|
|
6
|
+
@aln = Alignment.new(string.split(/\n/))
|
5
7
|
end
|
6
8
|
|
7
|
-
|
8
|
-
|
9
|
+
When /^I apply island rule with max_gap_size (\d+)$/ do |arg1|
|
10
|
+
@aln.extend MaskIslands
|
11
|
+
@marked_aln = @aln.mark_islands
|
9
12
|
end
|
10
13
|
|
14
|
+
Then /^it should have masked islands$/ do |string|
|
15
|
+
check_aln = Alignment.new(string.split(/\n/))
|
16
|
+
new_aln = @marked_aln.update_each_element { |e| (e.state.masked? ? Element.new("X"):e)}
|
17
|
+
new_aln.to_s.should == check_aln.to_s
|
18
|
+
end
|
11
19
|
|
20
|
+
Then /^it should also be able to delete islands$/ do |string|
|
21
|
+
check_aln = Alignment.new(string.split(/\n/))
|
22
|
+
new_aln = @marked_aln.update_each_element { |e| (e.state.masked? ? Element.new("-"):e)}
|
23
|
+
new_aln.to_s.should == check_aln.to_s
|
24
|
+
end
|
12
25
|
|
@@ -1,42 +1,53 @@
|
|
1
|
+
@dev
|
1
2
|
Feature: Alignment editing with the Island rule
|
2
|
-
The idea is to drop hypervariable floating sequences,
|
3
|
-
misaligned.
|
3
|
+
The idea is to drop hypervariable 'floating' sequences, or islands, as
|
4
|
+
they are possibly/probably misaligned.
|
4
5
|
|
5
|
-
Drop all 'islands' in a sequence with low
|
6
|
-
|
7
|
-
'
|
8
|
-
|
9
|
-
|
10
|
-
inside
|
11
|
-
those small gaps.
|
6
|
+
Drop all 'islands' in a sequence with low consensus, that show a gap larger
|
7
|
+
than 'min_gap_size' (default 3) on both sides, and are shorter than
|
8
|
+
'max_island_size' (default 30). An island larger than 30 elements is arguably
|
9
|
+
no longer an island, and low consensus stretches may be loops - it is up to
|
10
|
+
the alignment procedure to get that right. We also allow for micro deletions
|
11
|
+
inside an alignment (1 or 2 elements).
|
12
12
|
|
13
|
-
The island consensus is calculated by column.
|
14
|
-
|
15
|
-
|
16
|
-
element's column.
|
13
|
+
The island consensus is calculated by column. If more than 50% of the island
|
14
|
+
shows consensus, the island is retained. Consensus for each element is
|
15
|
+
defined here as the number of matches in the column (default 1).
|
17
16
|
|
18
17
|
Scenario: Apply island rule to an amino acid alignment
|
19
|
-
Given I have an alignment
|
18
|
+
Given I have an alignment with islands
|
20
19
|
"""
|
21
20
|
----SNSFSRPTIIFSGCSTACSGK--SELVCGFRSFMLSDV
|
22
21
|
SSIISNSFSRPTIIFSGCSTACSGK--SEQVCGFR---LSDV
|
23
22
|
SSIISNSFSRPTIIFSGCSTACSGKLTSEQVCGFR---LSDV
|
24
23
|
----PKLFSRPTIIFSGCSTACSGK--SEPVCGFRSFMLSDV
|
25
24
|
----------PTIIFSGCSKACSGKGLSELVCGFRSFMLSDV
|
26
|
-
----------PTIIFSGCSKACSGK-----
|
25
|
+
----------PTIIFSGCSKACSGK-----FRSFRSFRSFRS
|
27
26
|
----------PTIIFSGCSKACSGK-----VCGIFHAVRSFM
|
28
27
|
----------PTIIFSGCSKACSGK--SELVCGFRSFMLSAV
|
29
28
|
-------------IFHAVR-TC-HP-----------------
|
30
29
|
"""
|
31
30
|
When I apply island rule with max_gap_size 4
|
32
|
-
Then it should have
|
31
|
+
Then it should have masked islands
|
33
32
|
"""
|
34
33
|
----SNSFSRPTIIFSGCSTACSGK--SELVCGFRSFMLSDV
|
35
34
|
SSIISNSFSRPTIIFSGCSTACSGK--SEQVCGFR---LSDV
|
36
35
|
SSIISNSFSRPTIIFSGCSTACSGKLTSEQVCGFR---LSDV
|
37
36
|
----PKLFSRPTIIFSGCSTACSGK--SEPVCGFRSFMLSDV
|
38
37
|
----------PTIIFSGCSKACSGKGLSELVCGFRSFMLSDV
|
39
|
-
----------PTIIFSGCSKACSGK-----
|
38
|
+
----------PTIIFSGCSKACSGK-----XXXXXXXXXXXX
|
39
|
+
----------PTIIFSGCSKACSGK-----XXXXXXXXXXXX
|
40
|
+
----------PTIIFSGCSKACSGK--SELVCGFRSFMLSAV
|
41
|
+
-------------XXXXXX-XX-XX-----------------
|
42
|
+
"""
|
43
|
+
Then it should also be able to delete islands
|
44
|
+
"""
|
45
|
+
----SNSFSRPTIIFSGCSTACSGK--SELVCGFRSFMLSDV
|
46
|
+
SSIISNSFSRPTIIFSGCSTACSGK--SEQVCGFR---LSDV
|
47
|
+
SSIISNSFSRPTIIFSGCSTACSGKLTSEQVCGFR---LSDV
|
48
|
+
----PKLFSRPTIIFSGCSTACSGK--SEPVCGFRSFMLSDV
|
49
|
+
----------PTIIFSGCSKACSGKGLSELVCGFRSFMLSDV
|
50
|
+
----------PTIIFSGCSKACSGK-----------------
|
40
51
|
----------PTIIFSGCSKACSGK-----------------
|
41
52
|
----------PTIIFSGCSKACSGK--SELVCGFRSFMLSAV
|
42
53
|
------------------------------------------
|
@@ -1,16 +1,18 @@
|
|
1
1
|
require 'bio-alignment'
|
2
|
+
require 'bio-alignment/edit/mask_serial_mutations'
|
2
3
|
|
3
4
|
Given /^I have an alignment$/ do |string|
|
4
5
|
@aln = Alignment.new(string.split(/\n/))
|
5
|
-
p @aln
|
6
6
|
end
|
7
7
|
|
8
|
-
When /^I apply rule masking with X
|
9
|
-
|
8
|
+
When /^I apply rule masking with X$/ do
|
9
|
+
@aln.extend MaskSerialMutations
|
10
|
+
@marked_aln = @aln.mark_serial_mutations
|
10
11
|
end
|
11
12
|
|
12
|
-
Then /^
|
13
|
-
|
13
|
+
Then /^mask serial mutations should result in$/ do |string|
|
14
|
+
check_aln = Alignment.new(string.split(/\n/))
|
15
|
+
new_aln = @marked_aln.update_each_element { |e| (e.state.masked ? Element.new("X"):e)}
|
16
|
+
new_aln.to_s.should == check_aln.to_s
|
14
17
|
end
|
15
18
|
|
16
|
-
|
@@ -2,34 +2,34 @@ Feature: Alignment editing masking serial mutations
|
|
2
2
|
Edit an alignment removing or masking unique elements column-wise.
|
3
3
|
|
4
4
|
If a sequence has a unique AA in a column it is a single mutation event. If
|
5
|
-
multiple neighbouring AA's are also unique we suspect the sequence
|
6
|
-
outlier. This rule masks, or deletes, stretches of unique
|
7
|
-
unique AA's is defined in 'max_serial_unique' (default 5,
|
8
|
-
unique AA's are allowed).
|
5
|
+
multiple neighbouring AA's are also unique we suspect the (partial) sequence
|
6
|
+
may be an outlier. This rule masks, or deletes, stretches of totally unique
|
7
|
+
AAs. The stretch of unique AA's is defined in 'max_serial_unique' (default 5,
|
8
|
+
so two bordering unique AA's are allowed). Gaps within a series are allowed.
|
9
9
|
|
10
10
|
Scenario: Apply rule to an amino acid alignment
|
11
11
|
Given I have an alignment
|
12
12
|
"""
|
13
13
|
----SNSFSRPTIIFSGCSTACSGK--SELVCGFRSFMLSDV
|
14
14
|
SSIISNSFSRPTIIFSGCSTACSGK--SEQVCGFR---LSDV
|
15
|
-
|
15
|
+
SSIISNSFSRPTIIFSGCSTACSQQKKTSEQVCFR---LSDV
|
16
16
|
----PKLFSRPTIIFSGCSTACSGK--SEPVCGFRSFMLSDV
|
17
17
|
----------PTIIFSGCSKACSGKGLSELVCGFRSFMLSDV
|
18
18
|
----------PTIIFSGCSKACSGK-----FRSFRSFMLSAV
|
19
19
|
----------PTIIFSGCSKACSGK-----VCGIFHAVRSFM
|
20
20
|
----------PTIIFSGCSKACSGK--SELVCGFRSFMLSAV
|
21
|
-
-------------
|
21
|
+
-------------TTTTTT-TT-HP-----------------
|
22
22
|
"""
|
23
|
-
When I apply rule masking with X
|
24
|
-
Then
|
23
|
+
When I apply rule masking with X
|
24
|
+
Then mask serial mutations should result in
|
25
25
|
"""
|
26
26
|
----SNSFSRPTIIFSGCSTACSGK--SELVCGFRSFMLSDV
|
27
27
|
SSIISNSFSRPTIIFSGCSTACSGK--SEQVCGFR---LSDV
|
28
|
-
|
28
|
+
SSIISNSFSRPTIIFSGCSTACSXXXXXXXXXXFR---LSDV
|
29
29
|
----PKLFSRPTIIFSGCSTACSGK--SEPVCGFRSFMLSDV
|
30
30
|
----------PTIIFSGCSKACSGKGLSELVCGFRSFMLSDV
|
31
|
-
----------PTIIFSGCSKACSGK-----
|
32
|
-
----------PTIIFSGCSKACSGK
|
31
|
+
----------PTIIFSGCSKACSGK-----FRSFRSFMLSAV
|
32
|
+
----------PTIIFSGCSKACSGK-----VCGXXXXXXSXX
|
33
33
|
----------PTIIFSGCSKACSGK--SELVCGFRSFMLSAV
|
34
34
|
-------------XXXXXX-XX-XX-----------------
|
35
35
|
"""
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'bio' # for the Newick tree parser
|
2
|
+
require 'bio-alignment'
|
3
|
+
|
4
|
+
Given /^I have a multiple sequence alignment \(MSA\)$/ do |string|
|
5
|
+
list = string.split(/\n/)
|
6
|
+
seqs = list.map { | line | line.split(' ')[1] }
|
7
|
+
ids = list.map { | line | line.split(' ')[0] }
|
8
|
+
@aln = Alignment.new(seqs, ids)
|
9
|
+
print @aln
|
10
|
+
end
|
11
|
+
|
12
|
+
Given /^I have a phylogenetic tree in Newick format$/ do |string|
|
13
|
+
@tree = Bio::Newick.new(string).tree
|
14
|
+
tree = @tree
|
15
|
+
tree.children(tree.root).size.should == 2
|
16
|
+
tree.descendents(tree.root).size.should == 14
|
17
|
+
tree.leaves.size.should == 8
|
18
|
+
leaf = tree.get_node_by_name('seq8')
|
19
|
+
leaf.name.should == "seq8"
|
20
|
+
tree.ancestors(leaf).size.should == 5
|
21
|
+
tree.get_edge(leaf, tree.parent(leaf)).distance.should == 1.1904755
|
22
|
+
tree.get_edge(tree.parent(leaf), tree.parent(tree.parent(leaf))).distance.should == 1.7857151
|
23
|
+
end
|
24
|
+
|
25
|
+
Then /^I should be able to traverse the tree$/ do
|
26
|
+
tree = @aln.attach_tree(@tree)
|
27
|
+
root = @aln.root # get the root of the tree
|
28
|
+
root.leaf?.should == false
|
29
|
+
children = root.children
|
30
|
+
children.map { |n| n.name }.sort.should == ["","seq7"]
|
31
|
+
seq7 = children.last
|
32
|
+
seq7.name.should == 'seq7'
|
33
|
+
seq7.leaf?.should == true
|
34
|
+
seq7.parent.should == root
|
35
|
+
seq4 = tree.find("seq4")
|
36
|
+
seq4.leaf?.should == true
|
37
|
+
seq4.distance(seq7).should == 19.387756600000003 # that is nice!
|
38
|
+
end
|
39
|
+
|
40
|
+
Then /^fetch elements from the MSA from each end node in the tree$/ do
|
41
|
+
# walk the tree
|
42
|
+
tree = @aln.attach_tree(@tree)
|
43
|
+
ids = []
|
44
|
+
column20 = tree.map { | leaf |
|
45
|
+
ids << leaf.name
|
46
|
+
seq = @aln.find(leaf.name)
|
47
|
+
# p seq
|
48
|
+
seq[19]
|
49
|
+
}
|
50
|
+
ids.should == ["seq6", "seq4", "seq8", "seq5", "seq3", "seq2", "seq1", "seq7"]
|
51
|
+
column20.should == ["K", "T", "K", "K", "T", "T", "T", "K"]
|
52
|
+
end
|
53
|
+
|
54
|
+
Then /^calculate the phylogenetic distance between each element$/ do
|
55
|
+
pending # express the regexp above with the code you wish you had
|
56
|
+
end
|
57
|
+
|
58
|
+
Then /^draw the MSA with the tree$/ do | string |
|
59
|
+
# textual drawing, like tabtree, or http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/149701
|
60
|
+
print string
|
61
|
+
pending # express the regexp above with the code you wish you had
|
62
|
+
end
|
63
|
+
|
64
|
+
Then /^draw MSA with the short tree$/ do |string|
|
65
|
+
pending # express the regexp above with the code you wish you had
|
66
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
@tree
|
2
|
+
Feature: Tree support for alignments
|
3
|
+
Alignments are often accompanied by phylogenetic trees.
|
4
|
+
|
5
|
+
Scenario: Get ordered elements from a tree
|
6
|
+
Given I have a multiple sequence alignment (MSA)
|
7
|
+
"""
|
8
|
+
seq1 ----SNSFSRPTIIFSGCSTACSGK--SELVCGFRSFMLSDV
|
9
|
+
seq2 SSIISNSFSRPTIIFSGCSTACSGK--SEQVCGFR---LSDV
|
10
|
+
seq3 SSIISNSFSRPTIIFSGCSTACSGKLTSEQVCGFR---LSDV
|
11
|
+
seq4 ----PKLFSRPTIIFSGCSTACSGK--SEPVCGFRSFMLSDV
|
12
|
+
seq5 ----------PTIIFSGCSKACSGKGLSELVCGFRSFMLSDV
|
13
|
+
seq6 ----------PTIIFSGCSKACSGK-----FRSFRSFMLSAV
|
14
|
+
seq7 ----------PTIIFSGCSKACSGK-----VCGIFHAVRSFM
|
15
|
+
seq8 ----------PTIIFSGCSKACSGK--SELVCGFRSFMLSAV
|
16
|
+
"""
|
17
|
+
And I have a phylogenetic tree in Newick format
|
18
|
+
"""
|
19
|
+
((seq6:5.3571434,(seq4:4.04762,((seq8:1.1904755,seq5:1.1904755):1.7857151,((seq3:0.0,seq2:0.0):1.1904755,seq1:1.1904755):1.7857151):1.0714293):1.3095236):4.336735,seq7:9.693878);
|
20
|
+
"""
|
21
|
+
Then I should be able to traverse the tree
|
22
|
+
And fetch elements from the MSA from each end node in the tree
|
23
|
+
And calculate the phylogenetic distance between each element
|
24
|
+
And draw the MSA with the tree
|
25
|
+
"""
|
26
|
+
,--9.69----------------------------------------- seq7 ----------PTIIFSGCSKACSGK-----VCGIFHAVRSFM
|
27
|
+
| ,--1.19----- seq1 ----SNSFSRPTIIFSGCSTACSGK--SELVCGFRSFMLSDV
|
28
|
+
| ,--1.79--| ,-- seq2 SSIISNSFSRPTIIFSGCSTACSGK--SEQVCGFR---LSDV
|
29
|
+
| ,--1.07--| `--1.19--+-- seq3 SSIISNSFSRPTIIFSGCSTACSGKLTSEQVCGFR---LSDV
|
30
|
+
| | `--1.79--+--1.19----- seq5 ----------PTIIFSGCSKACSGKGLSELVCGFRSFMLSDV
|
31
|
+
| ,--1.31--| `--1.19----- seq8 ----------PTIIFSGCSKACSGK--SELVCGFRSFMLSAV
|
32
|
+
`--4.34--| `--4.05----------------------- seq4 ----PKLFSRPTIIFSGCSTACSGK--SEPVCGFRSFMLSDV
|
33
|
+
`--5.36-------------------------------- seq6 ----------PTIIFSGCSKACSGK-----FRSFRSFMLSAV
|
34
|
+
"""
|
35
|
+
Then draw MSA with the short tree
|
36
|
+
"""
|
37
|
+
,----------------- seq7 ----------PTIIFSGCSKACSGK-----VCGIFHAVRSFM
|
38
|
+
| ,----- seq1 ----SNSFSRPTIIFSGCSTACSGK--SELVCGFRSFMLSDV
|
39
|
+
| ,--| ,-- seq2 SSIISNSFSRPTIIFSGCSTACSGK--SEQVCGFR---LSDV
|
40
|
+
| ,--| `--+-- seq3 SSIISNSFSRPTIIFSGCSTACSGKLTSEQVCGFR---LSDV
|
41
|
+
| | `--+----- seq5 ----------PTIIFSGCSKACSGKGLSELVCGFRSFMLSDV
|
42
|
+
| ,--| `----- seq8 ----------PTIIFSGCSKACSGK--SELVCGFRSFMLSAV
|
43
|
+
`--| `----------- seq4 ----PKLFSRPTIIFSGCSTACSGK--SEPVCGFRSFMLSDV
|
44
|
+
`-------------- seq6 ----------PTIIFSGCSKACSGK-----FRSFRSFMLSAV
|
45
|
+
"""
|
data/lib/bio-alignment.rb
CHANGED
@@ -2,6 +2,9 @@
|
|
2
2
|
# bioruby directory tree.
|
3
3
|
#
|
4
4
|
|
5
|
-
require 'bio-alignment/
|
5
|
+
require 'bio-alignment/state'
|
6
|
+
require 'bio-alignment/elements'
|
6
7
|
require 'bio-alignment/sequence'
|
7
8
|
require 'bio-alignment/codonsequence'
|
9
|
+
require 'bio-alignment/tree'
|
10
|
+
require 'bio-alignment/alignment'
|
@@ -1,7 +1,8 @@
|
|
1
1
|
# Alignment
|
2
2
|
|
3
3
|
require 'bio-alignment/pal2nal'
|
4
|
-
require 'bio-alignment/
|
4
|
+
require 'bio-alignment/columns'
|
5
|
+
require 'bio-alignment/rows'
|
5
6
|
|
6
7
|
module Bio
|
7
8
|
|
@@ -10,6 +11,7 @@ module Bio
|
|
10
11
|
class Alignment
|
11
12
|
include Enumerable
|
12
13
|
include Pal2Nal
|
14
|
+
include Rows
|
13
15
|
include Columns
|
14
16
|
|
15
17
|
attr_accessor :sequences
|
@@ -17,16 +19,22 @@ module Bio
|
|
17
19
|
# Create alignment. seqs can be a list of sequences. If these
|
18
20
|
# are String types, they get converted to the library Sequence
|
19
21
|
# container
|
20
|
-
def initialize seqs = nil
|
22
|
+
def initialize seqs = nil, ids = nil
|
21
23
|
@sequences = []
|
22
24
|
if seqs
|
25
|
+
num = 0
|
23
26
|
seqs.each_with_index do | seq, i |
|
27
|
+
next if seq == nil or seq.to_s.strip == ""
|
28
|
+
id = num
|
29
|
+
id = ids[i] if ids and ids[i]
|
24
30
|
@sequences <<
|
25
31
|
if seq.kind_of?(String)
|
26
|
-
Sequence.new(
|
32
|
+
seq1 = Sequence.new(id,seq.strip)
|
33
|
+
seq1
|
27
34
|
else
|
28
35
|
seq
|
29
36
|
end
|
37
|
+
num += 1
|
30
38
|
end
|
31
39
|
end
|
32
40
|
end
|
@@ -39,8 +47,55 @@ module Bio
|
|
39
47
|
|
40
48
|
def each
|
41
49
|
rows.each { | seq | yield seq }
|
50
|
+
self
|
42
51
|
end
|
43
52
|
|
53
|
+
def each_element
|
54
|
+
each { |seq| seq.each { |e| yield e }}
|
55
|
+
self
|
56
|
+
end
|
57
|
+
|
58
|
+
def find name
|
59
|
+
each do | seq |
|
60
|
+
return seq if seq.id == name
|
61
|
+
end
|
62
|
+
raise "ERROR: Sequence not found by its name #{name}"
|
63
|
+
end
|
64
|
+
|
65
|
+
# copy alignment and allow updating elements
|
66
|
+
def update_each_element
|
67
|
+
aln = self.clone
|
68
|
+
aln.each { |seq| seq.each_with_index { |e,i| seq.seq[i] = yield e }}
|
69
|
+
end
|
70
|
+
|
71
|
+
def to_s
|
72
|
+
res = ""
|
73
|
+
res += "\t" + columns_to_s + "\n" if @columns
|
74
|
+
res += map{ |seq| seq.id.to_s + "\t" + seq.to_s }.join("\n")
|
75
|
+
res
|
76
|
+
end
|
77
|
+
|
78
|
+
# Return a deep cloned alignment. This method clones sequences,
|
79
|
+
# and the state objects
|
80
|
+
def clone
|
81
|
+
aln = super
|
82
|
+
# clone the sequences
|
83
|
+
aln.sequences = []
|
84
|
+
each do | seq |
|
85
|
+
aln.sequences << seq.clone
|
86
|
+
end
|
87
|
+
aln.clone_columns! if @columns
|
88
|
+
aln
|
89
|
+
end
|
90
|
+
|
91
|
+
# extend BioAlignment with Tree functionality - this method adds
|
92
|
+
# a tree and pulls in the functionality of the Tree module. Returns
|
93
|
+
# the tree traverser
|
94
|
+
def attach_tree tree
|
95
|
+
extend Tree
|
96
|
+
@tree = Tree::init(tree)
|
97
|
+
@tree
|
98
|
+
end
|
44
99
|
end
|
45
100
|
end
|
46
101
|
end
|
@@ -6,6 +6,9 @@ module Bio
|
|
6
6
|
|
7
7
|
# Codon element for the matrix, used by CodonSequence.
|
8
8
|
class Codon
|
9
|
+
GAP = '---'
|
10
|
+
UNDEFINED = 'X'
|
11
|
+
|
9
12
|
attr_reader :codon_table
|
10
13
|
|
11
14
|
def initialize codon, codon_table = 1
|
@@ -14,7 +17,7 @@ module Bio
|
|
14
17
|
end
|
15
18
|
|
16
19
|
def gap?
|
17
|
-
@codon ==
|
20
|
+
@codon == GAP
|
18
21
|
end
|
19
22
|
|
20
23
|
def undefined?
|
@@ -36,7 +39,7 @@ module Bio
|
|
36
39
|
if gap?
|
37
40
|
return '-'
|
38
41
|
elsif undefined?
|
39
|
-
return
|
42
|
+
return UNDEFINED
|
40
43
|
else
|
41
44
|
raise 'What?'
|
42
45
|
end
|
@@ -46,6 +49,7 @@ module Bio
|
|
46
49
|
|
47
50
|
private
|
48
51
|
|
52
|
+
# lazy translation of codon to amino acid
|
49
53
|
def translate
|
50
54
|
@aa ||= Bio::CodonTable[@codon_table][@codon]
|
51
55
|
@aa
|
@@ -95,6 +99,14 @@ module Bio
|
|
95
99
|
def to_aa
|
96
100
|
@seq.map { |codon| codon.to_aa }.join('')
|
97
101
|
end
|
102
|
+
|
103
|
+
def empty_copy
|
104
|
+
CodonSequence.new(@id,"")
|
105
|
+
end
|
106
|
+
|
107
|
+
def << codon
|
108
|
+
@seq << codon
|
109
|
+
end
|
98
110
|
|
99
111
|
end
|
100
112
|
|