bio-alignment 0.0.1.alpha → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +4 -4
- data/README.md +73 -4
- data/Rakefile +5 -6
- data/VERSION +1 -1
- data/doc/bio-alignment-design.md +101 -9
- data/features/codon-feature.rb +48 -0
- data/features/codon.feature +14 -0
- data/features/pal2nal-feature.rb +44 -0
- data/features/pal2nal.feature +10 -0
- data/lib/bio-alignment.rb +2 -0
- data/lib/bio-alignment/alignment.rb +22 -0
- data/lib/bio-alignment/codonsequence.rb +87 -0
- data/lib/bio-alignment/sequence.rb +39 -0
- data/spec/bio-alignment_spec.rb +27 -2
- data/test/data/fasta/codon/aa-alignment.fa +84 -0
- data/test/data/fasta/codon/codon-alignment.fa +264 -0
- data/test/data/fasta/codon/nt.fa +24 -0
- data/test/data/regression/aa-aln.fa +24 -0
- data/test/data/regression/nt-aln.fa +24 -0
- data/test/data/regression/pal2nal.fa +24 -0
- metadata +71 -27
data/Gemfile
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
source "http://rubygems.org"
|
2
|
-
|
3
|
-
#
|
4
|
-
# gem "activesupport", ">= 2.3.5"
|
2
|
+
gem "bio-logger"
|
3
|
+
gem "bio", ">= 1.4.2" # for translation tables
|
5
4
|
|
6
5
|
# Add dependencies to develop your gem here.
|
7
6
|
# Include everything needed to run rake, tests, features, etc.
|
8
7
|
group :development do
|
8
|
+
gem "bio-bigbio", "> 0.1.3" # for FASTA files in tests
|
9
|
+
gem "cucumber", ">= 0"
|
9
10
|
gem "rspec", "~> 2.3.0"
|
10
11
|
gem "bundler", "~> 1.0.0"
|
11
12
|
gem "jeweler", "~> 1.7.0"
|
12
|
-
gem "bio", ">= 1.4.2"
|
13
13
|
end
|
data/README.md
CHANGED
@@ -9,20 +9,89 @@ nucleotide quality score, codon annotation). The only requirement is
|
|
9
9
|
that the list is iterable and can be indexed.
|
10
10
|
|
11
11
|
This work is based on Pjotr's experience designing the BioScala
|
12
|
-
Alignment handler and BioRuby's PAML support.
|
13
|
-
|
12
|
+
Alignment handler and BioRuby's PAML support. Read the
|
13
|
+
Bio::BioAlignment
|
14
|
+
[design
|
15
|
+
document](https://github.com/pjotrp/bioruby-alignment/blob/master/doc/bio-alignment-design.md)
|
16
|
+
for Ruby.
|
14
17
|
|
15
18
|
Note: this software is under active development.
|
16
19
|
|
17
20
|
## Developers
|
18
21
|
|
19
|
-
|
22
|
+
### Codon alignment example
|
23
|
+
|
24
|
+
To use the library, load aligned sequences into the Alignment
|
25
|
+
matrix. Here we write an amino acid alignment from a codon
|
26
|
+
alignment (note codon gaps are represented by '---')
|
20
27
|
|
21
28
|
```ruby
|
22
29
|
require 'bio-alignment'
|
30
|
+
require 'bigbio' # Fasta reader and writer
|
31
|
+
|
32
|
+
aln = Alignment.new
|
33
|
+
fasta = FastaReader.new('codon-alignment.fa')
|
34
|
+
fasta.each do | rec |
|
35
|
+
aln.sequences << CodonSequence.new(rec.id, rec.seq)
|
36
|
+
end
|
37
|
+
# write a matching amino acid alignment
|
38
|
+
fasta = FastaWriter.new('aa-aln.fa')
|
39
|
+
aln.rows.each do | row |
|
40
|
+
fasta.write(row.id, row.to_aa.to_s)
|
41
|
+
end
|
42
|
+
```
|
43
|
+
|
44
|
+
### Pal2nal
|
45
|
+
|
46
|
+
A protein (amino acid) to nucleotide alignment would first load
|
47
|
+
the sequences
|
48
|
+
|
49
|
+
```ruby
|
50
|
+
aln1 = Alignment.new
|
51
|
+
fasta1 = FastaWriter.new('aa-aln.fa')
|
52
|
+
aln1.rows.each do | row |
|
53
|
+
fasta1.write(row.id, row.to_aa.to_s)
|
54
|
+
end
|
55
|
+
aln2 = Alignment.new
|
56
|
+
fasta2 = FastaReader.new('nt.fa')
|
57
|
+
fasta2.each do | rec |
|
58
|
+
aln2.sequences << Sequence.new(rec.id, rec.seq)
|
59
|
+
end
|
60
|
+
```
|
61
|
+
|
62
|
+
Writing a (simple) version of pal2nal would be something like
|
63
|
+
|
64
|
+
```ruby
|
65
|
+
fasta3 = FastaWriter.new('nt-aln.fa')
|
66
|
+
aln.each_with_index do | aaseq, i |
|
67
|
+
ntseq = aln2.sequences[i]
|
68
|
+
aaseq.id.should == ntseq.id
|
69
|
+
codonseq = CodonSequence.new(ntseq.id, ntseq.seq)
|
70
|
+
codon_pos = 0
|
71
|
+
result = []
|
72
|
+
aaseq.each do | aa |
|
73
|
+
result <<
|
74
|
+
if aa.gap?
|
75
|
+
'---'
|
76
|
+
else
|
77
|
+
codon_pos += 1
|
78
|
+
codonseq[codon_pos-1].to_s
|
79
|
+
end
|
80
|
+
end
|
81
|
+
fasta3.write(aaseq.id, result.join(''))
|
82
|
+
end
|
83
|
+
```
|
84
|
+
|
85
|
+
With aln1 and aln2, the library version is the shorter
|
86
|
+
|
87
|
+
```ruby
|
88
|
+
aln3 = aln1.pal2nal(aln2)
|
89
|
+
fasta3 = FastaWriter.new('nt-aln.fa')
|
90
|
+
aln3.each { | rec | fasta3.write(rec) }
|
23
91
|
```
|
24
92
|
|
25
|
-
The API
|
93
|
+
The API documentation is online. For more code examples see ./spec/*.rb and
|
94
|
+
./features/*
|
26
95
|
|
27
96
|
## Cite
|
28
97
|
|
data/Rakefile
CHANGED
@@ -17,8 +17,8 @@ Jeweler::Tasks.new do |gem|
|
|
17
17
|
gem.name = "bio-alignment"
|
18
18
|
gem.homepage = "http://github.com/pjotrp/bioruby-alignment"
|
19
19
|
gem.license = "MIT"
|
20
|
-
gem.summary = %Q{
|
21
|
-
gem.description = %Q{Alignment handler for multiple sequence alignments
|
20
|
+
gem.summary = %Q{Support for multiple sequence alignments (MSA)}
|
21
|
+
gem.description = %Q{Alignment handler for multiple sequence alignments (MSA)}
|
22
22
|
gem.email = "pjotr.public01@thebird.nl"
|
23
23
|
gem.authors = ["Pjotr Prins"]
|
24
24
|
# dependencies defined in Gemfile
|
@@ -31,12 +31,11 @@ RSpec::Core::RakeTask.new(:spec) do |spec|
|
|
31
31
|
spec.pattern = FileList['spec/**/*_spec.rb']
|
32
32
|
end
|
33
33
|
|
34
|
-
|
35
|
-
|
36
|
-
spec.rcov = true
|
34
|
+
require 'cucumber/rake/task'
|
35
|
+
Cucumber::Rake::Task.new do |features|
|
37
36
|
end
|
38
37
|
|
39
|
-
task :default => :spec
|
38
|
+
task :default => [ :cucumber, :spec ]
|
40
39
|
|
41
40
|
require 'rdoc/task'
|
42
41
|
Rake::RDocTask.new do |rdoc|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.
|
1
|
+
0.0.2
|
data/doc/bio-alignment-design.md
CHANGED
@@ -10,8 +10,9 @@ items in the matrix (mostly because underlying sequences are String
|
|
10
10
|
based). This means a developer has to track information in multiple
|
11
11
|
places, for example a base pair quality score. This makes code complex
|
12
12
|
and therefore error prone. With bio-alignment elements of the matrix
|
13
|
-
can carry information.
|
14
|
-
the element
|
13
|
+
can carry information. So, when the alignment gets edited,
|
14
|
+
the element gets moved or deleted, and the information moves or
|
15
|
+
is deleted along with it. For example,
|
15
16
|
say we have a nucleotide sequence with pay load
|
16
17
|
|
17
18
|
A G T A
|
@@ -19,14 +20,14 @@ say we have a nucleotide sequence with pay load
|
|
19
20
|
5 9 * 1
|
20
21
|
|
21
22
|
most library implementations will have two strings "AGTA" and "59*1".
|
22
|
-
Removing the third nucleodide would mean removing it twice, first
|
23
|
-
"AGA",
|
24
|
-
have one object for each element
|
25
|
-
payload of T is
|
23
|
+
Removing the third nucleotide would mean removing it twice, resulting first in
|
24
|
+
"AGA", and second "591". With bio-alignment this is one action because we
|
25
|
+
have one object for each element that contains both values, e.g. the
|
26
|
+
payload of 'T' is '*'. Moving 'T' automatically moves '*'.
|
26
27
|
|
27
|
-
In addition bio-alignment deals with codons and codon translation.
|
28
|
+
In addition the bio-alignment library deals with codons and codon translation.
|
28
29
|
Rather than track multiple matrices, the codon is viewed as an element,
|
29
|
-
and the translated codon as the pay load.
|
30
|
+
and the translated codon as the pay load. Again, when an alignment gets
|
30
31
|
reordered the code only has to do it in one place.
|
31
32
|
|
32
33
|
Likewise, an alignment column can have a pay load (e.g. quality score
|
@@ -36,6 +37,97 @@ matrix element, column, or row 'attributes'.
|
|
36
37
|
|
37
38
|
Many of these ideas came from my work on the [BioScala
|
38
39
|
project](https://github.com/pjotrp/bioscala/blob/master/doc/design.txt),
|
39
|
-
The BioScala library has the advantage of type
|
40
|
+
The BioScala library has the additional advantage of having type
|
41
|
+
safety throughout.
|
42
|
+
|
43
|
+
## Row or Sequence
|
44
|
+
|
45
|
+
Any sequence for an alignment is simply a list of objects. The
|
46
|
+
requirement is that the list should be enumerable and can be indexed. This means
|
47
|
+
it has to include Enumerable and provide 'each' and '[]' methods. CodonSequence
|
48
|
+
is a good example.
|
49
|
+
|
50
|
+
In addition, elements in the list should respond to certain properties (see
|
51
|
+
below).
|
52
|
+
|
53
|
+
```ruby
|
54
|
+
codons = CodonSequence.new(rec.id,rec.seq)
|
55
|
+
print codons.id
|
56
|
+
# get first codon
|
57
|
+
print codons.seq[0].to_s
|
58
|
+
```
|
59
|
+
|
60
|
+
where to_s is defined as part of the Sequence.
|
61
|
+
|
62
|
+
Normally, at the sequence level a pay load is possible. This can be a standard
|
63
|
+
attribute of the class. If a list of attributes exists in the
|
64
|
+
sequence object, it can be used. For Codons we can fetch the amino
|
65
|
+
acid with
|
66
|
+
|
67
|
+
```ruby
|
68
|
+
print codons.seq[0].to_aa
|
69
|
+
```
|
70
|
+
|
71
|
+
in fact, because Sequence is indexable we can write directly
|
72
|
+
|
73
|
+
```ruby
|
74
|
+
print codons[0].to_aa # 'M'
|
75
|
+
print codons[0].gap? # false
|
76
|
+
print codons[0].undefined? # false
|
77
|
+
```
|
78
|
+
|
79
|
+
and because CodonSequence is enumerable, and Codon has the to_aa method, we can
|
80
|
+
do a fancy
|
81
|
+
|
82
|
+
```ruby
|
83
|
+
aaseq = codons.map { | codon | codon.to_aa }.join("")
|
84
|
+
```
|
85
|
+
|
86
|
+
## Element
|
87
|
+
|
88
|
+
Elements in the list should respond to a gap? method, for an alignment
|
89
|
+
gap, and the undefined? method for a position that is either an
|
90
|
+
element or a gap. Also it should respond to the to_s method.
|
91
|
+
|
92
|
+
An element can contain any pay load. If a list of attributes exists
|
93
|
+
in the sequence object, it can be used.
|
94
|
+
|
95
|
+
## Column
|
96
|
+
|
97
|
+
The column list tracks the columns of the alignment. The requirement
|
98
|
+
is that it should be iterable and can be indexed. The Column contains
|
99
|
+
no elements, but may point to a list when the alignment is transposed.
|
100
|
+
|
101
|
+
## Matrix or MSA
|
102
|
+
|
103
|
+
The Matrix consists of a Column list, multiple Sequences, in turn
|
104
|
+
consisting of Elements. Accessing the matrix is by Sequence, followed
|
105
|
+
by Element.
|
106
|
+
|
107
|
+
```ruby
|
108
|
+
require 'bio-alignment'
|
109
|
+
require 'bigbio' # for the Fasta reader
|
110
|
+
include Bio::BioAlignment # Namespace
|
111
|
+
aln = Alignment.new
|
112
|
+
fasta = FastaReader.new('test/data/fasta/codon/codon-alignment.fa')
|
113
|
+
fasta.each do | rec |
|
114
|
+
aln.sequences << rec
|
115
|
+
end
|
116
|
+
```
|
117
|
+
|
118
|
+
note that MSA understands rec, as long as rec.id and rec.seq exist and are strings
|
119
|
+
(rec.seq is a String). Alternatively we can convert to a Codon sequence by
|
120
|
+
|
121
|
+
```ruby
|
122
|
+
fasta.each do | rec |
|
123
|
+
aln.sequences << CodonSequence.new(rec.id,rec.seq)
|
124
|
+
end
|
125
|
+
```
|
126
|
+
|
127
|
+
The Matrix can be accessed in transposed fashion, but accessing the normal
|
128
|
+
matrix and transposed matrix at the same time is not supported. Matrix is not
|
129
|
+
designed to be transaction safe - though you can copy the Matrix any time.
|
130
|
+
|
131
|
+
|
40
132
|
|
41
133
|
Copyright (C) 2012 Pjotr Prins <pjotr.prins@thebird.nl>
|
@@ -0,0 +1,48 @@
|
|
1
|
+
$: << 'lib'
|
2
|
+
|
3
|
+
require 'bio-alignment'
|
4
|
+
require 'bigbio'
|
5
|
+
include Bio::BioAlignment # Namespace
|
6
|
+
|
7
|
+
Given /^I read an MSA nucleotide FASTA file in the test\/data folder$/ do
|
8
|
+
@aln = Alignment.new
|
9
|
+
fasta = FastaReader.new('test/data/fasta/codon/codon-alignment.fa')
|
10
|
+
fasta.each do | rec |
|
11
|
+
@aln.sequences << CodonSequence.new(rec.id, rec.seq)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
Given /^I iterate the sequence records$/ do
|
16
|
+
@aln.rows.each do | seq |
|
17
|
+
seq.id != nil
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
When /^I check the alignment$/ do
|
22
|
+
end
|
23
|
+
|
24
|
+
Then /^it should contain codons$/ do
|
25
|
+
# first sequence, first codon, translate
|
26
|
+
@aln.rows.first[0].to_aa.should == "M"
|
27
|
+
end
|
28
|
+
|
29
|
+
Then /^it should translate to an amino acid MSA$/ do
|
30
|
+
aaseq = @aln.rows.first.map { | codon | codon.to_aa }.join("")
|
31
|
+
aaseq[0..15].should == 'MPTRLDIVGNLQFSSS'
|
32
|
+
end
|
33
|
+
|
34
|
+
Then /^it should write a nucleotide alignment$/ do
|
35
|
+
# Writing is actually handled by a different library
|
36
|
+
fasta = FastaWriter.new('test/data/regression/nt-aln.fa')
|
37
|
+
@aln.rows.each do | row |
|
38
|
+
fasta.write(row)
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
Then /^it should write an amino acid alignment$/ do
|
43
|
+
fasta = FastaWriter.new('test/data/regression/aa-aln.fa')
|
44
|
+
@aln.rows.each do | row |
|
45
|
+
fasta.write(row.id, row.to_aa.to_s)
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
@@ -0,0 +1,14 @@
|
|
1
|
+
Feature: Read codon file
|
2
|
+
In order to read codon files into a codon alignment
|
3
|
+
I want to read a multiple sequence alignment (MSA) nucleotide FASTA file and store it internally as codons
|
4
|
+
|
5
|
+
Scenario: Support basic FASTA codon MSA
|
6
|
+
Given I read an MSA nucleotide FASTA file in the test/data folder
|
7
|
+
And I iterate the sequence records
|
8
|
+
When I check the alignment
|
9
|
+
Then it should contain codons
|
10
|
+
And it should translate to an amino acid MSA
|
11
|
+
And it should write a nucleotide alignment
|
12
|
+
And it should write an amino acid alignment
|
13
|
+
|
14
|
+
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'bigbio'
|
2
|
+
|
3
|
+
Given /^I have an amino acid alignment$/ do
|
4
|
+
@aln = Alignment.new
|
5
|
+
fasta = FastaReader.new('test/data/fasta/codon/aa-alignment.fa')
|
6
|
+
fasta.each do | rec |
|
7
|
+
@aln.sequences << Sequence.new(rec.id, rec.seq)
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
Given /^I have matching nucleotide sequences$/ do
|
12
|
+
@aln2 = Alignment.new
|
13
|
+
fasta = FastaReader.new('test/data/fasta/codon/nt.fa')
|
14
|
+
fasta.each do | rec |
|
15
|
+
@aln2.sequences << Sequence.new(rec.id, rec.seq)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
Then /^I should be able to generate a codon alignment$/ do
|
20
|
+
fasta = FastaWriter.new('test/data/regression/pal2nal.fa')
|
21
|
+
@aln.each_with_index do | aaseq, i |
|
22
|
+
ntseq = @aln2.sequences[i]
|
23
|
+
aaseq.id.should == ntseq.id
|
24
|
+
codonseq = CodonSequence.new(ntseq.id, ntseq.seq)
|
25
|
+
|
26
|
+
codon_pos = 0
|
27
|
+
result = []
|
28
|
+
aaseq.each do | aa |
|
29
|
+
result <<
|
30
|
+
if aa.gap?
|
31
|
+
'---'
|
32
|
+
else
|
33
|
+
codon_pos += 1
|
34
|
+
codonseq[codon_pos-1].to_s
|
35
|
+
end
|
36
|
+
end
|
37
|
+
fasta.write(aaseq.id, result.join(''))
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
Then /^I should be able to generate a codon alignment directly with pal2nal$/ do
|
42
|
+
# pal2nal = @aln.pal2nal(@aln1)
|
43
|
+
pending
|
44
|
+
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
Feature: pal2nal
|
2
|
+
pal2nal takes a protein (amino acid) alignment and a set of nucleotide
|
3
|
+
sequences and generates a codon alignment based on those
|
4
|
+
|
5
|
+
Scenario: Convert pal2nal
|
6
|
+
Given I have an amino acid alignment
|
7
|
+
And I have matching nucleotide sequences
|
8
|
+
Then I should be able to generate a codon alignment
|
9
|
+
Then I should be able to generate a codon alignment directly with pal2nal
|
10
|
+
|
data/lib/bio-alignment.rb
CHANGED
@@ -0,0 +1,22 @@
|
|
1
|
+
# Alignment
|
2
|
+
|
3
|
+
module Bio
|
4
|
+
module BioAlignment
|
5
|
+
|
6
|
+
class Alignment
|
7
|
+
include Enumerable
|
8
|
+
|
9
|
+
attr_accessor :sequences
|
10
|
+
|
11
|
+
def initialize
|
12
|
+
@sequences = []
|
13
|
+
end
|
14
|
+
|
15
|
+
alias rows sequences
|
16
|
+
|
17
|
+
def each
|
18
|
+
rows.each { | seq | yield seq }
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
|
2
|
+
require 'bio'
|
3
|
+
|
4
|
+
module Bio
|
5
|
+
module BioAlignment
|
6
|
+
|
7
|
+
CODON_TABLE = Bio::CodonTable[1] # BioRuby Eukaryote table
|
8
|
+
|
9
|
+
# Codon element for the matrix
|
10
|
+
class Codon
|
11
|
+
def initialize codon
|
12
|
+
@codon = codon
|
13
|
+
end
|
14
|
+
|
15
|
+
def gap?
|
16
|
+
@codon == '---'
|
17
|
+
end
|
18
|
+
|
19
|
+
def undefined?
|
20
|
+
aa = CODON_TABLE[@codon]
|
21
|
+
if aa == nil and not gap?
|
22
|
+
return true
|
23
|
+
end
|
24
|
+
false
|
25
|
+
end
|
26
|
+
|
27
|
+
def to_s
|
28
|
+
@codon
|
29
|
+
end
|
30
|
+
|
31
|
+
# lazily convert to Amino acid (once only)
|
32
|
+
def to_aa
|
33
|
+
@aa ||= CODON_TABLE[@codon]
|
34
|
+
if not @aa
|
35
|
+
if gap?
|
36
|
+
return '-'
|
37
|
+
elsif undefined?
|
38
|
+
return 'X'
|
39
|
+
else
|
40
|
+
raise 'What?'
|
41
|
+
end
|
42
|
+
end
|
43
|
+
@aa
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
# A CodonSequence supports the concept of codons (triple
|
48
|
+
# nucleotides) for an alignment
|
49
|
+
#
|
50
|
+
class CodonSequence
|
51
|
+
include Enumerable
|
52
|
+
|
53
|
+
attr_reader :id, :seq
|
54
|
+
def initialize id, seq
|
55
|
+
@id = id
|
56
|
+
@seq = []
|
57
|
+
seq.scan(/\S\S\S/).each do | codon |
|
58
|
+
@seq << Codon.new(codon)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
def [] index
|
63
|
+
@seq[index]
|
64
|
+
end
|
65
|
+
|
66
|
+
def each
|
67
|
+
@seq.each { | codon | yield codon }
|
68
|
+
end
|
69
|
+
|
70
|
+
def to_s
|
71
|
+
@seq.map { |codon| codon.to_s }.join(' ')
|
72
|
+
end
|
73
|
+
|
74
|
+
# extra methods
|
75
|
+
|
76
|
+
def to_nt
|
77
|
+
@seq.map { |codon| codon.to_s }.join('')
|
78
|
+
end
|
79
|
+
|
80
|
+
def to_aa
|
81
|
+
@seq.map { |codon| codon.to_aa }.join('')
|
82
|
+
end
|
83
|
+
|
84
|
+
end
|
85
|
+
|
86
|
+
end
|
87
|
+
end
|