bio-alignment 0.0.1.alpha → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +4 -4
- data/README.md +73 -4
- data/Rakefile +5 -6
- data/VERSION +1 -1
- data/doc/bio-alignment-design.md +101 -9
- data/features/codon-feature.rb +48 -0
- data/features/codon.feature +14 -0
- data/features/pal2nal-feature.rb +44 -0
- data/features/pal2nal.feature +10 -0
- data/lib/bio-alignment.rb +2 -0
- data/lib/bio-alignment/alignment.rb +22 -0
- data/lib/bio-alignment/codonsequence.rb +87 -0
- data/lib/bio-alignment/sequence.rb +39 -0
- data/spec/bio-alignment_spec.rb +27 -2
- data/test/data/fasta/codon/aa-alignment.fa +84 -0
- data/test/data/fasta/codon/codon-alignment.fa +264 -0
- data/test/data/fasta/codon/nt.fa +24 -0
- data/test/data/regression/aa-aln.fa +24 -0
- data/test/data/regression/nt-aln.fa +24 -0
- data/test/data/regression/pal2nal.fa +24 -0
- metadata +71 -27
data/Gemfile
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
source "http://rubygems.org"
|
2
|
-
|
3
|
-
#
|
4
|
-
# gem "activesupport", ">= 2.3.5"
|
2
|
+
gem "bio-logger"
|
3
|
+
gem "bio", ">= 1.4.2" # for translation tables
|
5
4
|
|
6
5
|
# Add dependencies to develop your gem here.
|
7
6
|
# Include everything needed to run rake, tests, features, etc.
|
8
7
|
group :development do
|
8
|
+
gem "bio-bigbio", "> 0.1.3" # for FASTA files in tests
|
9
|
+
gem "cucumber", ">= 0"
|
9
10
|
gem "rspec", "~> 2.3.0"
|
10
11
|
gem "bundler", "~> 1.0.0"
|
11
12
|
gem "jeweler", "~> 1.7.0"
|
12
|
-
gem "bio", ">= 1.4.2"
|
13
13
|
end
|
data/README.md
CHANGED
@@ -9,20 +9,89 @@ nucleotide quality score, codon annotation). The only requirement is
|
|
9
9
|
that the list is iterable and can be indexed.
|
10
10
|
|
11
11
|
This work is based on Pjotr's experience designing the BioScala
|
12
|
-
Alignment handler and BioRuby's PAML support.
|
13
|
-
|
12
|
+
Alignment handler and BioRuby's PAML support. Read the
|
13
|
+
Bio::BioAlignment
|
14
|
+
[design
|
15
|
+
document](https://github.com/pjotrp/bioruby-alignment/blob/master/doc/bio-alignment-design.md)
|
16
|
+
for Ruby.
|
14
17
|
|
15
18
|
Note: this software is under active development.
|
16
19
|
|
17
20
|
## Developers
|
18
21
|
|
19
|
-
|
22
|
+
### Codon alignment example
|
23
|
+
|
24
|
+
To use the library, load aligned sequences into the Alignment
|
25
|
+
matrix. Here we write an amino acid alignment from a codon
|
26
|
+
alignment (note codon gaps are represented by '---')
|
20
27
|
|
21
28
|
```ruby
|
22
29
|
require 'bio-alignment'
|
30
|
+
require 'bigbio' # Fasta reader and writer
|
31
|
+
|
32
|
+
aln = Alignment.new
|
33
|
+
fasta = FastaReader.new('codon-alignment.fa')
|
34
|
+
fasta.each do | rec |
|
35
|
+
aln.sequences << CodonSequence.new(rec.id, rec.seq)
|
36
|
+
end
|
37
|
+
# write a matching amino acid alignment
|
38
|
+
fasta = FastaWriter.new('aa-aln.fa')
|
39
|
+
aln.rows.each do | row |
|
40
|
+
fasta.write(row.id, row.to_aa.to_s)
|
41
|
+
end
|
42
|
+
```
|
43
|
+
|
44
|
+
### Pal2nal
|
45
|
+
|
46
|
+
A protein (amino acid) to nucleotide alignment would first load
|
47
|
+
the sequences
|
48
|
+
|
49
|
+
```ruby
|
50
|
+
aln1 = Alignment.new
|
51
|
+
fasta1 = FastaReader.new('aa-aln.fa')
|
52
|
+
fasta1.each do | rec |
|
53
|
+
aln1.sequences << Sequence.new(rec.id, rec.seq)
|
54
|
+
end
|
55
|
+
aln2 = Alignment.new
|
56
|
+
fasta2 = FastaReader.new('nt.fa')
|
57
|
+
fasta2.each do | rec |
|
58
|
+
aln2.sequences << Sequence.new(rec.id, rec.seq)
|
59
|
+
end
|
60
|
+
```
|
61
|
+
|
62
|
+
Writing a (simple) version of pal2nal would be something like
|
63
|
+
|
64
|
+
```ruby
|
65
|
+
fasta3 = FastaWriter.new('nt-aln.fa')
|
66
|
+
aln1.each_with_index do | aaseq, i |
|
67
|
+
ntseq = aln2.sequences[i]
|
68
|
+
aaseq.id.should == ntseq.id
|
69
|
+
codonseq = CodonSequence.new(ntseq.id, ntseq.seq)
|
70
|
+
codon_pos = 0
|
71
|
+
result = []
|
72
|
+
aaseq.each do | aa |
|
73
|
+
result <<
|
74
|
+
if aa.gap?
|
75
|
+
'---'
|
76
|
+
else
|
77
|
+
codon_pos += 1
|
78
|
+
codonseq[codon_pos-1].to_s
|
79
|
+
end
|
80
|
+
end
|
81
|
+
fasta3.write(aaseq.id, result.join(''))
|
82
|
+
end
|
83
|
+
```
|
84
|
+
|
85
|
+
With aln1 and aln2, the library version is the shorter
|
86
|
+
|
87
|
+
```ruby
|
88
|
+
aln3 = aln1.pal2nal(aln2)
|
89
|
+
fasta3 = FastaWriter.new('nt-aln.fa')
|
90
|
+
aln3.each { | rec | fasta3.write(rec) }
|
23
91
|
```
|
24
92
|
|
25
|
-
The API
|
93
|
+
The API documentation is online. For more code examples see ./spec/*.rb and
|
94
|
+
./features/*
|
26
95
|
|
27
96
|
## Cite
|
28
97
|
|
data/Rakefile
CHANGED
@@ -17,8 +17,8 @@ Jeweler::Tasks.new do |gem|
|
|
17
17
|
gem.name = "bio-alignment"
|
18
18
|
gem.homepage = "http://github.com/pjotrp/bioruby-alignment"
|
19
19
|
gem.license = "MIT"
|
20
|
-
gem.summary = %Q{
|
21
|
-
gem.description = %Q{Alignment handler for multiple sequence alignments
|
20
|
+
gem.summary = %Q{Support for multiple sequence alignments (MSA)}
|
21
|
+
gem.description = %Q{Alignment handler for multiple sequence alignments (MSA)}
|
22
22
|
gem.email = "pjotr.public01@thebird.nl"
|
23
23
|
gem.authors = ["Pjotr Prins"]
|
24
24
|
# dependencies defined in Gemfile
|
@@ -31,12 +31,11 @@ RSpec::Core::RakeTask.new(:spec) do |spec|
|
|
31
31
|
spec.pattern = FileList['spec/**/*_spec.rb']
|
32
32
|
end
|
33
33
|
|
34
|
-
|
35
|
-
|
36
|
-
spec.rcov = true
|
34
|
+
require 'cucumber/rake/task'
|
35
|
+
Cucumber::Rake::Task.new do |features|
|
37
36
|
end
|
38
37
|
|
39
|
-
task :default => :spec
|
38
|
+
task :default => [ :cucumber, :spec ]
|
40
39
|
|
41
40
|
require 'rdoc/task'
|
42
41
|
Rake::RDocTask.new do |rdoc|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.
|
1
|
+
0.0.2
|
data/doc/bio-alignment-design.md
CHANGED
@@ -10,8 +10,9 @@ items in the matrix (mostly because underlying sequences are String
|
|
10
10
|
based). This means a developer has to track information in multiple
|
11
11
|
places, for example a base pair quality score. This makes code complex
|
12
12
|
and therefore error prone. With bio-alignment elements of the matrix
|
13
|
-
can carry information.
|
14
|
-
the element
|
13
|
+
can carry information. So, when the alignment gets edited,
|
14
|
+
the element gets moved or deleted, and the information moves or
|
15
|
+
deletes along. For example,
|
15
16
|
say we have a nucleotide sequence with pay load
|
16
17
|
|
17
18
|
A G T A
|
@@ -19,14 +20,14 @@ say we have a nucleotide sequence with pay load
|
|
19
20
|
5 9 * 1
|
20
21
|
|
21
22
|
most library implementations will have two strings "AGTA" and "59*1".
|
22
|
-
Removing the third nucleodide would mean removing it twice, first
|
23
|
-
"AGA",
|
24
|
-
have one object for each element
|
25
|
-
payload of T is
|
23
|
+
Removing the third nucleotide would mean removing it twice, giving first
|
24
|
+
"AGA", and second "591". With bio-alignment this is one action because we
|
25
|
+
have one object for each element that contains both values, e.g. the
|
26
|
+
payload of 'T' is '*'. Moving 'T' automatically moves '*'.
|
26
27
|
|
27
|
-
In addition bio-alignment deals with codons and codon translation.
|
28
|
+
In addition the bio-alignment library deals with codons and codon translation.
|
28
29
|
Rather than track multiple matrices, the codon is viewed as an element,
|
29
|
-
and the translated codon as the pay load.
|
30
|
+
and the translated codon as the pay load. Again, when an alignment gets
|
30
31
|
reordered the code only has to do it in one place.
|
31
32
|
|
32
33
|
Likewise, an alignment column can have a pay load (e.g. quality score
|
@@ -36,6 +37,97 @@ matrix element, column, or row 'attributes'.
|
|
36
37
|
|
37
38
|
Many of these ideas came from my work on the [BioScala
|
38
39
|
project](https://github.com/pjotrp/bioscala/blob/master/doc/design.txt),
|
39
|
-
The BioScala library has the advantage of type
|
40
|
+
The BioScala library has the additional advantage of having type
|
41
|
+
safety throughout.
|
42
|
+
|
43
|
+
## Row or Sequence
|
44
|
+
|
45
|
+
Any sequence for an alignment is simply a list of objects. The
|
46
|
+
requirement is that the list should be enumerable and can be indexed. This means
|
47
|
+
it has to include Enumerable and provide 'each' and '[]' methods. CodonSequence
|
48
|
+
is a good example.
|
49
|
+
|
50
|
+
In addition, elements in the list should respond to certain properties (see
|
51
|
+
below).
|
52
|
+
|
53
|
+
```ruby
|
54
|
+
codons = CodonSequence.new(rec.id,rec.seq)
|
55
|
+
print codons.id
|
56
|
+
# get first codon
|
57
|
+
print codons.seq[0].to_s
|
58
|
+
```
|
59
|
+
|
60
|
+
where to_s is defined as part of the Sequence.
|
61
|
+
|
62
|
+
Normally, at the sequence level a pay load is possible. This can be a standard
|
63
|
+
attribute of the class. If a list of attributes exists in the
|
64
|
+
sequence object, it can be used. For Codons we can fetch the amino
|
65
|
+
acid with
|
66
|
+
|
67
|
+
```ruby
|
68
|
+
print codons.seq[0].to_aa
|
69
|
+
```
|
70
|
+
|
71
|
+
in fact, because Sequence is indexable we can write directly
|
72
|
+
|
73
|
+
```ruby
|
74
|
+
print codons[0].to_aa # 'M'
|
75
|
+
print codons[0].gap? # false
|
76
|
+
print codons[0].undefined? # false
|
77
|
+
```
|
78
|
+
|
79
|
+
and because CodonSequence is enumerable, and Codon has the to_aa method, we can
|
80
|
+
do a fancy
|
81
|
+
|
82
|
+
```ruby
|
83
|
+
aaseq = codons.map { | codon | codon.to_aa }.join("")
|
84
|
+
```
|
85
|
+
|
86
|
+
## Element
|
87
|
+
|
88
|
+
Elements in the list should respond to a gap? method, for an alignment
|
89
|
+
gap, and the undefined? method for a position that is either an
|
90
|
+
element or a gap. Also it should respond to the to_s method.
|
91
|
+
|
92
|
+
An element can contain any pay load. If a list of attributes exists
|
93
|
+
in the sequence object, it can be used.
|
94
|
+
|
95
|
+
## Column
|
96
|
+
|
97
|
+
The column list tracks the columns of the alignment. The requirement
|
98
|
+
is that it should be iterable and can be indexed. The Column contains
|
99
|
+
no elements, but may point to a list when the alignment is transposed.
|
100
|
+
|
101
|
+
## Matrix or MSA
|
102
|
+
|
103
|
+
The Matrix consists of a Column list, multiple Sequences, in turn
|
104
|
+
consisting of Elements. Accessing the matrix is by Sequence, followed
|
105
|
+
by Element.
|
106
|
+
|
107
|
+
```ruby
|
108
|
+
require 'bio-alignment'
|
109
|
+
require 'bigbio' # for the Fasta reader
|
110
|
+
include Bio::BioAlignment # Namespace
|
111
|
+
aln = Alignment.new
|
112
|
+
fasta = FastaReader.new('test/data/fasta/codon/codon-alignment.fa')
|
113
|
+
fasta.each do | rec |
|
114
|
+
aln.sequences << rec
|
115
|
+
end
|
116
|
+
```
|
117
|
+
|
118
|
+
note that MSA understands rec, as long as rec.id and rec.seq exist, and strings
|
119
|
+
(rec.seq is a String). Alternatively we can convert to a Codon sequence by
|
120
|
+
|
121
|
+
```ruby
|
122
|
+
fasta.each do | rec |
|
123
|
+
aln.sequences << CodonSequence.new(rec.id,rec.seq)
|
124
|
+
end
|
125
|
+
```
|
126
|
+
|
127
|
+
The Matrix can be accessed in transposed fashion, but accessing the normal
|
128
|
+
matrix and transposed matrix at the same time is not supported. Matrix is not
|
129
|
+
designed to be transaction safe - though you can copy the Matrix any time.
|
130
|
+
|
131
|
+
|
40
132
|
|
41
133
|
Copyright (C) 2012 Pjotr Prins <pjotr.prins@thebird.nl>
|
@@ -0,0 +1,48 @@
|
|
1
|
+
$: << 'lib'

require 'bio-alignment'
require 'bigbio'
include Bio::BioAlignment # Namespace

# Step definitions for features/codon.feature.
#
# The alignment under test is kept in @aln; every row is a CodonSequence
# built from one record of the codon alignment FASTA file.

# Load the codon alignment FASTA file into @aln, one CodonSequence per record.
Given /^I read an MSA nucleotide FASTA file in the test\/data folder$/ do
  @aln = Alignment.new
  fasta = FastaReader.new('test/data/fasta/codon/codon-alignment.fa')
  fasta.each do | rec |
    @aln.sequences << CodonSequence.new(rec.id, rec.seq)
  end
end

# Every row must carry an id. This is a real expectation: a bare
# `seq.id != nil` would be evaluated and thrown away, so the step could
# never fail.
Given /^I iterate the sequence records$/ do
  @aln.rows.each do | seq |
    seq.id.should_not be_nil
  end
end

# Intentionally empty: the checks live in the Then steps below.
When /^I check the alignment$/ do
end

Then /^it should contain codons$/ do
  # first sequence, first codon, translate
  @aln.rows.first[0].to_aa.should == "M"
end

Then /^it should translate to an amino acid MSA$/ do
  aaseq = @aln.rows.first.map { | codon | codon.to_aa }.join("")
  aaseq[0..15].should == 'MPTRLDIVGNLQFSSS'
end

Then /^it should write a nucleotide alignment$/ do
  # Writing is actually handled by a different library
  fasta = FastaWriter.new('test/data/regression/nt-aln.fa')
  @aln.rows.each do | row |
    fasta.write(row)
  end
end

Then /^it should write an amino acid alignment$/ do
  fasta = FastaWriter.new('test/data/regression/aa-aln.fa')
  @aln.rows.each do | row |
    # id plus the translated row, instead of the row object itself
    fasta.write(row.id, row.to_aa.to_s)
  end
end
|
48
|
+
|
@@ -0,0 +1,14 @@
|
|
1
|
+
Feature: Read codon file
|
2
|
+
In order to read codon files into a codon alignment
|
3
|
+
I want to read a multiple sequence alignment (MSA) nucleotide FASTA file and store it internally as codons
|
4
|
+
|
5
|
+
Scenario: Support basic FASTA codon MSA
|
6
|
+
Given I read an MSA nucleotide FASTA file in the test/data folder
|
7
|
+
And I iterate the sequence records
|
8
|
+
When I check the alignment
|
9
|
+
Then it should contain codons
|
10
|
+
And it should translate to an amino acid MSA
|
11
|
+
And it should write a nucleotide alignment
|
12
|
+
And it should write an amino acid alignment
|
13
|
+
|
14
|
+
|
@@ -0,0 +1,44 @@
|
|
1
|
+
$: << 'lib'

require 'bio-alignment'
require 'bigbio'
include Bio::BioAlignment # Namespace

# Step definitions for features/pal2nal.feature.
#
# The preamble above mirrors features/codon-feature.rb so that the
# unqualified Alignment / Sequence / CodonSequence names used below do not
# depend on another step file having included the namespace first.
#
# @aln holds the amino acid alignment, @aln2 the nucleotide sequences.

# Load the amino acid alignment into @aln, one Sequence per FASTA record.
Given /^I have an amino acid alignment$/ do
  @aln = Alignment.new
  fasta = FastaReader.new('test/data/fasta/codon/aa-alignment.fa')
  fasta.each do | rec |
    @aln.sequences << Sequence.new(rec.id, rec.seq)
  end
end

# Load the nucleotide sequences into @aln2, one Sequence per FASTA record.
Given /^I have matching nucleotide sequences$/ do
  @aln2 = Alignment.new
  fasta = FastaReader.new('test/data/fasta/codon/nt.fa')
  fasta.each do | rec |
    @aln2.sequences << Sequence.new(rec.id, rec.seq)
  end
end

# Hand-written pal2nal: rows of @aln and @aln2 are paired by position and
# must have the same id. For every amino acid position a gap becomes '---',
# anything else consumes the next codon of the nucleotide sequence.
Then /^I should be able to generate a codon alignment$/ do
  fasta = FastaWriter.new('test/data/regression/pal2nal.fa')
  @aln.each_with_index do | aaseq, i |
    ntseq = @aln2.sequences[i]
    aaseq.id.should == ntseq.id
    codonseq = CodonSequence.new(ntseq.id, ntseq.seq)

    codon_pos = 0 # number of codons consumed so far
    result = []
    aaseq.each do | aa |
      result <<
        if aa.gap?
          '---'
        else
          codon_pos += 1
          codonseq[codon_pos-1].to_s
        end
    end
    fasta.write(aaseq.id, result.join(''))
  end
end

Then /^I should be able to generate a codon alignment directly with pal2nal$/ do
  # pal2nal = @aln.pal2nal(@aln2)
  pending
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
Feature: pal2nal
|
2
|
+
pal2nal takes a protein (amino acid) alignment and a set of nucleotide
|
3
|
+
sequences and generates a codon alignment based on those
|
4
|
+
|
5
|
+
Scenario: Convert pal2nal
|
6
|
+
Given I have an amino acid alignment
|
7
|
+
And I have matching nucleotide sequences
|
8
|
+
Then I should be able to generate a codon alignment
|
9
|
+
Then I should be able to generate a codon alignment directly with pal2nal
|
10
|
+
|
data/lib/bio-alignment.rb
CHANGED
@@ -0,0 +1,22 @@
|
|
1
|
+
# Alignment
|
2
|
+
|
3
|
+
module Bio
|
4
|
+
module BioAlignment
|
5
|
+
|
6
|
+
# A multiple sequence alignment (MSA) held as an ordered list of rows.
#
# Rows live in a plain Array and are yielded back unchanged by #each, so
# this class itself places no requirement on what a row object is.
class Alignment
  include Enumerable

  # The rows of the alignment, in insertion order.
  attr_accessor :sequences

  # Creates an empty alignment.
  def initialize
    @sequences = []
  end

  # +rows+ is a second name for the +sequences+ reader (reader only).
  alias rows sequences

  # Yields every row in order and returns the underlying Array of rows.
  #
  # Without a block an Enumerator is returned instead of raising
  # LocalJumpError, so external iteration and chaining such as
  # +aln.each.with_index+ work.
  def each
    return to_enum(:each) unless block_given?

    rows.each { | seq | yield seq }
  end
end
|
21
|
+
end
|
22
|
+
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
|
2
|
+
require 'bio'
|
3
|
+
|
4
|
+
module Bio
|
5
|
+
module BioAlignment
|
6
|
+
|
7
|
+
CODON_TABLE = Bio::CodonTable[1] # BioRuby Eukaryote table
|
8
|
+
|
9
|
+
# Codon element for the matrix.
#
# Wraps a single codon string and translates it on demand through
# CODON_TABLE, the constant defined alongside this class.
class Codon
  # codon:: the codon text; '---' denotes an alignment gap
  def initialize(codon)
    @codon = codon
  end

  # True when this element is an alignment gap ('---').
  def gap?
    @codon == '---'
  end

  # True when the codon is not a gap and CODON_TABLE has no entry for it.
  def undefined?
    !gap? && CODON_TABLE[@codon].nil?
  end

  # The codon text exactly as it was given to the constructor.
  def to_s
    @codon
  end

  # Lazily convert to amino acid (once only).
  #
  # Returns the CODON_TABLE entry when there is one, '-' for a gap and 'X'
  # for any other codon the table does not know. The fallback is cached as
  # well, so gaps and unknown codons do not repeat the table lookup on
  # every call.
  def to_aa
    @aa ||= CODON_TABLE[@codon] || (gap? ? '-' : 'X')
  end
end
|
46
|
+
|
47
|
+
# A CodonSequence supports the concept of codons (triple
# nucleotides) for an alignment.
#
# The sequence text is cut into consecutive, non-overlapping runs of three
# non-whitespace characters, each wrapped in a Codon. Characters that do
# not complete such a triplet are not stored.
class CodonSequence
  include Enumerable

  # id::  the identifier passed to the constructor
  # seq:: Array of Codon elements
  attr_reader :id, :seq

  # id::  sequence identifier, stored as given
  # seq:: String to be split into codons
  def initialize(id, seq)
    @id = id
    @seq = seq.scan(/\S\S\S/).map { | codon | Codon.new(codon) }
  end

  # Returns the codon at +index+ (plain Array indexing on the codon list).
  def [](index)
    @seq[index]
  end

  # Yields each Codon in order and returns the codon Array.
  #
  # Without a block an Enumerator is returned instead of raising
  # LocalJumpError.
  def each
    return to_enum(:each) unless block_given?

    @seq.each { | codon | yield codon }
  end

  # Codons separated by single spaces, e.g. "atg --- ccc".
  def to_s
    @seq.map(&:to_s).join(' ')
  end

  # extra methods

  # Codons joined without a separator.
  def to_nt
    @seq.map(&:to_s).join('')
  end

  # Each codon's to_aa result, joined into one String.
  def to_aa
    @seq.map(&:to_aa).join('')
  end
end
|
85
|
+
|
86
|
+
end
|
87
|
+
end
|