bio-alignment 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +5 -4
- data/README.md +94 -9
- data/Rakefile +2 -1
- data/VERSION +1 -1
- data/doc/bio-alignment-design.md +75 -11
- data/features/bioruby-feature.rb +17 -0
- data/features/bioruby.feature +6 -1
- data/features/columns-feature.rb +2 -0
- data/features/edit/del_bridges-feature.rb +7 -3
- data/features/edit/del_bridges.feature +1 -2
- data/features/edit/del_non_informative_sequences-feature.rb +26 -0
- data/features/edit/del_non_informative_sequences.feature +19 -0
- data/features/edit/del_short_sequences-feature.rb +21 -0
- data/features/edit/del_short_sequences.feature +25 -0
- data/features/edit/gblocks-feature.rb +2 -2
- data/features/edit/mask_islands-feature.rb +17 -4
- data/features/edit/mask_islands.feature +28 -17
- data/features/edit/mask_serial_mutations-feature.rb +8 -6
- data/features/edit/mask_serial_mutations.feature +11 -11
- data/features/tree-feature.rb +66 -0
- data/features/tree.feature +45 -0
- data/lib/bio-alignment.rb +4 -1
- data/lib/bio-alignment/alignment.rb +58 -3
- data/lib/bio-alignment/codonsequence.rb +14 -2
- data/lib/bio-alignment/columns.rb +102 -0
- data/lib/bio-alignment/edit/del_bridges.rb +18 -1
- data/lib/bio-alignment/edit/del_non_informative_sequences.rb +27 -0
- data/lib/bio-alignment/edit/del_short_sequences.rb +28 -0
- data/lib/bio-alignment/edit/edit_columns.rb +22 -0
- data/lib/bio-alignment/edit/edit_rows.rb +49 -0
- data/lib/bio-alignment/edit/mask_islands.rb +115 -0
- data/lib/bio-alignment/edit/mask_serial_mutations.rb +44 -0
- data/lib/bio-alignment/elements.rb +86 -0
- data/lib/bio-alignment/rows.rb +52 -0
- data/lib/bio-alignment/sequence.rb +20 -14
- data/lib/bio-alignment/state.rb +64 -8
- data/lib/bio-alignment/tree.rb +77 -0
- data/spec/bio-alignment_spec.rb +57 -1
- data/spec/spec_helper.rb +3 -3
- metadata +47 -22
- data/lib/bio-alignment/column.rb +0 -47
data/Gemfile
CHANGED
@@ -1,13 +1,14 @@
|
|
1
1
|
source "http://rubygems.org"
|
2
2
|
gem "bio-logger"
|
3
|
-
gem "bio", ">= 1.4.2" # for translation tables
|
3
|
+
gem "bio", ">= 1.4.2" # for translation tables, BioRuby compat and Newick parser
|
4
4
|
|
5
5
|
# Add dependencies to develop your gem here.
|
6
6
|
# Include everything needed to run rake, tests, features, etc.
|
7
7
|
group :development do
|
8
|
-
gem "
|
8
|
+
gem "rake"
|
9
|
+
gem "bio-bigbio", "> 0.1.3" # for reading FASTA files in tests
|
9
10
|
gem "cucumber", ">= 0"
|
10
11
|
gem "rspec", "~> 2.3.0"
|
11
|
-
gem "bundler", "
|
12
|
-
gem "jeweler"
|
12
|
+
gem "bundler", ">= 1.0.21"
|
13
|
+
gem "jeweler"
|
13
14
|
end
|
data/README.md
CHANGED
@@ -29,6 +29,7 @@ aligmment (note codon gaps are represented by '---')
|
|
29
29
|
require 'bio-alignment'
|
30
30
|
require 'bigbio' # Fasta reader and writer
|
31
31
|
|
32
|
+
include Bio::BioAlignment
|
32
33
|
aln = Alignment.new
|
33
34
|
fasta = FastaReader.new('codon-alignment.fa')
|
34
35
|
fasta.each do | rec |
|
@@ -81,11 +82,13 @@ BioAlignment supports adding BioRuby's Bio::Sequence objects:
|
|
81
82
|
|
82
83
|
```ruby
|
83
84
|
require 'bio' # BioRuby
|
85
|
+
require 'bio-alignment'
|
84
86
|
require 'bio-alignment/bioruby' # make Bio::Sequence enumerable
|
85
|
-
|
87
|
+
include Bio::BioAlignment
|
88
|
+
|
86
89
|
aln = Alignment.new
|
87
|
-
aln << Bio::Sequence::NA.new("atgcatgcaaaa")
|
88
|
-
aln << Bio::Sequence::NA.new("atg---tcaaaa")
|
90
|
+
aln.sequences << Bio::Sequence::NA.new("atgcatgcaaaa")
|
91
|
+
aln.sequences << Bio::Sequence::NA.new("atg---tcaaaa")
|
89
92
|
```
|
90
93
|
|
91
94
|
and we can transform BioAlignment into BioRuby's Bio::Alignment and
|
@@ -146,21 +149,103 @@ version of pal2nal includes validation
|
|
146
149
|
|
147
150
|
resulting in the codon alignment.
|
148
151
|
|
149
|
-
###
|
152
|
+
### Phylogeny
|
153
|
+
|
154
|
+
BioAlignment has support for attaching a phylogenetic tree to an
|
155
|
+
alignment, and traversing the tree.
|
156
|
+
|
157
|
+
### Alignment marking/masking/editing
|
158
|
+
|
159
|
+
One of the primary reasons for creating BioAlignment is to provide
|
160
|
+
easy ways of editing alignments using a functional style of
|
161
|
+
programming. Primitives are provided which take out much of the
|
162
|
+
plumbing, such as maintaining row/column/element state, and allow
|
163
|
+
copy-on-edit (so no conflicts arise in concurrent code). For example,
|
164
|
+
to walk an alignment by row, and update the row state, you can mark
|
165
|
+
all rows for deletion which contain many gaps
|
166
|
+
|
167
|
+
```ruby
|
168
|
+
include MarkRows
|
169
|
+
mark_rows { |rowstate,row| # for every row/sequence
|
170
|
+
num = row.count { |e| e.gap? }
|
171
|
+
if (num.to_f/row.length) > 0.5
|
172
|
+
rowstate.delete! # mark row for deletion
|
173
|
+
end
|
174
|
+
rowstate # returns the updated row state
|
175
|
+
}
|
176
|
+
```
|
177
|
+
|
178
|
+
next, return a (deep) copy of the original alignment with the rows
|
179
|
+
that are not marked for deletion
|
180
|
+
|
181
|
+
```ruby
|
182
|
+
aln2 = aln.rows_where { |row| !row.state.deleted? }
|
183
|
+
```
|
184
|
+
|
185
|
+
The general idea is that there are many potential ways of selecting
|
186
|
+
rows, and changing some state. The 'mark_rows' function/iterator takes
|
187
|
+
care of the plumbing. All the programmer needs to do is to set the
|
188
|
+
criterion, in this case a gap percentage, and tell the library what
|
189
|
+
state has to change. In this example we only access one row, but you
|
190
|
+
can also access the other rows. You won't be surprised that marking
|
191
|
+
columns looks much the same
|
150
192
|
|
151
|
-
|
193
|
+
```ruby
|
194
|
+
include MarkColumns
|
195
|
+
mark_columns { |colstate,col| # for every column
|
196
|
+
num = col.count { |e| e.gap? }
|
197
|
+
if (num.to_f/col.length) > 0.5
|
198
|
+
colstate.delete!
|
199
|
+
end
|
200
|
+
colstate
|
201
|
+
}
|
202
|
+
```
|
203
|
+
|
204
|
+
''count'' is one of the universal functions that counts elements in a
|
205
|
+
row, column, or alignment.
|
206
|
+
|
207
|
+
Next to modifying the state of rows and columns, you can also access
|
208
|
+
the state of alignment elements (i.e. codons, amino acids, and
|
209
|
+
nucleotides). For example, here we mask every element that has a masked
|
210
|
+
state
|
211
|
+
|
212
|
+
```ruby
|
213
|
+
aln = masked_aln.update_each_element { |e| (e.state.masked? ? Element.new("X"):e)}
|
214
|
+
```
|
215
|
+
|
216
|
+
and, here we remove every marked element by turning it into a gap
|
217
|
+
|
218
|
+
```ruby
|
219
|
+
aln = marked_aln.update_each_element { |e| (e.state.marked? ? Element.new("-"):e)}
|
220
|
+
```
|
221
|
+
|
222
|
+
''update_each_element'' visits every element in the MSA, and replaces
|
223
|
+
the old with the new.
|
224
|
+
|
225
|
+
It is important to note that, instead of directly editing alignments
|
226
|
+
in place, this module always makes it a two step process. First items
|
227
|
+
are masked/marked through the state of the rows/columns/elements, next
|
228
|
+
the alignment is rewritten using this state. The advantage of using an
|
229
|
+
intermediate state is that the state can be queried for creating (for
|
230
|
+
example) nice output/graphics, using both the original and changed
|
231
|
+
alignments. For example, it is really easy to create a nice output
|
232
|
+
showing which columns were deleted in the original alignment, or which
|
233
|
+
amino acids were masked. Still, methods are available, which hide the
|
234
|
+
two step process, as seen in the next example.
|
235
|
+
|
236
|
+
BioAlignment supports many alignment editing features, which are
|
152
237
|
listed
|
153
238
|
[here](https://github.com/pjotrp/bioruby-alignment/tree/master/features/edit).
|
154
|
-
|
239
|
+
An edit feature is added at runtime(!) Example:
|
155
240
|
|
156
241
|
```ruby
|
157
242
|
require 'bio-alignment/edit/del_bridges'
|
158
243
|
|
159
|
-
aln.extend DelBridges
|
160
|
-
aln2 = aln.
|
244
|
+
aln.extend DelBridges # mix the module into the object
|
245
|
+
aln2 = aln.del_bridges # execute the alignment editor
|
161
246
|
```
|
162
247
|
|
163
|
-
|
248
|
+
where aln2 is a copy of aln with bridging columns deleted.
|
164
249
|
|
165
250
|
### See also
|
166
251
|
|
data/Rakefile
CHANGED
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.
|
1
|
+
0.0.6
|
data/doc/bio-alignment-design.md
CHANGED
@@ -70,7 +70,7 @@ acid with
|
|
70
70
|
print codons.seq[0].to_aa
|
71
71
|
```
|
72
72
|
|
73
|
-
in fact, because Sequence is
|
73
|
+
in fact, because Sequence is index-able we can write directly
|
74
74
|
|
75
75
|
```ruby
|
76
76
|
print codons[0].to_aa # 'M'
|
@@ -94,6 +94,16 @@ element or a gap. Also it should respond to the to_s method.
|
|
94
94
|
An element can contain any pay load. If a list of attributes exists
|
95
95
|
in the sequence object, it can be used.
|
96
96
|
|
97
|
+
## Elements and CodonSequence
|
98
|
+
|
99
|
+
Where the Sequence class is the most basic String representation of a sequence, we
|
100
|
+
also have the Elements class, which allows each element in a coding sequence to
|
101
|
+
carry state.
|
102
|
+
|
103
|
+
The third list type we normally use in an Alignment, next to Sequence and
|
104
|
+
Elements, is the CodonSequence (remember, you can easily roll your own Sequence
|
105
|
+
type).
|
106
|
+
|
97
107
|
## Column
|
98
108
|
|
99
109
|
The column list tracks the columns of the alignment. The requirement
|
@@ -135,26 +145,80 @@ The Matrix can be accessed in transposed fashion, but accessing the normal
|
|
135
145
|
matrix and transposed matrix at the same time is not supported. Matrix is not
|
136
146
|
designed to be transaction safe - though you can copy the Matrix any time.
|
137
147
|
|
148
|
+
|
138
149
|
## Adding functionality
|
139
150
|
|
140
|
-
To ascertain that the basic BioAlignment does not get
|
141
|
-
is added by
|
142
|
-
|
143
|
-
|
144
|
-
|
151
|
+
To ensure that the basic BioAlignment implementation does not get
|
152
|
+
polluted, extra functionality is added by using modules. These
|
153
|
+
modules can be added at run time(!) One advantage is that there is
|
154
|
+
less name space pollution, the other is that different implementations
|
155
|
+
can be plugged in - using the same interface. For example, here we are
|
156
|
+
going to use an alignment editor named DelBridges, which has a method
|
157
|
+
named del_bridges:
|
145
158
|
|
146
159
|
```ruby
|
147
160
|
require 'bio-alignment/edit/del_bridges'
|
148
161
|
|
149
162
|
aln = Alignment.new(string.split(/\n/))
|
150
163
|
aln.extend DelBridges # bring the module into scope
|
151
|
-
aln2 = aln.
|
164
|
+
aln2 = aln.del_bridges
|
165
|
+
```
|
166
|
+
|
167
|
+
in other words, the functionality in DelBridges gets attached to the
|
168
|
+
aln instance at run time, without affecting any other instantiated
|
169
|
+
object(!) Also, when not requiring 'bio-alignment/edit/del_bridges',
|
170
|
+
the functionality is never visible, and never added to the
|
171
|
+
environment. This type of runtime plugin is something you can only do
|
172
|
+
in a dynamic language.
|
173
|
+
|
174
|
+
Likewise you may have your own sequence objects in an alignment. To register
|
175
|
+
deletion state, simply extend the sequence with the RowState module:
|
176
|
+
|
177
|
+
```ruby
|
178
|
+
require 'bio-alignment/state'
|
179
|
+
bioseq = Bio::Sequence::NA.new("AGCT")
|
180
|
+
bioseq.extend(State) # add state
|
181
|
+
bioseq.state = RowState.new # set state
|
182
|
+
p bioseq.state.deleted? # query state
|
183
|
+
> false
|
152
184
|
```
|
153
185
|
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
186
|
+
That is impressive - the BioRuby Sequence has no deletion state facility. We
|
187
|
+
just added that, and it can even be used in BioAlignment editors which require
|
188
|
+
such a state object. See also the scenario "Give deletion state to a
|
189
|
+
Bio::Sequence object" in the bioruby.feature.
|
190
|
+
|
191
|
+
Note: if we wanted only to allow one plugin per instance at a time, we can
|
192
|
+
create a generic interface with a method of the same name for every
|
193
|
+
plugged in module. This ensures that the same method cannot be invoked from
|
194
|
+
multiple plugins (by default).
|
195
|
+
|
196
|
+
## Adding Phylogenetic support
|
197
|
+
|
198
|
+
MSAs often come with phylogenetic trees. To avoid adding this functionality by default,
|
199
|
+
we extend BioAlignment with BioAlignment::AlignmentTree when a tree is plugged in
|
200
|
+
with the add_tree method.
|
201
|
+
|
202
|
+
## Methods returning alignments and concurrency
|
203
|
+
|
204
|
+
When an alignment gets changed, e.g. by one of the editing modules, the
|
205
|
+
original is copied using the 'clone' method. The idea is never to share data in
|
206
|
+
this library. Ruby does not really have guaranteed immutable data, so the only
|
207
|
+
safe way to write concurrent code is to copy all data before changing. The
|
208
|
+
'clone' methods implemented in the Alignment class are 'deep' clones.
|
209
|
+
|
210
|
+
Not only is copying a good idea for concurrency (and lazy caching of
|
211
|
+
values), but it also allows one to write succinct and descriptive code
|
212
|
+
in functional style, such as
|
213
|
+
|
214
|
+
```ruby
|
215
|
+
aln2 = aln.mark_bridges.columns_where { |col| !col.state.deleted? }
|
216
|
+
```
|
158
217
|
|
218
|
+
where aln2 is a copy (of aln) with columns removed that were marked for
|
219
|
+
deletion. In other words, we apply ''Functional programming in Ruby.'' If
|
220
|
+
functions can be easily 'piped', and code can be easily copied and pasted into
|
221
|
+
different algorithms, it is likely that the module is written in a functional
|
222
|
+
style.
|
159
223
|
|
160
224
|
Copyright (C) 2012 Pjotr Prins <pjotr.prins@thebird.nl>
|
data/features/bioruby-feature.rb
CHANGED
@@ -82,3 +82,20 @@ Then /^I should have a BioRuby Bio::Alignment$/ do
|
|
82
82
|
@bioruby_alignment.consensus_iupac[0..8].should == '???????v?'
|
83
83
|
end
|
84
84
|
|
85
|
+
Given /^I have a BioRuby sequence object$/ do
|
86
|
+
@bioseq = Bio::Sequence::NA.new("AGCT")
|
87
|
+
end
|
88
|
+
|
89
|
+
When /^I add RowState$/ do
|
90
|
+
require 'bio-alignment/state'
|
91
|
+
@bioseq.extend State
|
92
|
+
@bioseq.state = RowState.new
|
93
|
+
@bioseq.state.deleted?.should == false
|
94
|
+
end
|
95
|
+
|
96
|
+
Then /^I should be able to change the delete state$/ do
|
97
|
+
@bioseq.state.delete!
|
98
|
+
@bioseq.state.deleted?.should == true
|
99
|
+
end
|
100
|
+
|
101
|
+
|
data/features/bioruby.feature
CHANGED
@@ -1,9 +1,9 @@
|
|
1
|
+
@bioruby
|
1
2
|
Feature: BioAlignment should play with BioRuby
|
2
3
|
In order to use BioRuby functionality
|
3
4
|
I want to convert BioAlignment to Bio::Alignment
|
4
5
|
And I want to support Bio::Sequence objects
|
5
6
|
|
6
|
-
@bioruby
|
7
7
|
Scenario: Use Bio::Sequence to fill BioAlignment
|
8
8
|
Given I have multiple Bio::Sequence objects
|
9
9
|
When I assign BioAlignment
|
@@ -22,3 +22,8 @@ Feature: BioAlignment should play with BioRuby
|
|
22
22
|
Given I have a BioAlignment
|
23
23
|
When I convert
|
24
24
|
Then I should have a BioRuby Bio::Alignment
|
25
|
+
|
26
|
+
Scenario: Give deletion state to a Bio::Sequence object
|
27
|
+
Given I have a BioRuby sequence object
|
28
|
+
When I add RowState
|
29
|
+
Then I should be able to change the delete state
|
data/features/columns-feature.rb
CHANGED
@@ -7,15 +7,19 @@ end
|
|
7
7
|
|
8
8
|
When /^I apply the bridge rule$/ do
|
9
9
|
@aln.extend DelBridges
|
10
|
-
aln2 = @aln.
|
10
|
+
@aln2 = @aln.mark_bridges
|
11
11
|
end
|
12
12
|
|
13
13
|
Then /^it should have removed (\d+) bridges$/ do |arg1, string|
|
14
|
-
|
14
|
+
check_aln = Alignment.new(string.split(/\n/))
|
15
|
+
new_aln = @aln.del_bridges
|
16
|
+
new_aln.to_s.should == check_aln.to_s
|
15
17
|
end
|
16
18
|
|
17
19
|
Then /^I should be able to track removed columns$/ do
|
18
|
-
|
20
|
+
@aln2.columns.count { |col| col.state.deleted? }.should == 6
|
21
|
+
@aln2.columns[0].state.deleted?.should == true
|
22
|
+
@aln2.columns[8].state.deleted?.should_not == true
|
19
23
|
end
|
20
24
|
|
21
25
|
|
@@ -5,7 +5,6 @@ Feature: Alignment editing, the bridge rule
|
|
5
5
|
|
6
6
|
The dropped columns are tracked by the table columns.
|
7
7
|
|
8
|
-
@dev
|
9
8
|
Scenario: Apply bridge rule to an amino acid alignment
|
10
9
|
Given I have a bridged alignment
|
11
10
|
"""
|
@@ -20,7 +19,7 @@ Feature: Alignment editing, the bridge rule
|
|
20
19
|
-------------IFHAVR-TC-HP-----------------
|
21
20
|
"""
|
22
21
|
When I apply the bridge rule
|
23
|
-
Then it should have removed
|
22
|
+
Then it should have removed 6 bridges
|
24
23
|
"""
|
25
24
|
SNSFSRPTIIFSGCSTACSGKSELVCGFRSFMLSDV
|
26
25
|
SNSFSRPTIIFSGCSTACSGKSEQVCGFR---LSDV
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'bio-alignment/edit/del_non_informative_sequences'
|
2
|
+
|
3
|
+
Given /^I have a bridged alignment containing unknown amino acids$/ do |string|
|
4
|
+
@aln = nil
|
5
|
+
@aln2 = nil
|
6
|
+
@aln = Alignment.new(string.split(/\n/))
|
7
|
+
@aln.extend DelNonInformativeSequences
|
8
|
+
end
|
9
|
+
|
10
|
+
When /^I apply the non\-informative sequence rule$/ do
|
11
|
+
@aln2 = @aln.mark_non_informative_sequences
|
12
|
+
end
|
13
|
+
|
14
|
+
Then /^it should have removed two rows$/ do |string|
|
15
|
+
check_aln = Alignment.new(string.split(/\n/))
|
16
|
+
new_aln = @aln.del_non_informative_sequences
|
17
|
+
new_aln.to_s.should == check_aln.to_s
|
18
|
+
end
|
19
|
+
|
20
|
+
Then /^I should be able to track removed non\-informative rows$/ do
|
21
|
+
@aln2.rows.count { |row| row.state.deleted? }.should == 2
|
22
|
+
@aln2.rows[0].state.deleted?.should == false
|
23
|
+
@aln2.rows[3].state.deleted?.should == true
|
24
|
+
@aln2.rows[4].state.deleted?.should == true
|
25
|
+
end
|
26
|
+
|
@@ -2,3 +2,22 @@ Feature: Remove non-informative sequences
|
|
2
2
|
|
3
3
|
After alignment cleaning, it may be we have non-informative sequences. These
|
4
4
|
can be removed from the alignment.
|
5
|
+
|
6
|
+
Scenario: Apply non informative sequence rule to an amino acid alignment
|
7
|
+
Given I have a bridged alignment containing unknown amino acids
|
8
|
+
"""
|
9
|
+
SSIISNSFSRPTIIFSGCSTACSGK--SEQVCGFR---LSDV
|
10
|
+
SSIISNSFSRPTIIFSGCSTACSGKLTSEQVCGFR---LSDV
|
11
|
+
----------PTIIFSGCSKACSGK-----VCGIFHAVRSFM
|
12
|
+
----------XTIXXXXXXXXXSGK--SELXXXXXSFXXXXV
|
13
|
+
-------------IFHAVR-TC-HP-----------------
|
14
|
+
"""
|
15
|
+
When I apply the non-informative sequence rule
|
16
|
+
Then it should have removed two rows
|
17
|
+
"""
|
18
|
+
SSIISNSFSRPTIIFSGCSTACSGK--SEQVCGFR---LSDV
|
19
|
+
SSIISNSFSRPTIIFSGCSTACSGKLTSEQVCGFR---LSDV
|
20
|
+
----------PTIIFSGCSKACSGK-----VCGIFHAVRSFM
|
21
|
+
"""
|
22
|
+
Then I should be able to track removed non-informative rows
|
23
|
+
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'bio-alignment/edit/del_short_sequences'
|
2
|
+
|
3
|
+
When /^I apply the short sequence rule$/ do
|
4
|
+
@aln.extend DelShortSequences
|
5
|
+
@aln2 = @aln.mark_short_sequences
|
6
|
+
end
|
7
|
+
|
8
|
+
Then /^it should have removed one row$/ do |string|
|
9
|
+
check_aln = Alignment.new(string.split(/\n/))
|
10
|
+
new_aln = @aln.del_short_sequences
|
11
|
+
print new_aln.to_s
|
12
|
+
new_aln.to_s.should == check_aln.to_s
|
13
|
+
end
|
14
|
+
|
15
|
+
Then /^I should be able to track removed rows$/ do
|
16
|
+
@aln2.rows.count { |row| row.state.deleted? }.should == 1
|
17
|
+
@aln2.rows[0].state.deleted?.should == false
|
18
|
+
@aln2.rows[4].state.deleted?.should == true
|
19
|
+
end
|
20
|
+
|
21
|
+
|