bio-gff3 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +14 -0
- data/Gemfile.lock +22 -0
- data/LICENSE.txt +20 -0
- data/README +65 -0
- data/README.rdoc +19 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/bin/gff3-fetch +99 -0
- data/bio-gff3.gemspec +101 -0
- data/lib/bio-gff3.rb +0 -0
- data/lib/bio/db/gff/gffassemble.rb +300 -0
- data/lib/bio/db/gff/gffdb.rb +40 -0
- data/lib/bio/db/gff/gfffasta.rb +68 -0
- data/lib/bio/db/gff/gfffileiterator.rb +77 -0
- data/lib/bio/db/gff/gffinmemory.rb +63 -0
- data/lib/bio/db/gff/gffnocache.rb +124 -0
- data/lib/bio/db/gff/gffparser.rb +154 -0
- data/lib/bio/system/lruhash.rb +268 -0
- data/spec/gff3_assemble2_spec.rb +73 -0
- data/spec/gff3_assemble3_spec.rb +62 -0
- data/spec/gff3_assemble_spec.rb +291 -0
- data/spec/gff3_fileiterator_spec.rb +43 -0
- data/spec/gffdb_spec.rb +99 -0
- data/test/data/gff/MhA1_Contig1133.fa +2 -0
- data/test/data/gff/MhA1_Contig1133.gff3 +1862 -0
- data/test/data/gff/MhA1_Contig125.fa +673 -0
- data/test/data/gff/MhA1_Contig125.gff3 +2177 -0
- data/test/data/gff/standard.gff3 +25 -0
- data/test/data/gff/test-cds.gff3 +98 -0
- data/test/data/gff/test-ext-fasta.fa +16 -0
- data/test/data/gff/test-ext-fasta.gff3 +57 -0
- data/test/data/gff/test.gff3 +74 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-gff3.rb +7 -0
- metadata +180 -0
@@ -0,0 +1,268 @@
|
|
1
|
+
# LRU based Hash by Robert Klemme
|
2
|
+
#
|
3
|
+
# Copied from http://github.com/rklemme/muppet-laboratories
|
4
|
+
#
|
5
|
+
# License: unclear, not in repository, need to ask Permission
|
6
|
+
# Added by Pjotr Prins (pjotr.prins@thebird.nl)
|
7
|
+
|
8
|
+
require 'enumerator'
|
9
|
+
|
10
|
+
# Hash with LRU (least recently used) expiry policy. There are at most
# max_size elements in an LRUHash. When adding more elements, old elements
# are removed according to LRU policy.
#
# Entries are kept both in a plain Hash (key => node, for lookup) and in a
# doubly linked list of nodes bracketed by two sentinel nodes (@head and
# @tail). The node right after @head is the most recently used entry, the
# node right before @tail is the oldest one and is the next to be evicted.
#
# Reading an entry through #fetch, #[], #values_at, #assoc, #rassoc or #key
# counts as a use and moves the entry to the front. Iterating (#each_pair,
# #each_key, #each_value and the Enumerable methods built on #each) and the
# membership tests (#has_key?, #has_value?) do not reorder entries.
#
# An optional release_proc is called with (key, value) for every entry that
# leaves the hash, whether through eviction, #delete, #delete_if, #clear or
# shrinking #max_size=.
class LRUHash
  include Enumerable

  attr_reader :max_size
  attr_accessor :default, :default_proc, :release_proc

  # max_size::      maximum number of entries; anything whose to_i is a
  #                 positive Integer (ArgumentError otherwise).
  # default_value:: returned by #[] for missing keys when no block is given.
  # block::         optional default proc, called as block[hash, key] by #[]
  #                 for missing keys; takes precedence over default_value.
  def initialize(max_size, default_value = nil, &block)
    @max_size = normalize_max(max_size)
    @default = default_value
    @default_proc = block

    @h = {}
    # Two sentinels: linking @tail behind @head yields the empty list.
    @head = Node.new
    @tail = front(Node.new)
  end

  # Yields every entry as a [key, value] pair, most recently used first.
  # Returns an enumerator when no block is given.
  def each_pair
    return enum_for(:each_pair) unless block_given?

    each_node { |n| yield [n.key, n.value] }
  end

  alias each each_pair

  # Yields every key, most recently used first.
  # Returns an enumerator when no block is given.
  def each_key
    return enum_for(:each_key) unless block_given?

    each_node { |n| yield n.key }
  end

  # Yields every value, most recently used first.
  # Returns an enumerator when no block is given.
  def each_value
    return enum_for(:each_value) unless block_given?

    each_node { |n| yield n.value }
  end

  # Number of entries currently stored.
  def size
    @h.size
  end

  # True when no entry is stored (the sentinels are adjacent).
  def empty?
    @head.succ.equal? @tail
  end

  # Returns the value stored under key and marks the entry as most recently
  # used. For a missing key the given block is called with the key and its
  # result returned; without a block a KeyError is raised.
  def fetch(key, &b)
    n = @h[key]
    return front(n).value if n

    (b || FETCH).call(key)
  end

  # Like #fetch, but a missing key yields default_proc[self, key] when a
  # default proc is set, otherwise the default value.
  def [](key)
    fetch(key) do |k|
      @default_proc ? @default_proc[self, k] : default
    end
  end

  # All keys, in the order of the underlying Hash (not in LRU order; use
  # #each_key for that).
  def keys
    @h.keys
  end

  # All values, in the order of the underlying Hash (not in LRU order; use
  # #each_value for that).
  def values
    @h.map { |_k, n| n.value }
  end

  # True when key is present. Does not count as a use of the entry.
  def has_key?(key)
    @h.has_key? key
  end

  alias key? has_key?
  alias member? has_key?
  alias include? has_key?

  # True when some stored value is eql? to value. Does not count as a use.
  def has_value?(value)
    each_pair do |_k, v|
      return true if value.eql? v
    end

    false
  end

  alias value? has_value?

  # Values for the given keys, looked up through #[] (so defaults apply and
  # every hit counts as a use).
  def values_at(*key_list)
    key_list.map { |k| self[k] }
  end

  # Returns [key, value] for key and marks the entry as most recently used,
  # or nil when the key is missing.
  def assoc(key)
    n = @h[key]
    return nil unless n

    front(n)
    [n.key, n.value]
  end

  # Returns [key, value] of the first entry (in LRU order) whose value is
  # eql? to value and marks it as most recently used; nil when none matches.
  def rassoc(value)
    each_node do |n|
      if value.eql? n.value
        front(n)
        return [n.key, n.value]
      end
    end
    nil
  end

  # Key of the first entry whose value is eql? to value (see #rassoc), or
  # nil when there is none.
  def key(value)
    pair = rassoc(value)
    pair && pair.first
  end

  # Stores value under key and marks the entry as most recently used.
  # When a new key is added to a full hash the oldest entry is evicted first
  # (release_proc is called for it). Returns value.
  def store(key, value)
    # same optimization as in Hash
    key = key.dup.freeze if String === key && !key.frozen?

    n = @h[key]

    unless n
      if size == max_size
        # reuse node to optimize memory usage
        n = delete_oldest
        n.key = key
        n.value = value
      else
        n = Node.new key, value
      end

      @h[key] = n
    end

    front(n).value = value
  end

  alias []= store

  # Removes the entry for key and returns its value (release_proc is
  # called); returns nil when the key is missing.
  def delete(key)
    n = @h[key]
    n && remove_node(n).value
  end

  # Removes every entry for which the block, called with (key, value),
  # returns a true value; release_proc is called for each removed entry.
  # Returns self, or an enumerator when no block is given (consistent with
  # the each_* methods).
  def delete_if
    return enum_for(:delete_if) unless block_given?

    each_node do |n|
      remove_node n if yield n.key, n.value
    end
  end

  # Changes the capacity, evicting oldest entries until the hash fits.
  # Raises ArgumentError for an invalid limit (nothing is evicted then).
  def max_size=(limit)
    limit = normalize_max(limit)

    delete_oldest while size > limit

    @max_size = limit
  end

  # Removes all entries, oldest first, calling release_proc for each.
  # Returns self.
  def clear
    delete_oldest until empty?

    self
  end

  # Renders the entries in LRU order as "{k=>v, k=>v}", using to_s of keys
  # and values. Builds a fresh String rather than appending to a literal, so
  # it also works when string literals are frozen.
  def to_s
    '{' + each_pair.map { |k, v| "#{k}=>#{v}" }.join(', ') + '}'
  end

  alias inspect to_s

  # Fallback used by #fetch when the key is missing and no block was given.
  FETCH = Proc.new { |k| raise KeyError, 'key not found' }

  # A single node in the doubly linked LRU list of nodes
  Node = Struct.new :key, :value, :pred, :succ do
    # Detaches this node from its neighbours (joining them to each other)
    # and clears its own links. Returns self.
    def unlink
      pred.succ = succ if pred
      succ.pred = pred if succ
      self.succ = self.pred = nil
      self
    end

    # Moves this node so that it directly follows node. Returns self.
    # Raises when asked to insert a node after itself.
    def insert_after(node)
      raise 'Cannot insert after self' if equal? node
      # already in place, nothing to relink
      return self if node.succ.equal? self

      unlink

      self.succ = node.succ
      self.pred = node

      node.succ.pred = self if node.succ
      node.succ = self

      self
    end
  end

  private

  # Iterates the real (non-sentinel) nodes from most to least recently used.
  # The successor is captured before yielding, so the block may unlink or
  # move the current node. Returns self.
  def each_node
    n = @head.succ

    until n.equal? @tail
      succ = n.succ
      yield n
      n = succ
    end

    self
  end

  # move node to front
  def front(node)
    node.insert_after(@head)
  end

  # remove the node and invoke release_proc
  # if set
  def remove_node(node)
    n = @h.delete(node.key)
    n.unlink
    release_proc.call(n.key, n.value) if release_proc
    n
  end

  # remove the oldest node returning the node
  def delete_oldest
    n = @tail.pred
    raise "Cannot delete from empty hash" if @head.equal? n
    remove_node n
  end

  # Normalize the argument in order to be usable as max_size
  # criterion is that n.to_i must be an Integer and it must
  # be larger than zero. The error message shows the argument as it was
  # passed in, not its coerced value.
  def normalize_max(n)
    limit = n.to_i
    raise ArgumentError, "Invalid max_size: #{n.inspect}" unless Integer === limit && limit > 0
    limit
  end
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# RSpec for BioRuby-GFF3-Plugin. Run with something like:
|
2
|
+
#
|
3
|
+
# ruby -I ../bioruby/lib/ ~/.gems/bin/spec spec/gff3_assemble2_spec.rb
|
4
|
+
#
|
5
|
+
# Copyright (C) 2010 Pjotr Prins <pjotr.prins@thebird.nl>
|
6
|
+
#
|
7
|
+
$: << "../lib"
|
8
|
+
|
9
|
+
require 'bio/db/gff/gffdb'
|
10
|
+
|
11
|
+
include Bio::GFFbrowser
|
12
|
+
|
13
|
+
FASTAFILE2="test/data/gff/MhA1_Contig125.fa"
|
14
|
+
GFF3FILE2="test/data/gff/MhA1_Contig125.gff3"
|
15
|
+
|
16
|
+
PROTEINS = {
|
17
|
+
"cds:MhA1_Contig125.frz3.gene2" =>
|
18
|
+
"MNDLVNQFKSAALAVGQYLTPVLRESKFKETGVLTPEEFVAAGDHLVHLCPTWSWAKASDSNGQTTFLITKQSALVTQRCAQIMGYDEILKEKIIKDESAETGDEQNEWVDTHHFDFETNCAPKDFEEEENKVEDIKENNLNEEENCEEEEEGEPIDLDEYLSSGLLEEEDPARFVLQNKSLKETKDDSTSNNLLRTRRYDLHITYDKYYQVPRFWLVGYDENGSPLAVDKMKEDFSQEHADKTITLESHPHISGLTLATIHPCRHAPVMKRLIEQFQESGKELLVIDYLFVFLKFVQAVIPTVEYDYTRSIHF*",
|
19
|
+
"cds:MhA1_Contig125.frz3.gene3" =>
|
20
|
+
"MERRKVSNTDPFEAAEGMLRWNSDIIKDKEIKQFKGLKKPLKLSENQNDEYDVDPFEAVTDWLPLNKNVDKT*",
|
21
|
+
"cds:MhA1_Contig125.frz3.gene4" =>
|
22
|
+
"MKSTKMSATEIVSYHLYSLHTLSSFCLTENPENIFIKDQNFQDFFLFCERVREQFNEAEELKTPLNTKISQTDSTNIQNKKDEPSISIGPCVNDLCPKGFECIENICFKSMEMPKTERVLSIGPCVNAKCPEGFSCYEDDRQCYAN*",
|
23
|
+
"cds:MhA1_Contig125.frz3.gene5" =>
|
24
|
+
"MRLDIFLVIAFSLGVAVNCGVVKRSQNSYGDEAGAGGAAGAAPAAAAAPAGGEAAAPSGGETGGAAAPAEAAPAAAPAPEAAPAAAPAPEQAAPAVAQPAPAAAPSQASGYRKKRSQNSYGDEAGHAAGAAPAEAAAPAAGGEAAAPSGGETGGAAAPAETAPAAAPEAAPAAAPAPEQAAPAVAQPAPAAAPSQASGYRKKRSQNSYGDEAGAGGAAGAAPAAAAAPAGGEAAAPAAGGCTEGCAAGGESAAPAAAPAPEAAPAAAPAPEQAAPAVAQPAPAAAPSQASGYRKKRSQNSYGDEAGHAAGAAPAESAAPAAGGEAAAPSGGETGGAAAPAETAPAAAPEAAPAAAPEQAAPAVAQPAPAAAPSQASGY*",
|
25
|
+
"cds:MhA1_Contig125.frz3.gene6" =>
|
26
|
+
"MDNAENEEKNDKENHEKPIKFEINNQKQFFKKEDEVKECEEESKNVNEFEEDQGTNEVSEVLRLFRRGNVWAFALQNLDLMRAYVILSCLAIAVVMLANFLRNSRFFDFCLK*",
|
27
|
+
"cds:MhA1_Contig125.frz3.gene7" =>
|
28
|
+
"MLFCLLHPMDHNTGPLARKSSTLCSLLLLSIAALLVLAVPGQANSEEVGFGNHTKEKDGDEVTVNIDSVQAPDDLTYAVYEKRFKDVCEFVITKDDIELLYKGKGCTVELLTGENQDITFKTGVKDIGCVRNDCDKASLYSSVGEVEPGLSQSVTDGKTEFELRISGSEFNMNFEEDAPFNPQKNRCAPKQDHIVKPETWRIKNGELKDKHLLVFHLLPKTATREYTKEGKISKEQPPEEAPKCKLFIRFKRPYYEFLYVGPITTTVTTTTTTTTTPSSGLVGQGPTPKTGTHQGNTPKVQGKGSEKESDNTMMIVIIVIVVVVVVLVIGVVLIFILKNKGSKEDELQKVKQTTTKANKSSAVTL*",
|
29
|
+
"cds:MhA1_Contig125.frz3.gene8" =>
|
30
|
+
"MRQRESVILNKTENQTQIFEKLLNLYNSPKDVVNLRNNPEQLIQLGIDSKQFSAILEMMFGARRRNSLRGDYREARRFRNRREYSAWWDAGEVNNWRINSRHPSRHGTVEYWRCAFAVGRFFTCPSRIRITFGFGDRYVIVANARNHPHNHNRQNNAGDNNPNTVRRALPMEANERLTARTVHIGPRPSTSAPNQPTTKGQAAPPRASVSTTSANAAATPTTSASSTVQKGTAAPSTSAAPSTSAAPSTSAASRPLKPPGFATAATSATNSQQAAAKPASNQQPAPTATTSQTSASAPGTSSKPPPTTSPAPAATPAPATSQPGTSTVKSAPASTPTPLKPPAATEKQTSQPPSAAPGTTATIKPVLVTNIPGLPPGIPTSATGSGTINVSLTALDAFLAGHPRPASTSAPASSQTTPGPASQPSSAPVTQNKGKEEKKEDKKEEKKEGKKEEKKEGKKEEKRGPG*",
|
31
|
+
"cds:MhA1_Contig125.frz3.gene9" =>
|
32
|
+
"MHGATIGNRLRATRRSRDAQMMAAAESVARLSRRHSHQKAIRRVLPPPPALNSSRDSQPINPFCSDPSSIQPVIAKGVCVRSVGVFKSALPPSTPFPSTSTAPNIPSDNTFVPHLNNSTPLHNNHHRTLGGSENCLNYQQQYIGGSYSARSQQQHPPPPAPSSCCISPFKPLEILGNSNGTTDSSSGGCNSARAAMHRQFTGSSNGEEEFTVEELQEFAQAFKMFDKDGNGTMSIKELGVAMRTLGLNPTEDELLNMVNEYDVDGNGIDFSEFCKMMKEMNKETDQELIRLAFRVFDKDGNGYITAQEFRHFMTIDYEEFVNAVAPIVNDGAKEDAPFFEKEQPTSFGQPITSGPPLASGKAKHF*",
|
33
|
+
"cds:MhA1_Contig125.frz3.gene21" =>
|
34
|
+
"MDVKPPPSAPQDIKEAIKESNMSTWRPFLIGNRMRTTSEDSAESFDAYDKSFDAYDVGNKKERRLSITEQFFGSSMPGRLRSNSTTEYEGHEHEPTFKKVDLKQFMKHQRKILGDDEWQ*",
|
35
|
+
"cds:MhA1_Contig125.frz3.gene22" =>
|
36
|
+
"MAKALISGFVSSGFISKSNISICTRSEATAKSWRLQGFTSAYSKDVFYSEVKKPRAIILIAVKPQIFPSFINEVKANEWFYFGVPGILCISIMSGISLQHFDKEMKSVGFDGHSMRLMPNVNCAVSTGTLVLSADPETPQELVTLVSVLSSYVGKCIRVDEAHFNAASSISGCGPAFIALVIEALADGGVVAGLSRELANQLAADTVKGTGHLFMTKMASVSPTSDNPSPAQLKDQVCSPAGTTIEGVRELEKHGVRSAFIEAIQASTRRAFELSQ*" }
|
37
|
+
|
38
|
+
# Checks that every CDS listed in PROTEINS is assembled from contig
# MhA1_Contig125 and translated into the expected amino acid sequence.
describe GFFdb, "Assemble CDS (Contig125)" do
  before :all do
    # Alternative setup with all caching switched off:
    # gffdb = Bio::GFFbrowser::GFFdb.new(GFF3FILE2, :fasta_filename => FASTAFILE2,:cache_components => :cache_none, :cache_records => :cache_none)
    database = Bio::GFFbrowser::GFFdb.new(GFF3FILE2, :fasta_filename => FASTAFILE2)
    @gff = database.assembler
    @gff.parse
    @contigsequence = @gff.sequencelist["MhA1_Contig125"]

    # Index the CDS record lists and their container components by CDS id
    # so the individual examples can look them up by name.
    @componentlist = {}
    @cdslist = {}
    @gff.each_CDS do |cds_id, records, container|
      @componentlist[cds_id] = container
      @cdslist[cds_id] = records
    end
  end

  it "should have the single contig" do
    @gff.sequencelist.size.should == 1
    @gff.sequencelist["MhA1_Contig125"].should_not == nil
    @gff.sequencelist["MhA1_Contig125"].size.should == 53702
  end

  # One example per expected protein.
  PROTEINS.each do |cds_name, expected_protein|
    it "should translate gene #{cds_name}" do
      records = @cdslist[cds_name]
      container = @componentlist[cds_name]
      first_cds = records[0]
      first_cds.seqname.should == 'MhA1_Contig125'
      # ntseq = @gff.assemble(@contigsequence,component.start,recs,:codonize=>true)
      # p [name, ntseq]
      protein = @gff.assembleAA(@contigsequence, container.start, records)
      protein.should == expected_protein
    end
  end
end
|
71
|
+
|
72
|
+
|
73
|
+
|
@@ -0,0 +1,62 @@
|
|
1
|
+
# RSpec for BioRuby-GFF3-Plugin. Run with something like:
|
2
|
+
#
|
3
|
+
# ruby -I ../bioruby/lib/ ~/.gems/bin/spec spec/gff3_assemble3_spec.rb
|
4
|
+
#
|
5
|
+
# Copyright (C) 2010 Pjotr Prins <pjotr.prins@thebird.nl>
|
6
|
+
#
|
7
|
+
$: << "../lib"
|
8
|
+
|
9
|
+
require 'bio/db/gff/gffdb'
|
10
|
+
|
11
|
+
include Bio::GFFbrowser
|
12
|
+
|
13
|
+
GFF3FILE3="test/data/gff/test-cds.gff3"
|
14
|
+
|
15
|
+
# Extra assembly/translation checks on CDS records from GFF3FILE3, which is
# loaded without a separate FASTA file.
describe GFFdb, "Assemble CDS (extra checks)" do
  before :all do
    gffdb = Bio::GFFbrowser::GFFdb.new(GFF3FILE3)
    @gff = gffdb.assembler
    @gff.parse
  end

  # Walks every CDS reported by the assembler and returns two hashes keyed
  # by CDS id: [record lists, container components].
  def collect_cds
    cdslist = {}
    componentlist = {}
    @gff.each_CDS do |id, reclist, component|
      componentlist[id] = component
      cdslist[id] = reclist
    end
    [cdslist, componentlist]
  end

  it "should translate gene MhA1_Contig1040.frz3.gene29" do
    contigsequence = @gff.sequencelist["MhA1_Contig1040"]
    cdslist, componentlist = collect_cds
    # NOTE(review): the example title says gene29 but this id has no number
    # suffix - confirm against the ids in the test data file.
    name = "cds:MhA1_Contig1040.frz3.gene"
    recs = cdslist[name]
    component = componentlist[name]
    ntseq = @gff.assemble(contigsequence,component.start,recs,:raw=>true)
    ntseq.should == "TTAATTAATTTGCCTAGAAAAACAAAGGCATAACATGCTTGCAGTCATCATACGGTAAGAGAGAAACCAACGATATGTTAATAATGTTGATGGGGGAATATCCTCATTAGAATTCTTTTTTGGGTGAATTGAAATTGCCATATTATTAGTATTATTAGAAAATATTAAATTTGTTGATAA"
    ntseq = @gff.assemble(contigsequence,component.start,recs,:codonize=>true)
    ntseq.should == "TTATCAACAAATTTAATATTTTCTAATAATACTAATAATATGGCAATTTCAATTCACCCAAAAAAGAATTCTAATGAGGATATTCCCCCATCAACATTATTAACATATCGTTGGTTTCTCTCTTACCGTATGATGACTGCAAGCATGTTATGCCTTTGTTTTTCTAGGCAAATTAATTAA"
    aaseq = @gff.assembleAA(contigsequence,component.start,recs)
    aaseq.should == "LSTNLIFSNNTNNMAISIHPKKNSNEDIPPSTLLTYRWFLSYRMMTASMLCLCFSRQIN*"
  end

  it "should translate gene MhA1_Contig2992.frz3.gene1" do
    contigsequence = @gff.sequencelist["MhA1_Contig2992"]
    cdslist, componentlist = collect_cds
    name = "cds:MhA1_Contig2992.frz3.gene1"
    recs = cdslist[name]
    component = componentlist[name]
    # ntseq = @gff.assemble(@contigsequence,component.start,recs,:raw=>true)
    # ntseq.should == ""
    ntseq = @gff.assemble(contigsequence,component.start,recs,:codonize=>true)
    ntseq.should == "AAAATTAATAAAAAAATAAATGATAATTCTTTTAATATTCAATCTGATTCGAATGAAAATTTGTTTAATGATGGAATTAATTCTGAACAAAATGAAGACAATATAGCAACAAAAAAAGGCAACAAAAAATTCGGTAAAAATCAAAAAGAAGGAAATAAAGAGTTGGATATTCAAAGTGAAGGTTTTGATAATAATGAAATACCTTCAAAAGAAAGCAAAAAACAAATAAGTAATTTTGGGGATAATGAAAGTGAATATGAAAAAGAAGAGGATAATAGAAAAAAGAAAGGGAAAAAAGGAATGATAGAAAAGTATGAATTAGGAAGGAATAAAGGAAGGGATAAAAATGAAAGAAATAAGGCTTCTGAAAGGTTTGATGAGCAGAATCAAGACAGAAATAATCAACGTGATAGTTTTGATTCTGGCAATAATGATAAATCACAAAGAGGCTTAGATAGCGGCACATTAGATGGAACAAATAATTTAAAAAGATCGAATGATGATCAATTACCAGAATTTTTGAAAACGGCCAGTCTCTCAGAGCGTCAGAAATTTCTTCAACTTGAAGCAGAAAATGACAGGTCCAAGTCTTCTATACGAAGAGATAAACAGAATTGGGCTGATCAACAAGGGCAGAGAATTTCTGATCTTTATAAACAATTTCAACAATCTTTACAACAAAAAGAAAAACAATTTAAAAGTGAACGTCAACGAAATGTTCAAATTAAATTAAGCAGAAATGCACAGAATGTTGATAAAAGAATTCAGGATCTTCTGAATAATCCTGATATTGCTGAAAGAGCTTTAATTCTTCAAATTGAACAAATCCTCGGCGGTACAGACGATAGTATTCGTCAGGAATTACAAAGACAAATATCTGTTATTGGACCATTAGATGGAAATATACCGCCAAATCTTACATAG"
    aaseq = @gff.assembleAA(contigsequence,component.start,recs)
    aaseq.should == "KINKKINDNSFNIQSDSNENLFNDGINSEQNEDNIATKKGNKKFGKNQKEGNKELDIQSEGFDNNEIPSKESKKQISNFGDNESEYEKEEDNRKKKGKKGMIEKYELGRNKGRDKNERNKASERFDEQNQDRNNQRDSFDSGNNDKSQRGLDSGTLDGTNNLKRSNDDQLPEFLKTASLSERQKFLQLEAENDRSKSSIRRDKQNWADQQGQRISDLYKQFQQSLQQKEKQFKSERQRNVQIKLSRNAQNVDKRIQDLLNNPDIAERALILQIEQILGGTDDSIRQELQRQISVIGPLDGNIPPNLT*"
  end
end
|
60
|
+
|
61
|
+
|
62
|
+
|
@@ -0,0 +1,291 @@
|
|
1
|
+
# RSpec for BioRuby-GFF3-Plugin. Run with something like:
|
2
|
+
#
|
3
|
+
# ruby -I ../bioruby/lib/ ~/.gems/bin/spec spec/gff3_assemble_spec.rb
|
4
|
+
#
|
5
|
+
# Copyright (C) 2010 Pjotr Prins <pjotr.prins@thebird.nl>
|
6
|
+
#
|
7
|
+
$: << "../lib"
|
8
|
+
|
9
|
+
require 'bio/db/gff/gffdb'
|
10
|
+
|
11
|
+
include Bio::GFFbrowser
|
12
|
+
|
13
|
+
FASTAFILE="test/data/gff/MhA1_Contig1133.fa"
|
14
|
+
GFF3FILE="test/data/gff/MhA1_Contig1133.gff3"
|
15
|
+
|
16
|
+
describe GFFdb, "Assemble CDS" do
|
17
|
+
before :all do
|
18
|
+
gffdb = Bio::GFFbrowser::GFFdb.new(GFF3FILE, :fasta_filename => FASTAFILE)
|
19
|
+
@gff = gffdb.assembler
|
20
|
+
@gff.parse
|
21
|
+
@contigsequence = @gff.sequencelist["MhA1_Contig1133"]
|
22
|
+
@componentlist = {}
|
23
|
+
@cdslist = {}
|
24
|
+
@gff.each_CDS do | id, reclist, component |
|
25
|
+
@componentlist[id] = component
|
26
|
+
@cdslist[id] = reclist
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
it "should have the single contig" do
|
31
|
+
@gff.sequencelist.size.should == 1
|
32
|
+
@gff.sequencelist["MhA1_Contig1133"].should_not == nil
|
33
|
+
@gff.sequencelist["MhA1_Contig1133"].size.should == 33905
|
34
|
+
end
|
35
|
+
it "should have a container component" do
|
36
|
+
component = @componentlist['cds:MhA1_Contig1133.frz3.gene4']
|
37
|
+
component.start.should == 7838
|
38
|
+
component.end.should == 8740
|
39
|
+
end
|
40
|
+
it "should have CDS 7838:7980" do
|
41
|
+
recs = @cdslist['cds:MhA1_Contig1133.frz3.gene4']
|
42
|
+
cds0 = recs[0]
|
43
|
+
cds0.start.should == 7838
|
44
|
+
cds0.end.should == 7980
|
45
|
+
cds0.frame.should == 0
|
46
|
+
cds0.seqname.should == 'MhA1_Contig1133'
|
47
|
+
end
|
48
|
+
it "should have CDS 8065:8308" do
|
49
|
+
recs = @cdslist['cds:MhA1_Contig1133.frz3.gene4']
|
50
|
+
cds1 = recs[1]
|
51
|
+
cds1.start.should == 8065
|
52
|
+
cds1.end.should == 8308
|
53
|
+
cds1.frame.should == 1
|
54
|
+
cds1.strand.should == '+'
|
55
|
+
cds1.seqname.should == 'MhA1_Contig1133'
|
56
|
+
end
|
57
|
+
# From Wormbase website http://www.wormbase.org/db/gb2/gbrowse/m_hapla/?name=MhA1_Contig1133%3A7838..8740
|
58
|
+
# >MhA1_Contig1133:7838..8740
|
59
|
+
# atgcgtcctttaacagatgaagaaactgaaaagtttttcaaaaaactttcaaattatatt
|
60
|
+
# ggtgacaatattaaacttttattggaaagagaagatggagaatatgtttttcgtttacat
|
61
|
+
# aaagacagagtttattattgcaggtttttttaaaattattttatatttaaattaggtctc
|
62
|
+
# aatctttataggggattttgtttttgttatttttttttggtttttag>tgaaaaattaatg
|
63
|
+
# cgacaagcagcatgtattggacgtaaacaattgggatcttttggaacttgtttgggtaaa
|
64
|
+
# ttcacaaaaggagggtctttctttcttcatataacatcattggattatttggcaccttat
|
65
|
+
# gctttagcaaaaatttggttaaaaccacaagctgaacaacaatttttatatggaaataat
|
66
|
+
# attgttaaatctggtgttggaagaatgagtgaagggattgaagaaaaacaagtaaatatt
|
67
|
+
# taattattttttttaaaatggattcctttacttctcaattaaatattaaaagcatatctg
|
68
|
+
# tagaagaggttatttatctttaaatcgaaatatacaggaataaataaaaatttaagaaat
|
69
|
+
# cataatttagaattctttttctggttatgttagattatttttaaatttttttgtaatttt
|
70
|
+
# tttttcgtaatttttttatgagcaaatcccttctctcttaaatattttaataaaaatcta
|
71
|
+
# attttataaattataattattttttagggtattattatttataatatgtcagatttacca
|
72
|
+
# ttgggttttggagtggctgcaaagggaacattatcttgtagaaaagtagatcctacagct
|
73
|
+
# ttagttgttttacatcaatcagatttgggtgaatatattcgaaatgaagagggattaatt
|
74
|
+
|
75
|
+
it "should translate CDS 7838:7980 (in frame 0, + strand)" do
|
76
|
+
recs = @cdslist['cds:MhA1_Contig1133.frz3.gene4']
|
77
|
+
component = @componentlist['cds:MhA1_Contig1133.frz3.gene4']
|
78
|
+
cds0 = recs[0]
|
79
|
+
cds0.seqname.should == 'MhA1_Contig1133'
|
80
|
+
seq = @gff.assemble(@contigsequence,component.start,[cds0])
|
81
|
+
seq.size.should == 143
|
82
|
+
seq.should == "ATGCGTCCTTTAACAGATGAAGAAACTGAAAAGTTTTTCAAAAAACTTTCAAATTATATTGGTGACAATATTAAACTTTTATTGGAAAGAGAAGATGGAGAATATGTTTTTCGTTTACATAAAGACAGAGTTTATTATTGCAG"
|
83
|
+
aaseq = @gff.assembleAA(@contigsequence,component.start,[cds0])
|
84
|
+
aaseq.should == "MRPLTDEETEKFFKKLSNYIGDNIKLLLEREDGEYVFRLHKDRVYYC"
|
85
|
+
end
|
86
|
+
it "should translate CDS 8065:8308 (in frame 1, + strand)" do
|
87
|
+
recs = @cdslist['cds:MhA1_Contig1133.frz3.gene4']
|
88
|
+
component = @componentlist['cds:MhA1_Contig1133.frz3.gene4']
|
89
|
+
cds1 = recs[1]
|
90
|
+
seq = @gff.assemble(@contigsequence,component.start,[cds1], :phase => false)
|
91
|
+
seq.size.should == 244
|
92
|
+
seq.should == "TGAAAAATTAATGCGACAAGCAGCATGTATTGGACGTAAACAATTGGGATCTTTTGGAACTTGTTTGGGTAAATTCACAAAAGGAGGGTCTTTCTTTCTTCATATAACATCATTGGATTATTTGGCACCTTATGCTTTAGCAAAAATTTGGTTAAAACCACAAGCTGAACAACAATTTTTATATGGAAATAATATTGTTAAATCTGGTGTTGGAAGAATGAGTGAAGGGATTGAAGAAAAACAA"
|
93
|
+
seq = @gff.assemble(@contigsequence,component.start,[cds1])
|
94
|
+
seq.should == "GAAAAATTAATGCGACAAGCAGCATGTATTGGACGTAAACAATTGGGATCTTTTGGAACTTGTTTGGGTAAATTCACAAAAGGAGGGTCTTTCTTTCTTCATATAACATCATTGGATTATTTGGCACCTTATGCTTTAGCAAAAATTTGGTTAAAACCACAAGCTGAACAACAATTTTTATATGGAAATAATATTGTTAAATCTGGTGTTGGAAGAATGAGTGAAGGGATTGAAGAAAAACAA"
|
95
|
+
aaseq = @gff.assembleAA(@contigsequence,component.start,[cds1])
|
96
|
+
# note it should handle the frame shift and direction!
|
97
|
+
aaseq.should == "EKLMRQAACIGRKQLGSFGTCLGKFTKGGSFFLHITSLDYLAPYALAKIWLKPQAEQQFLYGNNIVKSGVGRMSEGIEEKQ"
|
98
|
+
end
|
99
|
+
it "should translate CDS3 (in frame 0, + strand)" do
|
100
|
+
recs = @cdslist['cds:MhA1_Contig1133.frz3.gene4']
|
101
|
+
component = @componentlist['cds:MhA1_Contig1133.frz3.gene4']
|
102
|
+
cds3 = recs[2]
|
103
|
+
seq = @gff.assemble(@contigsequence,component.start,[cds3], :phase => false)
|
104
|
+
seq.size.should == 156
|
105
|
+
seq.should == "GGTATTATTATTTATAATATGTCAGATTTACCATTGGGTTTTGGAGTGGCTGCAAAGGGAACATTATCTTGTAGAAAAGTAGATCCTACAGCTTTAGTTGTTTTACATCAATCAGATTTGGGTGAATATATTCGAAATGAAGAGGGATTAATTTAA"
|
106
|
+
aaseq = @gff.assembleAA(@contigsequence,component.start,[cds3])
|
107
|
+
# note it should handle the frame shift and direction!
|
108
|
+
aaseq.should == "GIIIYNMSDLPLGFGVAAKGTLSCRKVDPTALVVLHQSDLGEYIRNEEGLI*"
|
109
|
+
end
|
110
|
+
it "should assemble 3 CDSs for MhA1_Contig1133.frz3.gene4" do
|
111
|
+
recs = @cdslist['cds:MhA1_Contig1133.frz3.gene4']
|
112
|
+
component = @componentlist['cds:MhA1_Contig1133.frz3.gene4']
|
113
|
+
seq = @gff.assemble(@contigsequence,component.start,recs, :phase=>false)
|
114
|
+
seq.size.should == 543
|
115
|
+
seq.should == "ATGCGTCCTTTAACAGATGAAGAAACTGAAAAGTTTTTCAAAAAACTTTCAAATTATATTGGTGACAATATTAAACTTTTATTGGAAAGAGAAGATGGAGAATATGTTTTTCGTTTACATAAAGACAGAGTTTATTATTGCAGTGAAAAATTAATGCGACAAGCAGCATGTATTGGACGTAAACAATTGGGATCTTTTGGAACTTGTTTGGGTAAATTCACAAAAGGAGGGTCTTTCTTTCTTCATATAACATCATTGGATTATTTGGCACCTTATGCTTTAGCAAAAATTTGGTTAAAACCACAAGCTGAACAACAATTTTTATATGGAAATAATATTGTTAAATCTGGTGTTGGAAGAATGAGTGAAGGGATTGAAGAAAAACAAGGTATTATTATTTATAATATGTCAGATTTACCATTGGGTTTTGGAGTGGCTGCAAAGGGAACATTATCTTGTAGAAAAGTAGATCCTACAGCTTTAGTTGTTTTACATCAATCAGATTTGGGTGAATATATTCGAAATGAAGAGGGATTAATTTAA"
|
116
|
+
seq = @gff.assemble(@contigsequence,component.start,recs)
|
117
|
+
seq.size.should == 543
|
118
|
+
seq.should == "ATGCGTCCTTTAACAGATGAAGAAACTGAAAAGTTTTTCAAAAAACTTTCAAATTATATTGGTGACAATATTAAACTTTTATTGGAAAGAGAAGATGGAGAATATGTTTTTCGTTTACATAAAGACAGAGTTTATTATTGCAGTGAAAAATTAATGCGACAAGCAGCATGTATTGGACGTAAACAATTGGGATCTTTTGGAACTTGTTTGGGTAAATTCACAAAAGGAGGGTCTTTCTTTCTTCATATAACATCATTGGATTATTTGGCACCTTATGCTTTAGCAAAAATTTGGTTAAAACCACAAGCTGAACAACAATTTTTATATGGAAATAATATTGTTAAATCTGGTGTTGGAAGAATGAGTGAAGGGATTGAAGAAAAACAAGGTATTATTATTTATAATATGTCAGATTTACCATTGGGTTTTGGAGTGGCTGCAAAGGGAACATTATCTTGTAGAAAAGTAGATCCTACAGCTTTAGTTGTTTTACATCAATCAGATTTGGGTGAATATATTCGAAATGAAGAGGGATTAATTTAA"
|
119
|
+
aaseq = @gff.assembleAA(@contigsequence,component.start,recs)
|
120
|
+
aaseq.should == "MRPLTDEETEKFFKKLSNYIGDNIKLLLEREDGEYVFRLHKDRVYYCSEKLMRQAACIGRKQLGSFGTCLGKFTKGGSFFLHITSLDYLAPYALAKIWLKPQAEQQFLYGNNIVKSGVGRMSEGIEEKQGIIIYNMSDLPLGFGVAAKGTLSCRKVDPTALVVLHQSDLGEYIRNEEGLI*"
|
121
|
+
end
|
122
|
+
# > class=Sequence position=MhA1_Contig1133:27463..29904 (- strand); shown in frame 1!
|
123
|
+
# ATGGACCATC ATGCATTGGT GGAGGAATTA CCAGAAATTG AAAAATTAAC TCCTCAAGAA CGTATTGCAT TAGCTAGAGA
|
124
|
+
# ACGCCGTGCT GAACAACTTC GACAGAATGC TGCACGGGAG GCTCAATTGC CAATGCCTGC ACAGCGCCGG CCTCGTCTTC
|
125
|
+
# GATTTACACC AGATGTTGCT TTACTTGAGG CAACAGTTAG GGGTGATACC CAAGAAGGTT ATACATAAAG ATTATTGATT
|
126
|
+
# TTAAATGAAT TTATTTATTT TTTAGTTGAA AGACTTTTAA TGGAAGGTGT CAATGCTGAT TCACATAATG AGGATGGATT
|
127
|
+
# AACACCTTTA CATCAGGCAA AAACCAAATT AATTTTTTTA AATTTATTTT TAGTGTGCCA TTGACAATAA TGAAAGAATT
|
128
|
+
# GTTCGTCTTC TGCTTAGGTA CGGAGCTTGT GTTAATGCCA AAGACACTGA ACTTTGGACA CCATTGCACG CAGCTGCATG
|
129
|
+
# TTGTGCTTAT ATTGATATTG TTCGATTGCT TATTGCACAG TTAGTTTTTT TTTAATTTTT TTTTTAAATA AATTTCTTAA
|
130
|
+
# GTTTTACAGA AATATTTATT TTAAACAAAC GGGACTTCCT TTTAAATTTT TTGTATTTTT AATCTTTACG TATTTTCATT
|
131
|
+
# TAATAATTAA TTCGTCTTCT AAAAGTTCGT AAGTTTTGTG GTTTAGTTTA ATGGGTAAAC ATCCAGTTTT TAGGTCATCG
|
132
|
+
# ATTTTTATTT TTGCGTCATA TTTTATCGAA AACTTCTTTC ATATTAAAAA TTTCTTTTTA AGCAACGCAG ATTTACTAGC
|
133
|
+
# AGTAAATGCA GATGGTAATA TGCCTTATGA TATTTGTGAT GATGAACAAA CCCTTGACCT TATTGAATCT GAAATGGCTG
|
134
|
+
# CTAGAGGAAT TACACAAGAA ATGATTGATG AAAGAAGACA ACAACCAGAA AGGGAAATGT TAAATGATAT GAAAATTTTA
|
135
|
+
# CATCAAAGAG GATTACCTTT AGATCAAAGA AATTCTGTTG ATAAATCTAC TTTTGTAAGT TTTTCTGGAG AAAGGGAAAT
|
136
|
+
# TTATGTAAAG ATTATTATGA AAGGATTATT ACAGTTTTAT TCCTTTTTAG TTACATATAG CAGCAGCTAA TGGTTATTAT
|
137
|
+
# GATGTTGCTG CTTTCCTTCT TCGTTGTAAT GTTTCTCCAG CATTGAGAGA TATAGATTTG TGGCAACCAA TTCATGCAGC
|
138
|
+
# TGCTTCTTGG AATCAACCAG ACTTAATCGA GCTTTTATGC GAATATGGGG CTGATATAAA TGCAAAAACT GGAGCTGGGG
|
139
|
+
# AAAGCCCTTT AGGTTTATTT TATTGAATCT TATAATTTAT AAATATTTGC TATTAAGTAT GAGGGGAGAG GAACTAACAA
|
140
|
+
# TAAGGAATTA AATTTCTCAA TATCAGGATT TTTCGGTTCA CACCCATTTT CTTAAGACCT TTAATTTTTC TCAAAATATG
|
141
|
+
# TATGTGACCA CGTCGGGAGG CTTTTTTATT TTTACATGGC TATTTTAAGA AAGGCTAGAA TTTTGACATA CTTTTAACTT
|
142
|
+
# ATCGCCTTCC TAACTATTTT CTGTCTATAT ATTTTTTTAA ATTAAGAATT AACTGAAGAT GAACCAACCC AACAAGTAAT
|
143
|
+
# TAGAACAATC GCTCAGACAG AAGCAAGGAG ACGGCGTGGT CCAGGTGGTG GTTACTTTGG TGTTCGTGAT TCTCGACGAC
|
144
|
+
# AAAGCCGAAA GTAATTTTAA ATTTATATTT TCTTTTCATC TTTTTATCTA GAAGAAAAAA GTTTGAATCT CCTCAACAAC
|
145
|
+
# CACCTTCAAC ATTAGAAAAT CCTTTCTCAG CTAGAGGTGC AATTAGACGA CAATCATTGC GAGATCGTAG TGGAATGTCA
|
146
|
+
# TTAGCTCGTT TGGAAGCACA AAGAGAGGGT TCTGACCTTA TTAGAAGTTA TAATAGTAAA GAAGACCTTT CTTCTAATAC
|
147
|
+
# AGCGGTTTGT TTTTTAAAAT TGTAATTTTT TCTTAATTTT TAGGATGATT CTTTAAATGT TGGAAGTTCT TCATATCTCA
|
148
|
+
# ACAATCCAAC AGCCTCGGCT AGTGCTTCCT CTTCAGCATT ACACGGAACT CCACATCAAC AACAACGTCG TGAATCTCCA
|
149
|
+
# CCTAAACGTG CATTAATGGC TAGAAGTGCT TCTCATCAAA AACAAAAACA ACAAATGTCT CCAGATGAAT GGCTGAAAAA
|
150
|
+
# ATTAGAAGCA GATTCTGCAG GTTTTCGAGA TAATGATGGA GAAGATGGTG AATTACAATC TGAACTTAAA GGAGGACAAA
|
151
|
+
# GAATGAAGAG TGGTGGTGGT GGAGGAGCGA GAGGTCAGCA AGGTGAATTA AAATATTTTT TTTGAATTTT ATATTTATTT
|
152
|
+
# TTCGTTTAAT AGAAATGAAT GGTGGTCCAA CAGCAACATT TGGTGGAGCT TCAAAACAAC AATTAGCAAT GGGCTCTGGA
|
153
|
+
# CCCAATAGAC GGCGCAAACA AGGATGTTGC TCTGTTTTGT GA
|
154
|
+
it "should assemble a reverse CDS in MhA1_Contig1133.frz3.gene11" do
|
155
|
+
recs = @cdslist['cds:MhA1_Contig1133.frz3.gene11']
|
156
|
+
recs.size.should == 8
|
157
|
+
component = @componentlist['cds:MhA1_Contig1133.frz3.gene11']
|
158
|
+
# 193 bp from MhA1_Contig1133:27,981..28,173
|
159
|
+
# >MhA1_Contig1133:27981..28173
|
160
|
+
# cgctgtattagaagaaaggtcttctttactattataacttctaataaggtcagaaccctc
|
161
|
+
# tctttgtgcttccaaacgagctaatgacattccactacgatctcgcaatgattgtcgtct
|
162
|
+
# aattgcacctctagctgagaaaggattttctaatgttgaaggtggttgttgaggagattc
|
163
|
+
# aaacttttttctt
|
164
|
+
cds1 = recs[5]
|
165
|
+
cds1.start.should == 27981
|
166
|
+
cds1.frame.should == 1
|
167
|
+
cds1.strand.should == '-'
|
168
|
+
seq = @gff.assemble(@contigsequence,component.start,[cds1],:phase=>true,:reverse=>true)
|
169
|
+
seq.should == "TCTTTTTTCAAACTTAGAGGAGTTGTTGGTGGAAGTTGTAATCTTTTAGGAAAGAGTCGATCTCCACGTTAATCTGCTGTTAGTAACGCTCTAGCATCACCTTACAGTAATCGAGCAAACCTTCGTGTTTCTCTCCCAAGACTGGAATAATCTTCAATATTATCATTTCTTCTGGAAAGAAGATTATGTCGC"
|
170
|
+
seq.size.should == 192
|
171
|
+
seq = @gff.assemble(@contigsequence,component.start,[cds1],:phase=>true,:reverse=>true,:complement=>true)
|
172
|
+
seq.should == "AGAAAAAAGTTTGAATCTCCTCAACAACCACCTTCAACATTAGAAAATCCTTTCTCAGCTAGAGGTGCAATTAGACGACAATCATTGCGAGATCGTAGTGGAATGTCATTAGCTCGTTTGGAAGCACAAAGAGAGGGTTCTGACCTTATTAGAAGTTATAATAGTAAAGAAGACCTTTCTTCTAATACAGCG"
|
173
|
+
seq.size.should == 192
|
174
|
+
aaseq = @gff.assembleAA(@contigsequence,component.start,[cds1])
|
175
|
+
# note it should handle the frame shift and direction!
|
176
|
+
# >EMBOSS_001_4
|
177
|
+
# RKKFESPQQPPSTLENPFSARGAIRRQSLRDRSGMSLARLEAQREGSDLIRSYNSKEDLSSNTA
|
178
|
+
aaseq.should == "RKKFESPQQPPSTLENPFSARGAIRRQSLRDRSGMSLARLEAQREGSDLIRSYNSKEDLSSNTA"
|
179
|
+
end
|
180
|
+
it "should take the 6th CDS in MhA1_Contig1133.frz3.gene11 (which is 3rd on DNA)" do
|
181
|
+
# >MhA1_Contig1133:27981..28173
|
182
|
+
# cgctgtattagaagaaaggtcttctttactattataacttctaataaggtcagaaccctc
|
183
|
+
# tctttgtgcttccaaacgagctaatgacattccactacgatctcgcaatgattgtcgtct
|
184
|
+
# aattgcacctctagctgagaaaggattttctaatgttgaaggtggttgttgaggagattc
|
185
|
+
# aaacttttttctt
|
186
|
+
recs = @cdslist['cds:MhA1_Contig1133.frz3.gene11']
|
187
|
+
component = @componentlist['cds:MhA1_Contig1133.frz3.gene11']
|
188
|
+
cds2 = recs[5].clone
|
189
|
+
# p cds2
|
190
|
+
cds2.start.should == 27981
|
191
|
+
cds2.frame.should == 1
|
192
|
+
cds2.strand.should == '-'
|
193
|
+
seq = @gff.assemble(@contigsequence,component.start,[cds2],:complement=>true)
|
194
|
+
seq.should == "GCGACATAATCTTCTTTCCAGAAGAAATGATAATATTGAAGATTATTCCAGTCTTGGGAGAGAAACACGAAGGTTTGCTCGATTACTGTAAGGTGATGCTAGAGCGTTACTAACAGCAGATTAACGTGGAGATCGACTCTTTCCTAAAAGATTACAACTTCCACCAACAACTCCTCTAAGTTTGAAAAAAGAA"
|
195
|
+
aaseq = @gff.assembleAA(@contigsequence,component.start,[cds2])
|
196
|
+
# note it should handle the frame shift and direction!
|
197
|
+
# >27981..28173_4 RKKFESPQQPPSTLENPFSARGAIRRQSLRDRSGMSLARLEAQREGSDLIRSYNSKEDLSSNTA
|
198
|
+
aaseq.should == "RKKFESPQQPPSTLENPFSARGAIRRQSLRDRSGMSLARLEAQREGSDLIRSYNSKEDLSSNTA"
|
199
|
+
end
|
200
|
+
# Exercises the CDS record at index 0 of gene11, which the assertions below
# pin to the reverse ('-') strand with frame 0. Three views of that record are
# checked: the :raw slice, the :codonize result and the amino acid translation.
it "should assemble the 1st reverse CDS in MhA1_Contig1133.frz3.gene11" do
  gene_id     = 'cds:MhA1_Contig1133.frz3.gene11'
  cds_records = @cdslist[gene_id]
  component   = @componentlist[gene_id]
  # Work on a copy so the shared fixture record is left untouched.
  first_cds   = cds_records[0].clone

  # Sanity-check the fixture record before assembling anything from it.
  first_cds.start.should == 29710
  first_cds.frame.should == 0
  first_cds.strand.should == '-'

  # :raw output, exactly as expected for this record (195 bases).
  expected_raw = "TGTTGCCTCAAGTAAAGCAACATCTGGTGTAAATCGAAGACGAGGCCGGCGCTGTGCAGGCATTGGCAATTGAGCCTCCCGTGCAGCATTCTGTCGAAGTTGTTCAGCACGGCGTTCTCTAGCTAATGCAATACGTTCTTGAGGAGTTAATTTTTCAATTTCTGGTAATTCCTCCACCAATGCATGATGGTCCAT"
  raw_seq = @gff.assemble(@contigsequence, component.start, [first_cds], :raw => true)
  raw_seq.size.should == 195
  raw_seq.should == expected_raw

  # The expected :codonize literal reads as the reverse complement of the
  # :raw literal above (it starts with ATG where the raw one ends with CAT).
  expected_codons = "ATGGACCATCATGCATTGGTGGAGGAATTACCAGAAATTGAAAAATTAACTCCTCAAGAACGTATTGCATTAGCTAGAGAACGCCGTGCTGAACAACTTCGACAGAATGCTGCACGGGAGGCTCAATTGCCAATGCCTGCACAGCGCCGGCCTCGTCTTCGATTTACACCAGATGTTGCTTTACTTGAGGCAACA"
  codon_seq = @gff.assemble(@contigsequence, component.start, [first_cds], :codonize => true)
  codon_seq.should == expected_codons

  # Translation of the same single CDS record.
  protein = @gff.assembleAA(@contigsequence, component.start, [first_cds])
  protein.should == "MDHHALVEELPEIEKLTPQERIALARERRAEQLRQNAAREAQLPMPAQRRPRLRFTPDVALLEAT"
end
|
215
|
+
# Exercises the CDS record at index 2 (the third one) of gene11, which the
# assertions below pin to the reverse ('-') strand with frame 1. Checks the
# :raw slice, the :codonize result and the amino acid translation.
it "should assemble the 3rd reverse CDS in MhA1_Contig1133.frz3.gene11" do
  gene_id     = 'cds:MhA1_Contig1133.frz3.gene11'
  cds_records = @cdslist[gene_id]
  component   = @componentlist[gene_id]
  # Work on a copy so the shared fixture record is left untouched.
  third_cds   = cds_records[2].clone

  # Reference: reverse complement of the raw slice, grouped in tens:
  # CAACGCAG ATTTACTAGC AGTAAATGCA GATGGTAATA TGCCTTATGA TATTTGTGAT GATGAACAAA CCCTTGACCT TATTGAATCT GAAATGGCTG CTAGAGGAAT TACACAAGAA ATGATTGATG AAAGAAGACA ACAACCAGAA AGGGAAATGT TAAATGATAT GAAAATTTTA CATCAAAGAG GATTACCTTT AGATCAAAGA AATTCTGTTG ATAAATCTAC TTTTGTAAGT TTTTCTGGAG AAAGGGAAAT TTAT
  third_cds.frame.should == 1
  third_cds.strand.should == '-'

  # :raw output, exactly as expected for this record.
  expected_raw = "ATAAATTTCCCTTTCTCCAGAAAAACTTACAAAAGTAGATTTATCAACAGAATTTCTTTGATCTAAAGGTAATCCTCTTTGATGTAAAATTTTCATATCATTTAACATTTCCCTTTCTGGTTGTTGTCTTCTTTCATCAATCATTTCTTGTGTAATTCCTCTAGCAGCCATTTCAGATTCAATAAGGTCAAGGGTTTGTTCATCATCACAAATATCATAAGGCATATTACCATCTGCATTTACTGCTAGTAAATCTGCGTTG"
  raw_seq = @gff.assemble(@contigsequence, component.start, [third_cds], :raw => true)
  raw_seq.should == expected_raw

  # The expected :codonize literal matches the reference above minus its
  # leading C, which is consistent with the record's frame of 1.
  expected_codons = "AACGCAGATTTACTAGCAGTAAATGCAGATGGTAATATGCCTTATGATATTTGTGATGATGAACAAACCCTTGACCTTATTGAATCTGAAATGGCTGCTAGAGGAATTACACAAGAAATGATTGATGAAAGAAGACAACAACCAGAAAGGGAAATGTTAAATGATATGAAAATTTTACATCAAAGAGGATTACCTTTAGATCAAAGAAATTCTGTTGATAAATCTACTTTTGTAAGTTTTTCTGGAGAAAGGGAAATTTAT"
  codon_seq = @gff.assemble(@contigsequence, component.start, [third_cds], :codonize => true)
  codon_seq.should == expected_codons

  # The translation has to cope with both the frame shift and the strand.
  protein = @gff.assembleAA(@contigsequence, component.start, [third_cds])
  protein.should == "NADLLAVNADGNMPYDICDDEQTLDLIESEMAARGITQEMIDERRQQPEREMLNDMKILHQRGLPLDQRNSVDKSTFVSFSGEREIY"
end
|
232
|
+
# Assembles every CDS record of gene11 into one coding sequence (with the
# :phase, :reverse and :complement options set) and checks both that
# nucleotide sequence and the protein translated from the same records.
it "should assemble the protein sequence for MhA1_Contig1133.frz3.gene11" do
  gene_id     = 'cds:MhA1_Contig1133.frz3.gene11'
  cds_records = @cdslist[gene_id]
  component   = @componentlist[gene_id]

  # Full coding sequence; the expected literal runs from ATG to a TGA stop.
  coding_seq = @gff.assemble(
    @contigsequence, component.start, cds_records,
    :phase => true, :reverse => true, :complement => true
  )
  coding_seq.should == "ATGGACCATCATGCATTGGTGGAGGAATTACCAGAAATTGAAAAATTAACTCCTCAAGAACGTATTGCATTAGCTAGAGAACGCCGTGCTGAACAACTTCGACAGAATGCTGCACGGGAGGCTCAATTGCCAATGCCTGCACAGCGCCGGCCTCGTCTTCGATTTACACCAGATGTTGCTTTACTTGAGGCAACATGTGCCATTGACAATAATGAAAGAATTGTTCGTCTTCTGCTTAGGTACGGAGCTTGTGTTAATGCCAAAGACACTGAACTTTGGACACCATTGCACGCAGCTGCATGTTGTGCTTATATTGATATTGTTCGATTGCTTATTGCACACAACGCAGATTTACTAGCAGTAAATGCAGATGGTAATATGCCTTATGATATTTGTGATGATGAACAAACCCTTGACCTTATTGAATCTGAAATGGCTGCTAGAGGAATTACACAAGAAATGATTGATGAAAGAAGACAACAACCAGAAAGGGAAATGTTAAATGATATGAAAATTTTACATCAAAGAGGATTACCTTTAGATCAAAGAAATTCTGTTGATAAATCTACTTTTGTAAGTTTTTCTGGAGAAAGGGAAATTTATTTACATATAGCAGCAGCTAATGGTTATTATGATGTTGCTGCTTTCCTTCTTCGTTGTAATGTTTCTCCAGCATTGAGAGATATAGATTTGTGGCAACCAATTCATGCAGCTGCTTCTTGGAATCAACCAGACTTAATCGAGCTTTTATGCGAATATGGGGCTGATATAAATGCAAAAACTGGAGCTGGGGAAAGCCCTTTAGAATTAACTGAAGATGAACCAACCCAACAAGTAATTAGAACAATCGCTCAGACAGAAGCAAGGAGACGGCGTGGTCCAGGTGGTGGTTACTTTGGTGTTCGTGATTCTCGACGACAAAGCCGAAAAAGAAAAAAGTTTGAATCTCCTCAACAACCACCTTCAACATTAGAAAATCCTTTCTCAGCTAGAGGTGCAATTAGACGACAATCATTGCGAGATCGTAGTGGAATGTCATTAGCTCGTTTGGAAGCACAAAGAGAGGGTTCTGACCTTATTAGAAGTTATAATAGTAAAGAAGACCTTTCTTCTAATACAGCGGATGATTCTTTAAATGTTGGAAGTTCTTCATATCTCAACAATCCAACAGCCTCGGCTAGTGCTTCCTCTTCAGCATTACACGGAACTCCACATCAACAACAACGTCGTGAATCTCCACCTAAACGTGCATTAATGGCTAGAAGTGCTTCTCATCAAAAACAAAAACAACAAATGTCTCCAGATGAATGGCTGAAAAAATTAGAAGCAGATTCTGCAGGTTTTCGAGATAATGATGGAGAAGATGGTGAATTACAATCTGAACTTAAAGGAGGACAAAGAATGAAGAGTGGTGGTGGTGGAGGAGCGAGAGGTCAGCAAGAAATGAATGGTGGTCCAACAGCAACATTTGGTGGAGCTTCAAAACAACAATTAGCAATGGGCTCTGGACCCAATAGACGGCGCAAACAAGGATGTTGCTCTGTTTTGTGA"

  # Protein for the whole gene; the trailing '*' is part of the expected value.
  protein = @gff.assembleAA(@contigsequence, component.start, cds_records)
  protein.should == "MDHHALVEELPEIEKLTPQERIALARERRAEQLRQNAAREAQLPMPAQRRPRLRFTPDVALLEATCAIDNNERIVRLLLRYGACVNAKDTELWTPLHAAACCAYIDIVRLLIAHNADLLAVNADGNMPYDICDDEQTLDLIESEMAARGITQEMIDERRQQPEREMLNDMKILHQRGLPLDQRNSVDKSTFVSFSGEREIYLHIAAANGYYDVAAFLLRCNVSPALRDIDLWQPIHAAASWNQPDLIELLCEYGADINAKTGAGESPLELTEDEPTQQVIRTIAQTEARRRRGPGGGYFGVRDSRRQSRKRKKFESPQQPPSTLENPFSARGAIRRQSLRDRSGMSLARLEAQREGSDLIRSYNSKEDLSSNTADDSLNVGSSSYLNNPTASASASSSALHGTPHQQQRRESPPKRALMARSASHQKQKQQMSPDEWLKKLEADSAGFRDNDGEDGELQSELKGGQRMKSGGGGGARGQQEMNGGPTATFGGASKQQLAMGSGPNRRRKQGCCSVL*"

  # Reference translation (EMBOSS output) for the start of the protein:
  # >EMBOSS_001_1
  # MDHHALVEELPEIEKLTPQERIALARERRAEQLRQNAAREAQLPMPAQRRPRLRFTPDVA
  # LLEATCAIDNNERIVRLLLRYGACVNAKDTELWTPLHAAACCAYIDIVRLLIAHNADLLA
  # VNADGNMPYDICDDEQTLDLIESEMAARGITQEMIDERRQQPEREMLNDMKILHQRGLPL
  # DQRNSVDKSTFVSFSGEREIYLHIAAANGYYDVAAFLLRCNVSPALRDIDLWQPIHAA...
end
|
246
|
+
# Placeholder example: it is declared without a block, so no assertions run.
# NOTE(review): the description stops at "into" and looks unfinished; confirm
# the intended wording when the example is implemented.
it "should assemble exons into"
|
247
|
+
# Placeholder example: it is declared without a block, so no assertions run.
# NOTE(review): the description stops at "into" and looks unfinished; the
# commented sequence that follows is presumably its reference data; confirm.
it "should assemble the gene into"
|
248
|
+
# >MhA1_Contig1133:27463..29904
|
249
|
+
# tcacaaaacagagcaacatccttgtttgcgccgtctattgggtccagagcccattgctaa
|
250
|
+
# ttgttgttttgaagctccaccaaatgttgctgttggaccaccattcatttctattaaacg
|
251
|
+
# aaaaataaatataaaattcaaaaaaaatattttaattcaccttgctgacctctcgctcct
|
252
|
+
# ccaccaccaccactcttcattctttgtcctcctttaagttcagattgtaattcaccatct
|
253
|
+
# tctccatcattatctcgaaaacctgcagaatctgcttctaattttttcagccattcatct
|
254
|
+
# ggagacatttgttgtttttgtttttgatgagaagcacttctagccattaatgcacgttta
|
255
|
+
# ggtggagattcacgacgttgttgttgatgtggagttccgtgtaatgctgaagaggaagca
|
256
|
+
# ctagccgaggctgttggattgttgagatatgaagaacttccaacatttaaagaatcatcc
|
257
|
+
# taaaaattaagaaaaaattacaattttaaaaaacaaaccgctgtattagaagaaaggtct
|
258
|
+
# tctttactattataacttctaataaggtcagaaccctctctttgtgcttccaaacgagct
|
259
|
+
# aatgacattccactacgatctcgcaatgattgtcgtctaattgcacctctagctgagaaa
|
260
|
+
# ggattttctaatgttgaaggtggttgttgaggagattcaaacttttttcttctagataaa
|
261
|
+
# aagatgaaaagaaaatataaatttaaaattactttcggctttgtcgtcgagaatcacgaa
|
262
|
+
# caccaaagtaaccaccacctggaccacgccgtctccttgcttctgtctgagcgattgttc
|
263
|
+
# taattacttgttgggttggttcatcttcagttaattcttaatttaaaaaaatatatagac
|
264
|
+
# agaaaatagttaggaaggcgataagttaaaagtatgtcaaaattctagcctttcttaaaa
|
265
|
+
# tagccatgtaaaaataaaaaagcctcccgacgtggtcacatacatattttgagaaaaatt
|
266
|
+
# aaaggtcttaagaaaatgggtgtgaaccgaaaaatcctgatattgagaaatttaattcct
|
267
|
+
# tattgttagttcctctcccctcatacttaatagcaaatatttataaattataagattcaa
|
268
|
+
# taaaataaacctaaagggctttccccagctccagtttttgcatttatatcagccccatat
|
269
|
+
# tcgcataaaagctcgattaagtctggttgattccaagaagcagctgcatgaattggttgc
|
270
|
+
# cacaaatctatatctctcaatgctggagaaacattacaacgaagaaggaaagcagcaaca
|
271
|
+
# tcataataaccattagctgctgctatatgtaactaaaaaggaataaaactgtaataatcc
|
272
|
+
# tttcataataatctttacataaatttccctttctccagaaaaacttacaaaagtagattt
|
273
|
+
# atcaacagaatttctttgatctaaaggtaatcctctttgatgtaaaattttcatatcatt
|
274
|
+
# taacatttccctttctggttgttgtcttctttcatcaatcatttcttgtgtaattcctct
|
275
|
+
# agcagccatttcagattcaataaggtcaagggtttgttcatcatcacaaatatcataagg
|
276
|
+
# catattaccatctgcatttactgctagtaaatctgcgttgcttaaaaagaaatttttaat
|
277
|
+
# atgaaagaagttttcgataaaatatgacgcaaaaataaaaatcgatgacctaaaaactgg
|
278
|
+
# atgtttacccattaaactaaaccacaaaacttacgaacttttagaagacgaattaattat
|
279
|
+
# taaatgaaaatacgtaaagattaaaaatacaaaaaatttaaaaggaagtcccgtttgttt
|
280
|
+
# aaaataaatatttctgtaaaacttaagaaatttatttaaaaaaaaaattaaaaaaaaact
|
281
|
+
# aactgtgcaataagcaatcgaacaatatcaatataagcacaacatgcagctgcgtgcaat
|
282
|
+
# ggtgtccaaagttcagtgtctttggcattaacacaagctccgtacctaagcagaagacga
|
283
|
+
# acaattctttcattattgtcaatggcacactaaaaataaatttaaaaaaattaatttggt
|
284
|
+
# ttttgcctgatgtaaaggtgttaatccatcctcattatgtgaatcagcattgacaccttc
|
285
|
+
# cattaaaagtctttcaactaaaaaataaataaattcatttaaaatcaataatctttatgt
|
286
|
+
# ataaccttcttgggtatcacccctaactgttgcctcaagtaaagcaacatctggtgtaaa
|
287
|
+
# tcgaagacgaggccggcgctgtgcaggcattggcaattgagcctcccgtgcagcattctg
|
288
|
+
# tcgaagttgttcagcacggcgttctctagctaatgcaatacgttcttgaggagttaattt
|
289
|
+
# ttcaatttctggtaattcctccaccaatgcatgatggtccat
|
290
|
+
end
|
291
|
+
|