bio-gff3 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +14 -0
- data/Gemfile.lock +22 -0
- data/LICENSE.txt +20 -0
- data/README +65 -0
- data/README.rdoc +19 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/bin/gff3-fetch +99 -0
- data/bio-gff3.gemspec +101 -0
- data/lib/bio-gff3.rb +0 -0
- data/lib/bio/db/gff/gffassemble.rb +300 -0
- data/lib/bio/db/gff/gffdb.rb +40 -0
- data/lib/bio/db/gff/gfffasta.rb +68 -0
- data/lib/bio/db/gff/gfffileiterator.rb +77 -0
- data/lib/bio/db/gff/gffinmemory.rb +63 -0
- data/lib/bio/db/gff/gffnocache.rb +124 -0
- data/lib/bio/db/gff/gffparser.rb +154 -0
- data/lib/bio/system/lruhash.rb +268 -0
- data/spec/gff3_assemble2_spec.rb +73 -0
- data/spec/gff3_assemble3_spec.rb +62 -0
- data/spec/gff3_assemble_spec.rb +291 -0
- data/spec/gff3_fileiterator_spec.rb +43 -0
- data/spec/gffdb_spec.rb +99 -0
- data/test/data/gff/MhA1_Contig1133.fa +2 -0
- data/test/data/gff/MhA1_Contig1133.gff3 +1862 -0
- data/test/data/gff/MhA1_Contig125.fa +673 -0
- data/test/data/gff/MhA1_Contig125.gff3 +2177 -0
- data/test/data/gff/standard.gff3 +25 -0
- data/test/data/gff/test-cds.gff3 +98 -0
- data/test/data/gff/test-ext-fasta.fa +16 -0
- data/test/data/gff/test-ext-fasta.gff3 +57 -0
- data/test/data/gff/test.gff3 +74 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-gff3.rb +7 -0
- metadata +180 -0
@@ -0,0 +1,268 @@
|
|
1
|
+
# LRU based Hash by Robert Klemme
|
2
|
+
#
|
3
|
+
# Copied from http://github.com/rklemme/muppet-laboratories
|
4
|
+
#
|
5
|
+
# License: unclear, not in repository, need to ask Permission
|
6
|
+
# Added by Pjotr Prins (pjotr.prins@thebird.nl)
|
7
|
+
|
8
|
+
require 'enumerator'
|
9
|
+
|
10
|
+
# Hash with LRU expiry policy. There are at most max_size elements in a
|
11
|
+
# LruHash. When adding more elements old elements are removed according
|
12
|
+
# to LRU policy.
|
13
|
+
class LRUHash
  # Entries live in a plain Hash (key => Node) for lookup, and in a doubly
  # linked list bracketed by two sentinel nodes (@head / @tail) that records
  # usage order: the node right after @head was used most recently, the node
  # right before @tail is the next one to be evicted.
  include Enumerable

  attr_reader :max_size
  attr_accessor :default, :default_proc, :release_proc

  # max_size::      maximum number of entries; must convert via #to_i to an
  #                 Integer greater than zero, otherwise ArgumentError is raised.
  # default_value:: value returned by #[] for a missing key when no block is given.
  # block::         optional default proc, called as block[self, key] by #[]
  #                 for a missing key.
  def initialize(max_size, default_value = nil, &block)
    @max_size = normalize_max(max_size)
    @default = default_value
    @default_proc = block

    @h = {}
    @head = Node.new
    @tail = front(Node.new)
  end

  # Yields each [key, value] pair, most recently used first. Returns an
  # enumerator when no block is given. Iterating does not change the order.
  def each_pair
    return enum_for(:each_pair) unless block_given?

    each_node do |n|
      yield [n.key, n.value]
    end
  end

  alias each each_pair

  # Yields each key, most recently used first. Returns an enumerator when no
  # block is given.
  def each_key
    return enum_for(:each_key) unless block_given?

    each_node do |n|
      yield n.key
    end
  end

  # Yields each value, most recently used first. Returns an enumerator when
  # no block is given.
  def each_value
    return enum_for(:each_value) unless block_given?

    each_node do |n|
      yield n.value
    end
  end

  # Number of entries currently stored.
  def size
    @h.size
  end

  # True when only the two sentinel nodes remain in the list.
  def empty?
    @head.succ.equal? @tail
  end

  # Returns the value stored under +key+ and marks it as most recently used.
  # For a missing key the given block is called with the key; without a
  # block a KeyError is raised.
  def fetch(key, &b)
    n = @h[key]

    if n
      front(n).value
    else
      (b || FETCH)[key]
    end
  end

  # Like #fetch, but a missing key yields default_proc[self, key] when a
  # default proc is set, otherwise the plain default value.
  def [](key)
    fetch(key) do |k|
      @default_proc ? @default_proc[self, k] : default
    end
  end

  # Keys in the order kept by the internal Hash (not in usage order).
  def keys
    @h.keys
  end

  # Values in the order kept by the internal Hash (not in usage order).
  def values
    @h.map { |_key, n| n.value }
  end

  # True when +key+ is stored. Does not touch the usage order.
  def has_key?(key)
    @h.has_key? key
  end

  alias key? has_key?
  alias member? has_key?
  alias include? has_key?

  # True when some stored value is eql? to +value+. Does not touch the
  # usage order.
  def has_value?(value)
    each_pair do |_k, v|
      return true if value.eql? v
    end

    false
  end

  alias value? has_value?

  # Values for all given keys, looked up through #[] (so defaults apply and
  # every hit is marked as recently used).
  def values_at(*key_list)
    key_list.map { |k| self[k] }
  end

  # Returns [key, value] for +key+ and marks the entry as most recently
  # used; nil when the key is not stored.
  def assoc(key)
    n = @h[key]
    return nil unless n

    front(n)
    [n.key, n.value]
  end

  # Returns [key, value] of the first entry (in usage order) whose value is
  # eql? to +value+ and marks it as most recently used; nil when none matches.
  def rassoc(value)
    each_node do |n|
      if value.eql? n.value
        front(n)
        return [n.key, n.value]
      end
    end
    nil
  end

  # Key of the first entry whose value is eql? to +value+ (see #rassoc),
  # or nil.
  def key(value)
    pair = rassoc(value)
    pair && pair.first
  end

  # Stores +value+ under +key+, marks the entry as most recently used and
  # returns +value+. When a new key is added to a full hash the least
  # recently used entry is evicted first (release_proc is invoked for it).
  def store(key, value)
    # same optimization as in Hash
    key = key.dup.freeze if String === key && !key.frozen?

    n = @h[key]

    unless n
      if size == max_size
        # reuse node to optimize memory usage
        n = delete_oldest
        n.key = key
        n.value = value
      else
        n = Node.new key, value
      end

      @h[key] = n
    end

    front(n).value = value
  end

  alias []= store

  # Removes +key+ and returns its value (release_proc is invoked); returns
  # nil when the key is not stored.
  def delete(key)
    n = @h[key]
    n && remove_node(n).value
  end

  # Removes every entry for which the block returns a true value; the block
  # receives key and value. Returns self. Without a block an enumerator is
  # returned, matching #each_pair, #each_key and #each_value.
  def delete_if
    return enum_for(:delete_if) unless block_given?

    each_node do |n|
      remove_node n if yield n.key, n.value
    end
  end

  # Changes the capacity, evicting least recently used entries until the
  # current content fits. Raises ArgumentError for an invalid limit.
  def max_size=(limit)
    limit = normalize_max(limit)

    while size > limit
      delete_oldest
    end

    @max_size = limit
  end

  # Removes all entries, oldest first (release_proc is invoked for each).
  # Returns self.
  def clear
    until empty?
      delete_oldest
    end

    self
  end

  # Renders the content as "{k=>v, k=>v}" in usage order, using #to_s of
  # keys and values. Built without mutating a string literal so it also
  # works when string literals are frozen.
  def to_s
    '{' + map { |k, v| "#{k}=>#{v}" }.join(', ') + '}'
  end

  alias inspect to_s

  # Fallback used by #fetch when the key is missing and no block was given.
  FETCH = Proc.new {|k| raise KeyError, 'key not found'}

  # A single node in the doubly linked LRU list of nodes
  Node = Struct.new :key, :value, :pred, :succ do
    # Detaches this node from its neighbours (joining them to each other)
    # and returns self.
    def unlink
      pred.succ = succ if pred
      succ.pred = pred if succ
      self.succ = self.pred = nil
      self
    end

    # Moves this node so that it directly follows +node+; returns self.
    def insert_after(node)
      raise 'Cannot insert after self' if equal? node
      # already in place
      return self if node.succ.equal? self

      unlink

      self.succ = node.succ
      self.pred = node

      node.succ.pred = self if node.succ
      node.succ = self

      self
    end
  end

  private

  # iterate nodes
  def each_node
    n = @head.succ

    until n.equal? @tail
      # remember the successor first: the block may unlink n
      succ = n.succ
      yield n
      n = succ
    end

    self
  end

  # move node to front
  def front(node)
    node.insert_after(@head)
  end

  # remove the node and invoke release_proc
  # if set
  def remove_node(node)
    n = @h.delete(node.key)
    n.unlink
    release_proc[n.key, n.value] if release_proc
    n
  end

  # remove the oldest node returning the node
  def delete_oldest
    n = @tail.pred
    raise "Cannot delete from empty hash" if @head.equal? n
    remove_node n
  end

  # Normalize the argument in order to be usable as max_size
  # criterion is that n.to_i must be an Integer and it must
  # be larger than zero. The error message shows the argument as it was
  # passed in, not its converted form.
  def normalize_max(n)
    limit = n.to_i
    unless Integer === limit && limit > 0
      raise ArgumentError, format('Invalid max_size: %p', n)
    end
    limit
  end
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# RSpec for BioRuby-GFF3-Plugin. Run with something like:
|
2
|
+
#
|
3
|
+
# ruby -I ../bioruby/lib/ ~/.gems/bin/spec spec/gff3_assemble2_spec.rb
|
4
|
+
#
|
5
|
+
# Copyright (C) 2010 Pjotr Prins <pjotr.prins@thebird.nl>
|
6
|
+
#
|
7
|
+
$: << "../lib"
|
8
|
+
|
9
|
+
require 'bio/db/gff/gffdb'
|
10
|
+
|
11
|
+
include Bio::GFFbrowser
|
12
|
+
|
13
|
+
FASTAFILE2="test/data/gff/MhA1_Contig125.fa"
|
14
|
+
GFF3FILE2="test/data/gff/MhA1_Contig125.gff3"
|
15
|
+
|
16
|
+
PROTEINS = {
|
17
|
+
"cds:MhA1_Contig125.frz3.gene2" =>
|
18
|
+
"MNDLVNQFKSAALAVGQYLTPVLRESKFKETGVLTPEEFVAAGDHLVHLCPTWSWAKASDSNGQTTFLITKQSALVTQRCAQIMGYDEILKEKIIKDESAETGDEQNEWVDTHHFDFETNCAPKDFEEEENKVEDIKENNLNEEENCEEEEEGEPIDLDEYLSSGLLEEEDPARFVLQNKSLKETKDDSTSNNLLRTRRYDLHITYDKYYQVPRFWLVGYDENGSPLAVDKMKEDFSQEHADKTITLESHPHISGLTLATIHPCRHAPVMKRLIEQFQESGKELLVIDYLFVFLKFVQAVIPTVEYDYTRSIHF*",
|
19
|
+
"cds:MhA1_Contig125.frz3.gene3" =>
|
20
|
+
"MERRKVSNTDPFEAAEGMLRWNSDIIKDKEIKQFKGLKKPLKLSENQNDEYDVDPFEAVTDWLPLNKNVDKT*",
|
21
|
+
"cds:MhA1_Contig125.frz3.gene4" =>
|
22
|
+
"MKSTKMSATEIVSYHLYSLHTLSSFCLTENPENIFIKDQNFQDFFLFCERVREQFNEAEELKTPLNTKISQTDSTNIQNKKDEPSISIGPCVNDLCPKGFECIENICFKSMEMPKTERVLSIGPCVNAKCPEGFSCYEDDRQCYAN*",
|
23
|
+
"cds:MhA1_Contig125.frz3.gene5" =>
|
24
|
+
"MRLDIFLVIAFSLGVAVNCGVVKRSQNSYGDEAGAGGAAGAAPAAAAAPAGGEAAAPSGGETGGAAAPAEAAPAAAPAPEAAPAAAPAPEQAAPAVAQPAPAAAPSQASGYRKKRSQNSYGDEAGHAAGAAPAEAAAPAAGGEAAAPSGGETGGAAAPAETAPAAAPEAAPAAAPAPEQAAPAVAQPAPAAAPSQASGYRKKRSQNSYGDEAGAGGAAGAAPAAAAAPAGGEAAAPAAGGCTEGCAAGGESAAPAAAPAPEAAPAAAPAPEQAAPAVAQPAPAAAPSQASGYRKKRSQNSYGDEAGHAAGAAPAESAAPAAGGEAAAPSGGETGGAAAPAETAPAAAPEAAPAAAPEQAAPAVAQPAPAAAPSQASGY*",
|
25
|
+
"cds:MhA1_Contig125.frz3.gene6" =>
|
26
|
+
"MDNAENEEKNDKENHEKPIKFEINNQKQFFKKEDEVKECEEESKNVNEFEEDQGTNEVSEVLRLFRRGNVWAFALQNLDLMRAYVILSCLAIAVVMLANFLRNSRFFDFCLK*",
|
27
|
+
"cds:MhA1_Contig125.frz3.gene7" =>
|
28
|
+
"MLFCLLHPMDHNTGPLARKSSTLCSLLLLSIAALLVLAVPGQANSEEVGFGNHTKEKDGDEVTVNIDSVQAPDDLTYAVYEKRFKDVCEFVITKDDIELLYKGKGCTVELLTGENQDITFKTGVKDIGCVRNDCDKASLYSSVGEVEPGLSQSVTDGKTEFELRISGSEFNMNFEEDAPFNPQKNRCAPKQDHIVKPETWRIKNGELKDKHLLVFHLLPKTATREYTKEGKISKEQPPEEAPKCKLFIRFKRPYYEFLYVGPITTTVTTTTTTTTTPSSGLVGQGPTPKTGTHQGNTPKVQGKGSEKESDNTMMIVIIVIVVVVVVLVIGVVLIFILKNKGSKEDELQKVKQTTTKANKSSAVTL*",
|
29
|
+
"cds:MhA1_Contig125.frz3.gene8" =>
|
30
|
+
"MRQRESVILNKTENQTQIFEKLLNLYNSPKDVVNLRNNPEQLIQLGIDSKQFSAILEMMFGARRRNSLRGDYREARRFRNRREYSAWWDAGEVNNWRINSRHPSRHGTVEYWRCAFAVGRFFTCPSRIRITFGFGDRYVIVANARNHPHNHNRQNNAGDNNPNTVRRALPMEANERLTARTVHIGPRPSTSAPNQPTTKGQAAPPRASVSTTSANAAATPTTSASSTVQKGTAAPSTSAAPSTSAAPSTSAASRPLKPPGFATAATSATNSQQAAAKPASNQQPAPTATTSQTSASAPGTSSKPPPTTSPAPAATPAPATSQPGTSTVKSAPASTPTPLKPPAATEKQTSQPPSAAPGTTATIKPVLVTNIPGLPPGIPTSATGSGTINVSLTALDAFLAGHPRPASTSAPASSQTTPGPASQPSSAPVTQNKGKEEKKEDKKEEKKEGKKEEKKEGKKEEKRGPG*",
|
31
|
+
"cds:MhA1_Contig125.frz3.gene9" =>
|
32
|
+
"MHGATIGNRLRATRRSRDAQMMAAAESVARLSRRHSHQKAIRRVLPPPPALNSSRDSQPINPFCSDPSSIQPVIAKGVCVRSVGVFKSALPPSTPFPSTSTAPNIPSDNTFVPHLNNSTPLHNNHHRTLGGSENCLNYQQQYIGGSYSARSQQQHPPPPAPSSCCISPFKPLEILGNSNGTTDSSSGGCNSARAAMHRQFTGSSNGEEEFTVEELQEFAQAFKMFDKDGNGTMSIKELGVAMRTLGLNPTEDELLNMVNEYDVDGNGIDFSEFCKMMKEMNKETDQELIRLAFRVFDKDGNGYITAQEFRHFMTIDYEEFVNAVAPIVNDGAKEDAPFFEKEQPTSFGQPITSGPPLASGKAKHF*",
|
33
|
+
"cds:MhA1_Contig125.frz3.gene21" =>
|
34
|
+
"MDVKPPPSAPQDIKEAIKESNMSTWRPFLIGNRMRTTSEDSAESFDAYDKSFDAYDVGNKKERRLSITEQFFGSSMPGRLRSNSTTEYEGHEHEPTFKKVDLKQFMKHQRKILGDDEWQ*",
|
35
|
+
"cds:MhA1_Contig125.frz3.gene22" =>
|
36
|
+
"MAKALISGFVSSGFISKSNISICTRSEATAKSWRLQGFTSAYSKDVFYSEVKKPRAIILIAVKPQIFPSFINEVKANEWFYFGVPGILCISIMSGISLQHFDKEMKSVGFDGHSMRLMPNVNCAVSTGTLVLSADPETPQELVTLVSVLSSYVGKCIRVDEAHFNAASSISGCGPAFIALVIEALADGGVVAGLSRELANQLAADTVKGTGHLFMTKMASVSPTSDNPSPAQLKDQVCSPAGTTIEGVRELEKHGVRSAFIEAIQASTRRAFELSQ*" }
|
37
|
+
|
38
|
+
describe GFFdb, "Assemble CDS (Contig125)" do
  # Parse the Contig125 GFF3 file together with its FASTA file once, then
  # index every CDS record list and its containing component by CDS id so
  # the examples below can look them up by name.
  before :all do
    # Variant without caching, kept for reference:
    # gffdb = Bio::GFFbrowser::GFFdb.new(GFF3FILE2, :fasta_filename => FASTAFILE2,:cache_components => :cache_none, :cache_records => :cache_none)
    db = Bio::GFFbrowser::GFFdb.new(GFF3FILE2, :fasta_filename => FASTAFILE2)
    @gff = db.assembler
    @gff.parse
    @contigsequence = @gff.sequencelist["MhA1_Contig125"]
    @cdslist = {}
    @componentlist = {}
    @gff.each_CDS do |cds_id, records, container|
      @cdslist[cds_id] = records
      @componentlist[cds_id] = container
    end
  end

  it "should have the single contig" do
    @gff.sequencelist.size.should == 1
    @gff.sequencelist["MhA1_Contig125"].should_not == nil
    @gff.sequencelist["MhA1_Contig125"].size.should == 53702
  end

  # One example per entry of PROTEINS: the CDS records stored under the id
  # must translate to the expected amino acid sequence.
  PROTEINS.each do |cds_id, expected_protein|
    it "should translate gene #{cds_id}" do
      records = @cdslist[cds_id]
      container = @componentlist[cds_id]
      records[0].seqname.should == 'MhA1_Contig125'
      # To inspect the nucleotide sequence while debugging:
      # ntseq = @gff.assemble(@contigsequence,container.start,records,:codonize=>true)
      # p [cds_id, ntseq]
      translated = @gff.assembleAA(@contigsequence, container.start, records)
      translated.should == expected_protein
    end
  end
end
|
71
|
+
|
72
|
+
|
73
|
+
|
@@ -0,0 +1,62 @@
|
|
1
|
+
# RSpec for BioRuby-GFF3-Plugin. Run with something like:
|
2
|
+
#
|
3
|
+
# ruby -I ../bioruby/lib/ ~/.gems/bin/spec spec/gff3_assemble3_spec.rb
|
4
|
+
#
|
5
|
+
# Copyright (C) 2010 Pjotr Prins <pjotr.prins@thebird.nl>
|
6
|
+
#
|
7
|
+
$: << "../lib"
|
8
|
+
|
9
|
+
require 'bio/db/gff/gffdb'
|
10
|
+
|
11
|
+
include Bio::GFFbrowser
|
12
|
+
|
13
|
+
GFF3FILE3="test/data/gff/test-cds.gff3"
|
14
|
+
|
15
|
+
describe GFFdb, "Assemble CDS (extra checks)" do
|
16
|
+
before :all do
|
17
|
+
gffdb = Bio::GFFbrowser::GFFdb.new(GFF3FILE3)
|
18
|
+
@gff = gffdb.assembler
|
19
|
+
@gff.parse
|
20
|
+
end
|
21
|
+
|
22
|
+
# Assembles the CDS stored under "cds:MhA1_Contig1040.frz3.gene" three ways
# (raw, codonized, translated) and compares each against a fixed sequence.
it "should translate gene MhA1_Contig1040.frz3.gene29" do
  @contigsequence = @gff.sequencelist["MhA1_Contig1040"]
  # Index record lists and containing components by CDS id.
  @componentlist = {}
  @cdslist = {}
  @gff.each_CDS do | id, reclist, component |
    @componentlist[id] = component
    @cdslist[id] = reclist
  end
  # NOTE(review): the example title says gene29 but this id ends in ".gene";
  # confirm against test/data/gff/test-cds.gff3.
  name = "cds:MhA1_Contig1040.frz3.gene"
  recs = @cdslist[name]
  component = @componentlist[name]
  # (leftover debug output `p recs` removed so the spec run stays quiet)
  ntseq = @gff.assemble(@contigsequence,component.start,recs,:raw=>true)
  ntseq.should == "TTAATTAATTTGCCTAGAAAAACAAAGGCATAACATGCTTGCAGTCATCATACGGTAAGAGAGAAACCAACGATATGTTAATAATGTTGATGGGGGAATATCCTCATTAGAATTCTTTTTTGGGTGAATTGAAATTGCCATATTATTAGTATTATTAGAAAATATTAAATTTGTTGATAA"
  ntseq = @gff.assemble(@contigsequence,component.start,recs,:codonize=>true)
  ntseq.should == "TTATCAACAAATTTAATATTTTCTAATAATACTAATAATATGGCAATTTCAATTCACCCAAAAAAGAATTCTAATGAGGATATTCCCCCATCAACATTATTAACATATCGTTGGTTTCTCTCTTACCGTATGATGACTGCAAGCATGTTATGCCTTTGTTTTTCTAGGCAAATTAATTAA"
  aaseq = @gff.assembleAA(@contigsequence,component.start,recs)
  aaseq.should == "LSTNLIFSNNTNNMAISIHPKKNSNEDIPPSTLLTYRWFLSYRMMTASMLCLCFSRQIN*"
end
|
41
|
+
it "should translate gene MhA1_Contig2992.frz3.gene1" do
|
42
|
+
@contigsequence = @gff.sequencelist["MhA1_Contig2992"]
|
43
|
+
@componentlist = {}
|
44
|
+
@cdslist = {}
|
45
|
+
@gff.each_CDS do | id, reclist, component |
|
46
|
+
@componentlist[id] = component
|
47
|
+
@cdslist[id] = reclist
|
48
|
+
end
|
49
|
+
name = "cds:MhA1_Contig2992.frz3.gene1"
|
50
|
+
recs = @cdslist[name]
|
51
|
+
component = @componentlist[name]
|
52
|
+
# ntseq = @gff.assemble(@contigsequence,component.start,recs,:raw=>true)
|
53
|
+
# ntseq.should == ""
|
54
|
+
ntseq = @gff.assemble(@contigsequence,component.start,recs,:codonize=>true)
|
55
|
+
ntseq.should == "AAAATTAATAAAAAAATAAATGATAATTCTTTTAATATTCAATCTGATTCGAATGAAAATTTGTTTAATGATGGAATTAATTCTGAACAAAATGAAGACAATATAGCAACAAAAAAAGGCAACAAAAAATTCGGTAAAAATCAAAAAGAAGGAAATAAAGAGTTGGATATTCAAAGTGAAGGTTTTGATAATAATGAAATACCTTCAAAAGAAAGCAAAAAACAAATAAGTAATTTTGGGGATAATGAAAGTGAATATGAAAAAGAAGAGGATAATAGAAAAAAGAAAGGGAAAAAAGGAATGATAGAAAAGTATGAATTAGGAAGGAATAAAGGAAGGGATAAAAATGAAAGAAATAAGGCTTCTGAAAGGTTTGATGAGCAGAATCAAGACAGAAATAATCAACGTGATAGTTTTGATTCTGGCAATAATGATAAATCACAAAGAGGCTTAGATAGCGGCACATTAGATGGAACAAATAATTTAAAAAGATCGAATGATGATCAATTACCAGAATTTTTGAAAACGGCCAGTCTCTCAGAGCGTCAGAAATTTCTTCAACTTGAAGCAGAAAATGACAGGTCCAAGTCTTCTATACGAAGAGATAAACAGAATTGGGCTGATCAACAAGGGCAGAGAATTTCTGATCTTTATAAACAATTTCAACAATCTTTACAACAAAAAGAAAAACAATTTAAAAGTGAACGTCAACGAAATGTTCAAATTAAATTAAGCAGAAATGCACAGAATGTTGATAAAAGAATTCAGGATCTTCTGAATAATCCTGATATTGCTGAAAGAGCTTTAATTCTTCAAATTGAACAAATCCTCGGCGGTACAGACGATAGTATTCGTCAGGAATTACAAAGACAAATATCTGTTATTGGACCATTAGATGGAAATATACCGCCAAATCTTACATAG"
|
56
|
+
aaseq = @gff.assembleAA(@contigsequence,component.start,recs)
|
57
|
+
aaseq.should == "KINKKINDNSFNIQSDSNENLFNDGINSEQNEDNIATKKGNKKFGKNQKEGNKELDIQSEGFDNNEIPSKESKKQISNFGDNESEYEKEEDNRKKKGKKGMIEKYELGRNKGRDKNERNKASERFDEQNQDRNNQRDSFDSGNNDKSQRGLDSGTLDGTNNLKRSNDDQLPEFLKTASLSERQKFLQLEAENDRSKSSIRRDKQNWADQQGQRISDLYKQFQQSLQQKEKQFKSERQRNVQIKLSRNAQNVDKRIQDLLNNPDIAERALILQIEQILGGTDDSIRQELQRQISVIGPLDGNIPPNLT*"
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
|
62
|
+
|
@@ -0,0 +1,291 @@
|
|
1
|
+
# RSpec for BioRuby-GFF3-Plugin. Run with something like:
|
2
|
+
#
|
3
|
+
# ruby -I ../bioruby/lib/ ~/.gems/bin/spec spec/gff3_assemble_spec.rb
|
4
|
+
#
|
5
|
+
# Copyright (C) 2010 Pjotr Prins <pjotr.prins@thebird.nl>
|
6
|
+
#
|
7
|
+
$: << "../lib"
|
8
|
+
|
9
|
+
require 'bio/db/gff/gffdb'
|
10
|
+
|
11
|
+
include Bio::GFFbrowser
|
12
|
+
|
13
|
+
FASTAFILE="test/data/gff/MhA1_Contig1133.fa"
|
14
|
+
GFF3FILE="test/data/gff/MhA1_Contig1133.gff3"
|
15
|
+
|
16
|
+
describe GFFdb, "Assemble CDS" do
|
17
|
+
before :all do
|
18
|
+
gffdb = Bio::GFFbrowser::GFFdb.new(GFF3FILE, :fasta_filename => FASTAFILE)
|
19
|
+
@gff = gffdb.assembler
|
20
|
+
@gff.parse
|
21
|
+
@contigsequence = @gff.sequencelist["MhA1_Contig1133"]
|
22
|
+
@componentlist = {}
|
23
|
+
@cdslist = {}
|
24
|
+
@gff.each_CDS do | id, reclist, component |
|
25
|
+
@componentlist[id] = component
|
26
|
+
@cdslist[id] = reclist
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
it "should have the single contig" do
|
31
|
+
@gff.sequencelist.size.should == 1
|
32
|
+
@gff.sequencelist["MhA1_Contig1133"].should_not == nil
|
33
|
+
@gff.sequencelist["MhA1_Contig1133"].size.should == 33905
|
34
|
+
end
|
35
|
+
it "should have a container component" do
|
36
|
+
component = @componentlist['cds:MhA1_Contig1133.frz3.gene4']
|
37
|
+
component.start.should == 7838
|
38
|
+
component.end.should == 8740
|
39
|
+
end
|
40
|
+
it "should have CDS 7838:7980" do
|
41
|
+
recs = @cdslist['cds:MhA1_Contig1133.frz3.gene4']
|
42
|
+
cds0 = recs[0]
|
43
|
+
cds0.start.should == 7838
|
44
|
+
cds0.end.should == 7980
|
45
|
+
cds0.frame.should == 0
|
46
|
+
cds0.seqname.should == 'MhA1_Contig1133'
|
47
|
+
end
|
48
|
+
it "should have CDS 8065:8308" do
|
49
|
+
recs = @cdslist['cds:MhA1_Contig1133.frz3.gene4']
|
50
|
+
cds1 = recs[1]
|
51
|
+
cds1.start.should == 8065
|
52
|
+
cds1.end.should == 8308
|
53
|
+
cds1.frame.should == 1
|
54
|
+
cds1.strand.should == '+'
|
55
|
+
cds1.seqname.should == 'MhA1_Contig1133'
|
56
|
+
end
|
57
|
+
# From Wormbase website http://www.wormbase.org/db/gb2/gbrowse/m_hapla/?name=MhA1_Contig1133%3A7838..8740
|
58
|
+
# >MhA1_Contig1133:7838..8740
|
59
|
+
# atgcgtcctttaacagatgaagaaactgaaaagtttttcaaaaaactttcaaattatatt
|
60
|
+
# ggtgacaatattaaacttttattggaaagagaagatggagaatatgtttttcgtttacat
|
61
|
+
# aaagacagagtttattattgcaggtttttttaaaattattttatatttaaattaggtctc
|
62
|
+
# aatctttataggggattttgtttttgttatttttttttggtttttag>tgaaaaattaatg
|
63
|
+
# cgacaagcagcatgtattggacgtaaacaattgggatcttttggaacttgtttgggtaaa
|
64
|
+
# ttcacaaaaggagggtctttctttcttcatataacatcattggattatttggcaccttat
|
65
|
+
# gctttagcaaaaatttggttaaaaccacaagctgaacaacaatttttatatggaaataat
|
66
|
+
# attgttaaatctggtgttggaagaatgagtgaagggattgaagaaaaacaagtaaatatt
|
67
|
+
# taattattttttttaaaatggattcctttacttctcaattaaatattaaaagcatatctg
|
68
|
+
# tagaagaggttatttatctttaaatcgaaatatacaggaataaataaaaatttaagaaat
|
69
|
+
# cataatttagaattctttttctggttatgttagattatttttaaatttttttgtaatttt
|
70
|
+
# tttttcgtaatttttttatgagcaaatcccttctctcttaaatattttaataaaaatcta
|
71
|
+
# attttataaattataattattttttagggtattattatttataatatgtcagatttacca
|
72
|
+
# ttgggttttggagtggctgcaaagggaacattatcttgtagaaaagtagatcctacagct
|
73
|
+
# ttagttgttttacatcaatcagatttgggtgaatatattcgaaatgaagagggattaatt
|
74
|
+
|
75
|
+
it "should translate CDS 7838:7980 (in frame 0, + strand)" do
|
76
|
+
recs = @cdslist['cds:MhA1_Contig1133.frz3.gene4']
|
77
|
+
component = @componentlist['cds:MhA1_Contig1133.frz3.gene4']
|
78
|
+
cds0 = recs[0]
|
79
|
+
cds0.seqname.should == 'MhA1_Contig1133'
|
80
|
+
seq = @gff.assemble(@contigsequence,component.start,[cds0])
|
81
|
+
seq.size.should == 143
|
82
|
+
seq.should == "ATGCGTCCTTTAACAGATGAAGAAACTGAAAAGTTTTTCAAAAAACTTTCAAATTATATTGGTGACAATATTAAACTTTTATTGGAAAGAGAAGATGGAGAATATGTTTTTCGTTTACATAAAGACAGAGTTTATTATTGCAG"
|
83
|
+
aaseq = @gff.assembleAA(@contigsequence,component.start,[cds0])
|
84
|
+
aaseq.should == "MRPLTDEETEKFFKKLSNYIGDNIKLLLEREDGEYVFRLHKDRVYYC"
|
85
|
+
end
|
86
|
+
it "should translate CDS 8065:8308 (in frame 1, + strand)" do
|
87
|
+
recs = @cdslist['cds:MhA1_Contig1133.frz3.gene4']
|
88
|
+
component = @componentlist['cds:MhA1_Contig1133.frz3.gene4']
|
89
|
+
cds1 = recs[1]
|
90
|
+
seq = @gff.assemble(@contigsequence,component.start,[cds1], :phase => false)
|
91
|
+
seq.size.should == 244
|
92
|
+
seq.should == "TGAAAAATTAATGCGACAAGCAGCATGTATTGGACGTAAACAATTGGGATCTTTTGGAACTTGTTTGGGTAAATTCACAAAAGGAGGGTCTTTCTTTCTTCATATAACATCATTGGATTATTTGGCACCTTATGCTTTAGCAAAAATTTGGTTAAAACCACAAGCTGAACAACAATTTTTATATGGAAATAATATTGTTAAATCTGGTGTTGGAAGAATGAGTGAAGGGATTGAAGAAAAACAA"
|
93
|
+
seq = @gff.assemble(@contigsequence,component.start,[cds1])
|
94
|
+
seq.should == "GAAAAATTAATGCGACAAGCAGCATGTATTGGACGTAAACAATTGGGATCTTTTGGAACTTGTTTGGGTAAATTCACAAAAGGAGGGTCTTTCTTTCTTCATATAACATCATTGGATTATTTGGCACCTTATGCTTTAGCAAAAATTTGGTTAAAACCACAAGCTGAACAACAATTTTTATATGGAAATAATATTGTTAAATCTGGTGTTGGAAGAATGAGTGAAGGGATTGAAGAAAAACAA"
|
95
|
+
aaseq = @gff.assembleAA(@contigsequence,component.start,[cds1])
|
96
|
+
# note it should handle the frame shift and direction!
|
97
|
+
aaseq.should == "EKLMRQAACIGRKQLGSFGTCLGKFTKGGSFFLHITSLDYLAPYALAKIWLKPQAEQQFLYGNNIVKSGVGRMSEGIEEKQ"
|
98
|
+
end
|
99
|
+
it "should translate CDS3 (in frame 0, + strand)" do
|
100
|
+
recs = @cdslist['cds:MhA1_Contig1133.frz3.gene4']
|
101
|
+
component = @componentlist['cds:MhA1_Contig1133.frz3.gene4']
|
102
|
+
cds3 = recs[2]
|
103
|
+
seq = @gff.assemble(@contigsequence,component.start,[cds3], :phase => false)
|
104
|
+
seq.size.should == 156
|
105
|
+
seq.should == "GGTATTATTATTTATAATATGTCAGATTTACCATTGGGTTTTGGAGTGGCTGCAAAGGGAACATTATCTTGTAGAAAAGTAGATCCTACAGCTTTAGTTGTTTTACATCAATCAGATTTGGGTGAATATATTCGAAATGAAGAGGGATTAATTTAA"
|
106
|
+
aaseq = @gff.assembleAA(@contigsequence,component.start,[cds3])
|
107
|
+
# note it should handle the frame shift and direction!
|
108
|
+
aaseq.should == "GIIIYNMSDLPLGFGVAAKGTLSCRKVDPTALVVLHQSDLGEYIRNEEGLI*"
|
109
|
+
end
|
110
|
+
it "should assemble 3 CDSs for MhA1_Contig1133.frz3.gene4" do
|
111
|
+
recs = @cdslist['cds:MhA1_Contig1133.frz3.gene4']
|
112
|
+
component = @componentlist['cds:MhA1_Contig1133.frz3.gene4']
|
113
|
+
seq = @gff.assemble(@contigsequence,component.start,recs, :phase=>false)
|
114
|
+
seq.size.should == 543
|
115
|
+
seq.should == "ATGCGTCCTTTAACAGATGAAGAAACTGAAAAGTTTTTCAAAAAACTTTCAAATTATATTGGTGACAATATTAAACTTTTATTGGAAAGAGAAGATGGAGAATATGTTTTTCGTTTACATAAAGACAGAGTTTATTATTGCAGTGAAAAATTAATGCGACAAGCAGCATGTATTGGACGTAAACAATTGGGATCTTTTGGAACTTGTTTGGGTAAATTCACAAAAGGAGGGTCTTTCTTTCTTCATATAACATCATTGGATTATTTGGCACCTTATGCTTTAGCAAAAATTTGGTTAAAACCACAAGCTGAACAACAATTTTTATATGGAAATAATATTGTTAAATCTGGTGTTGGAAGAATGAGTGAAGGGATTGAAGAAAAACAAGGTATTATTATTTATAATATGTCAGATTTACCATTGGGTTTTGGAGTGGCTGCAAAGGGAACATTATCTTGTAGAAAAGTAGATCCTACAGCTTTAGTTGTTTTACATCAATCAGATTTGGGTGAATATATTCGAAATGAAGAGGGATTAATTTAA"
|
116
|
+
seq = @gff.assemble(@contigsequence,component.start,recs)
|
117
|
+
seq.size.should == 543
|
118
|
+
seq.should == "ATGCGTCCTTTAACAGATGAAGAAACTGAAAAGTTTTTCAAAAAACTTTCAAATTATATTGGTGACAATATTAAACTTTTATTGGAAAGAGAAGATGGAGAATATGTTTTTCGTTTACATAAAGACAGAGTTTATTATTGCAGTGAAAAATTAATGCGACAAGCAGCATGTATTGGACGTAAACAATTGGGATCTTTTGGAACTTGTTTGGGTAAATTCACAAAAGGAGGGTCTTTCTTTCTTCATATAACATCATTGGATTATTTGGCACCTTATGCTTTAGCAAAAATTTGGTTAAAACCACAAGCTGAACAACAATTTTTATATGGAAATAATATTGTTAAATCTGGTGTTGGAAGAATGAGTGAAGGGATTGAAGAAAAACAAGGTATTATTATTTATAATATGTCAGATTTACCATTGGGTTTTGGAGTGGCTGCAAAGGGAACATTATCTTGTAGAAAAGTAGATCCTACAGCTTTAGTTGTTTTACATCAATCAGATTTGGGTGAATATATTCGAAATGAAGAGGGATTAATTTAA"
|
119
|
+
aaseq = @gff.assembleAA(@contigsequence,component.start,recs)
|
120
|
+
aaseq.should == "MRPLTDEETEKFFKKLSNYIGDNIKLLLEREDGEYVFRLHKDRVYYCSEKLMRQAACIGRKQLGSFGTCLGKFTKGGSFFLHITSLDYLAPYALAKIWLKPQAEQQFLYGNNIVKSGVGRMSEGIEEKQGIIIYNMSDLPLGFGVAAKGTLSCRKVDPTALVVLHQSDLGEYIRNEEGLI*"
|
121
|
+
end
|
122
|
+
# > class=Sequence position=MhA1_Contig1133:27463..29904 (- strand); shown in frame 1!
|
123
|
+
# ATGGACCATC ATGCATTGGT GGAGGAATTA CCAGAAATTG AAAAATTAAC TCCTCAAGAA CGTATTGCAT TAGCTAGAGA
|
124
|
+
# ACGCCGTGCT GAACAACTTC GACAGAATGC TGCACGGGAG GCTCAATTGC CAATGCCTGC ACAGCGCCGG CCTCGTCTTC
|
125
|
+
# GATTTACACC AGATGTTGCT TTACTTGAGG CAACAGTTAG GGGTGATACC CAAGAAGGTT ATACATAAAG ATTATTGATT
|
126
|
+
# TTAAATGAAT TTATTTATTT TTTAGTTGAA AGACTTTTAA TGGAAGGTGT CAATGCTGAT TCACATAATG AGGATGGATT
|
127
|
+
# AACACCTTTA CATCAGGCAA AAACCAAATT AATTTTTTTA AATTTATTTT TAGTGTGCCA TTGACAATAA TGAAAGAATT
|
128
|
+
# GTTCGTCTTC TGCTTAGGTA CGGAGCTTGT GTTAATGCCA AAGACACTGA ACTTTGGACA CCATTGCACG CAGCTGCATG
|
129
|
+
# TTGTGCTTAT ATTGATATTG TTCGATTGCT TATTGCACAG TTAGTTTTTT TTTAATTTTT TTTTTAAATA AATTTCTTAA
|
130
|
+
# GTTTTACAGA AATATTTATT TTAAACAAAC GGGACTTCCT TTTAAATTTT TTGTATTTTT AATCTTTACG TATTTTCATT
|
131
|
+
# TAATAATTAA TTCGTCTTCT AAAAGTTCGT AAGTTTTGTG GTTTAGTTTA ATGGGTAAAC ATCCAGTTTT TAGGTCATCG
|
132
|
+
# ATTTTTATTT TTGCGTCATA TTTTATCGAA AACTTCTTTC ATATTAAAAA TTTCTTTTTA AGCAACGCAG ATTTACTAGC
|
133
|
+
# AGTAAATGCA GATGGTAATA TGCCTTATGA TATTTGTGAT GATGAACAAA CCCTTGACCT TATTGAATCT GAAATGGCTG
|
134
|
+
# CTAGAGGAAT TACACAAGAA ATGATTGATG AAAGAAGACA ACAACCAGAA AGGGAAATGT TAAATGATAT GAAAATTTTA
|
135
|
+
# CATCAAAGAG GATTACCTTT AGATCAAAGA AATTCTGTTG ATAAATCTAC TTTTGTAAGT TTTTCTGGAG AAAGGGAAAT
|
136
|
+
# TTATGTAAAG ATTATTATGA AAGGATTATT ACAGTTTTAT TCCTTTTTAG TTACATATAG CAGCAGCTAA TGGTTATTAT
|
137
|
+
# GATGTTGCTG CTTTCCTTCT TCGTTGTAAT GTTTCTCCAG CATTGAGAGA TATAGATTTG TGGCAACCAA TTCATGCAGC
|
138
|
+
# TGCTTCTTGG AATCAACCAG ACTTAATCGA GCTTTTATGC GAATATGGGG CTGATATAAA TGCAAAAACT GGAGCTGGGG
|
139
|
+
# AAAGCCCTTT AGGTTTATTT TATTGAATCT TATAATTTAT AAATATTTGC TATTAAGTAT GAGGGGAGAG GAACTAACAA
|
140
|
+
# TAAGGAATTA AATTTCTCAA TATCAGGATT TTTCGGTTCA CACCCATTTT CTTAAGACCT TTAATTTTTC TCAAAATATG
|
141
|
+
# TATGTGACCA CGTCGGGAGG CTTTTTTATT TTTACATGGC TATTTTAAGA AAGGCTAGAA TTTTGACATA CTTTTAACTT
|
142
|
+
# ATCGCCTTCC TAACTATTTT CTGTCTATAT ATTTTTTTAA ATTAAGAATT AACTGAAGAT GAACCAACCC AACAAGTAAT
|
143
|
+
# TAGAACAATC GCTCAGACAG AAGCAAGGAG ACGGCGTGGT CCAGGTGGTG GTTACTTTGG TGTTCGTGAT TCTCGACGAC
|
144
|
+
# AAAGCCGAAA GTAATTTTAA ATTTATATTT TCTTTTCATC TTTTTATCTA GAAGAAAAAA GTTTGAATCT CCTCAACAAC
|
145
|
+
# CACCTTCAAC ATTAGAAAAT CCTTTCTCAG CTAGAGGTGC AATTAGACGA CAATCATTGC GAGATCGTAG TGGAATGTCA
|
146
|
+
# TTAGCTCGTT TGGAAGCACA AAGAGAGGGT TCTGACCTTA TTAGAAGTTA TAATAGTAAA GAAGACCTTT CTTCTAATAC
|
147
|
+
# AGCGGTTTGT TTTTTAAAAT TGTAATTTTT TCTTAATTTT TAGGATGATT CTTTAAATGT TGGAAGTTCT TCATATCTCA
|
148
|
+
# ACAATCCAAC AGCCTCGGCT AGTGCTTCCT CTTCAGCATT ACACGGAACT CCACATCAAC AACAACGTCG TGAATCTCCA
|
149
|
+
# CCTAAACGTG CATTAATGGC TAGAAGTGCT TCTCATCAAA AACAAAAACA ACAAATGTCT CCAGATGAAT GGCTGAAAAA
|
150
|
+
# ATTAGAAGCA GATTCTGCAG GTTTTCGAGA TAATGATGGA GAAGATGGTG AATTACAATC TGAACTTAAA GGAGGACAAA
|
151
|
+
# GAATGAAGAG TGGTGGTGGT GGAGGAGCGA GAGGTCAGCA AGGTGAATTA AAATATTTTT TTTGAATTTT ATATTTATTT
|
152
|
+
# TTCGTTTAAT AGAAATGAAT GGTGGTCCAA CAGCAACATT TGGTGGAGCT TCAAAACAAC AATTAGCAAT GGGCTCTGGA
|
153
|
+
# CCCAATAGAC GGCGCAAACA AGGATGTTGC TCTGTTTTGT GA
|
154
|
+
it "should assemble a reverse CDS in MhA1_Contig1133.frz3.gene11" do
|
155
|
+
recs = @cdslist['cds:MhA1_Contig1133.frz3.gene11']
|
156
|
+
recs.size.should == 8
|
157
|
+
component = @componentlist['cds:MhA1_Contig1133.frz3.gene11']
|
158
|
+
# 193 bp from MhA1_Contig1133:27,981..28,173
|
159
|
+
# >MhA1_Contig1133:27981..28173
|
160
|
+
# cgctgtattagaagaaaggtcttctttactattataacttctaataaggtcagaaccctc
|
161
|
+
# tctttgtgcttccaaacgagctaatgacattccactacgatctcgcaatgattgtcgtct
|
162
|
+
# aattgcacctctagctgagaaaggattttctaatgttgaaggtggttgttgaggagattc
|
163
|
+
# aaacttttttctt
|
164
|
+
cds1 = recs[5]
|
165
|
+
cds1.start.should == 27981
|
166
|
+
cds1.frame.should == 1
|
167
|
+
cds1.strand.should == '-'
|
168
|
+
seq = @gff.assemble(@contigsequence,component.start,[cds1],:phase=>true,:reverse=>true)
|
169
|
+
seq.should == "TCTTTTTTCAAACTTAGAGGAGTTGTTGGTGGAAGTTGTAATCTTTTAGGAAAGAGTCGATCTCCACGTTAATCTGCTGTTAGTAACGCTCTAGCATCACCTTACAGTAATCGAGCAAACCTTCGTGTTTCTCTCCCAAGACTGGAATAATCTTCAATATTATCATTTCTTCTGGAAAGAAGATTATGTCGC"
|
170
|
+
seq.size.should == 192
|
171
|
+
seq = @gff.assemble(@contigsequence,component.start,[cds1],:phase=>true,:reverse=>true,:complement=>true)
|
172
|
+
seq.should == "AGAAAAAAGTTTGAATCTCCTCAACAACCACCTTCAACATTAGAAAATCCTTTCTCAGCTAGAGGTGCAATTAGACGACAATCATTGCGAGATCGTAGTGGAATGTCATTAGCTCGTTTGGAAGCACAAAGAGAGGGTTCTGACCTTATTAGAAGTTATAATAGTAAAGAAGACCTTTCTTCTAATACAGCG"
|
173
|
+
seq.size.should == 192
|
174
|
+
aaseq = @gff.assembleAA(@contigsequence,component.start,[cds1])
|
175
|
+
# note it should handle the frame shift and direction!
|
176
|
+
# >EMBOSS_001_4
|
177
|
+
# RKKFESPQQPPSTLENPFSARGAIRRQSLRDRSGMSLARLEAQREGSDLIRSYNSKEDLSSNTA
|
178
|
+
aaseq.should == "RKKFESPQQPPSTLENPFSARGAIRRQSLRDRSGMSLARLEAQREGSDLIRSYNSKEDLSSNTA"
|
179
|
+
end
|
180
|
+
it "should take the 6th CDS in MhA1_Contig1133.frz3.gene11 (which is 3rd on DNA)" do
|
181
|
+
# >MhA1_Contig1133:27981..28173
|
182
|
+
# cgctgtattagaagaaaggtcttctttactattataacttctaataaggtcagaaccctc
|
183
|
+
# tctttgtgcttccaaacgagctaatgacattccactacgatctcgcaatgattgtcgtct
|
184
|
+
# aattgcacctctagctgagaaaggattttctaatgttgaaggtggttgttgaggagattc
|
185
|
+
# aaacttttttctt
|
186
|
+
recs = @cdslist['cds:MhA1_Contig1133.frz3.gene11']
|
187
|
+
component = @componentlist['cds:MhA1_Contig1133.frz3.gene11']
|
188
|
+
cds2 = recs[5].clone
|
189
|
+
# p cds2
|
190
|
+
cds2.start.should == 27981
|
191
|
+
cds2.frame.should == 1
|
192
|
+
cds2.strand.should == '-'
|
193
|
+
seq = @gff.assemble(@contigsequence,component.start,[cds2],:complement=>true)
|
194
|
+
seq.should == "GCGACATAATCTTCTTTCCAGAAGAAATGATAATATTGAAGATTATTCCAGTCTTGGGAGAGAAACACGAAGGTTTGCTCGATTACTGTAAGGTGATGCTAGAGCGTTACTAACAGCAGATTAACGTGGAGATCGACTCTTTCCTAAAAGATTACAACTTCCACCAACAACTCCTCTAAGTTTGAAAAAAGAA"
|
195
|
+
aaseq = @gff.assembleAA(@contigsequence,component.start,[cds2])
|
196
|
+
# note it should handle the frame shift and direction!
|
197
|
+
# >27981..28173_4 RKKFESPQQPPSTLENPFSARGAIRRQSLRDRSGMSLARLEAQREGSDLIRSYNSKEDLSSNTA
|
198
|
+
aaseq.should == "RKKFESPQQPPSTLENPFSARGAIRRQSLRDRSGMSLARLEAQREGSDLIRSYNSKEDLSSNTA"
|
199
|
+
end
|
200
|
+
it "should assemble the 1st reverse CDS in MhA1_Contig1133.frz3.gene11" do
  # Look up the CDS records and their parent component under the same key.
  gene_key   = 'cds:MhA1_Contig1133.frz3.gene11'
  cds_list   = @cdslist[gene_key]
  parent     = @componentlist[gene_key]
  # Work on a copy of the first CDS record in the list.
  first_cds  = cds_list[0].clone

  # Sanity-check the fixture record: frame 0 on the reverse strand.
  first_cds.start.should == 29710
  first_cds.frame.should == 0
  first_cds.strand.should == '-'

  # Expected results for this single CDS.
  expected_raw     = "TGTTGCCTCAAGTAAAGCAACATCTGGTGTAAATCGAAGACGAGGCCGGCGCTGTGCAGGCATTGGCAATTGAGCCTCCCGTGCAGCATTCTGTCGAAGTTGTTCAGCACGGCGTTCTCTAGCTAATGCAATACGTTCTTGAGGAGTTAATTTTTCAATTTCTGGTAATTCCTCCACCAATGCATGATGGTCCAT"
  expected_codons  = "ATGGACCATCATGCATTGGTGGAGGAATTACCAGAAATTGAAAAATTAACTCCTCAAGAACGTATTGCATTAGCTAGAGAACGCCGTGCTGAACAACTTCGACAGAATGCTGCACGGGAGGCTCAATTGCCAATGCCTGCACAGCGCCGGCCTCGTCTTCGATTTACACCAGATGTTGCTTTACTTGAGGCAACA"
  expected_protein = "MDHHALVEELPEIEKLTPQERIALARERRAEQLRQNAAREAQLPMPAQRRPRLRFTPDVALLEAT"

  # :raw assembly of the single CDS: 195 bases.
  raw_seq = @gff.assemble(@contigsequence, parent.start, [first_cds], :raw => true)
  raw_seq.size.should == 195
  raw_seq.should == expected_raw

  # :codonize assembly of the same record.
  codon_seq = @gff.assemble(@contigsequence, parent.start, [first_cds], :codonize => true)
  codon_seq.should == expected_codons

  # Amino acid translation of the same record.
  protein = @gff.assembleAA(@contigsequence, parent.start, [first_cds])
  protein.should == expected_protein
end
|
215
|
+
it "should assemble the 3rd reverse CDS in MhA1_Contig1133.frz3.gene11" do
  # Look up the CDS records and their parent component under the same key.
  gene_key  = 'cds:MhA1_Contig1133.frz3.gene11'
  cds_list  = @cdslist[gene_key]
  parent    = @componentlist[gene_key]
  # Work on a copy of the third CDS record in the list.
  third_cds = cds_list[2].clone

  # Reference sequence for this CDS (note the :codonize expectation below
  # starts one base further in, at "AACGCAG"):
  # CAACGCAG ATTTACTAGC AGTAAATGCA GATGGTAATA TGCCTTATGA TATTTGTGAT GATGAACAAA CCCTTGACCT TATTGAATCT GAAATGGCTG CTAGAGGAAT TACACAAGAA ATGATTGATG AAAGAAGACA ACAACCAGAA AGGGAAATGT TAAATGATAT GAAAATTTTA CATCAAAGAG GATTACCTTT AGATCAAAGA AATTCTGTTG ATAAATCTAC TTTTGTAAGT TTTTCTGGAG AAAGGGAAAT TTAT

  # Sanity-check the fixture record: frame 1 on the reverse strand.
  third_cds.frame.should == 1
  third_cds.strand.should == '-'

  # Expected results for this single CDS.
  expected_raw     = "ATAAATTTCCCTTTCTCCAGAAAAACTTACAAAAGTAGATTTATCAACAGAATTTCTTTGATCTAAAGGTAATCCTCTTTGATGTAAAATTTTCATATCATTTAACATTTCCCTTTCTGGTTGTTGTCTTCTTTCATCAATCATTTCTTGTGTAATTCCTCTAGCAGCCATTTCAGATTCAATAAGGTCAAGGGTTTGTTCATCATCACAAATATCATAAGGCATATTACCATCTGCATTTACTGCTAGTAAATCTGCGTTG"
  expected_codons  = "AACGCAGATTTACTAGCAGTAAATGCAGATGGTAATATGCCTTATGATATTTGTGATGATGAACAAACCCTTGACCTTATTGAATCTGAAATGGCTGCTAGAGGAATTACACAAGAAATGATTGATGAAAGAAGACAACAACCAGAAAGGGAAATGTTAAATGATATGAAAATTTTACATCAAAGAGGATTACCTTTAGATCAAAGAAATTCTGTTGATAAATCTACTTTTGTAAGTTTTTCTGGAGAAAGGGAAATTTAT"
  expected_protein = "NADLLAVNADGNMPYDICDDEQTLDLIESEMAARGITQEMIDERRQQPEREMLNDMKILHQRGLPLDQRNSVDKSTFVSFSGEREIY"

  # :raw assembly of the single CDS.
  raw_seq = @gff.assemble(@contigsequence, parent.start, [third_cds], :raw => true)
  raw_seq.should == expected_raw

  # :codonize assembly of the same record.
  codon_seq = @gff.assemble(@contigsequence, parent.start, [third_cds], :codonize => true)
  codon_seq.should == expected_codons

  # The translation is expected to cope with both the frame shift and the
  # reverse direction of this record.
  protein = @gff.assembleAA(@contigsequence, parent.start, [third_cds])
  protein.should == expected_protein
end
|
232
|
+
it "should assemble the protein sequence for MhA1_Contig1133.frz3.gene11" do
  # Look up every CDS record of the gene plus the parent component.
  gene_key = 'cds:MhA1_Contig1133.frz3.gene11'
  cds_list = @cdslist[gene_key]
  parent   = @componentlist[gene_key]

  # Expected nucleotide and amino acid sequences for the complete gene.
  expected_dna     = "ATGGACCATCATGCATTGGTGGAGGAATTACCAGAAATTGAAAAATTAACTCCTCAAGAACGTATTGCATTAGCTAGAGAACGCCGTGCTGAACAACTTCGACAGAATGCTGCACGGGAGGCTCAATTGCCAATGCCTGCACAGCGCCGGCCTCGTCTTCGATTTACACCAGATGTTGCTTTACTTGAGGCAACATGTGCCATTGACAATAATGAAAGAATTGTTCGTCTTCTGCTTAGGTACGGAGCTTGTGTTAATGCCAAAGACACTGAACTTTGGACACCATTGCACGCAGCTGCATGTTGTGCTTATATTGATATTGTTCGATTGCTTATTGCACACAACGCAGATTTACTAGCAGTAAATGCAGATGGTAATATGCCTTATGATATTTGTGATGATGAACAAACCCTTGACCTTATTGAATCTGAAATGGCTGCTAGAGGAATTACACAAGAAATGATTGATGAAAGAAGACAACAACCAGAAAGGGAAATGTTAAATGATATGAAAATTTTACATCAAAGAGGATTACCTTTAGATCAAAGAAATTCTGTTGATAAATCTACTTTTGTAAGTTTTTCTGGAGAAAGGGAAATTTATTTACATATAGCAGCAGCTAATGGTTATTATGATGTTGCTGCTTTCCTTCTTCGTTGTAATGTTTCTCCAGCATTGAGAGATATAGATTTGTGGCAACCAATTCATGCAGCTGCTTCTTGGAATCAACCAGACTTAATCGAGCTTTTATGCGAATATGGGGCTGATATAAATGCAAAAACTGGAGCTGGGGAAAGCCCTTTAGAATTAACTGAAGATGAACCAACCCAACAAGTAATTAGAACAATCGCTCAGACAGAAGCAAGGAGACGGCGTGGTCCAGGTGGTGGTTACTTTGGTGTTCGTGATTCTCGACGACAAAGCCGAAAAAGAAAAAAGTTTGAATCTCCTCAACAACCACCTTCAACATTAGAAAATCCTTTCTCAGCTAGAGGTGCAATTAGACGACAATCATTGCGAGATCGTAGTGGAATGTCATTAGCTCGTTTGGAAGCACAAAGAGAGGGTTCTGACCTTATTAGAAGTTATAATAGTAAAGAAGACCTTTCTTCTAATACAGCGGATGATTCTTTAAATGTTGGAAGTTCTTCATATCTCAACAATCCAACAGCCTCGGCTAGTGCTTCCTCTTCAGCATTACACGGAACTCCACATCAACAACAACGTCGTGAATCTCCACCTAAACGTGCATTAATGGCTAGAAGTGCTTCTCATCAAAAACAAAAACAACAAATGTCTCCAGATGAATGGCTGAAAAAATTAGAAGCAGATTCTGCAGGTTTTCGAGATAATGATGGAGAAGATGGTGAATTACAATCTGAACTTAAAGGAGGACAAAGAATGAAGAGTGGTGGTGGTGGAGGAGCGAGAGGTCAGCAAGAAATGAATGGTGGTCCAACAGCAACATTTGGTGGAGCTTCAAAACAACAATTAGCAATGGGCTCTGGACCCAATAGACGGCGCAAACAAGGATGTTGCTCTGTTTTGTGA"
  expected_protein = "MDHHALVEELPEIEKLTPQERIALARERRAEQLRQNAAREAQLPMPAQRRPRLRFTPDVALLEATCAIDNNERIVRLLLRYGACVNAKDTELWTPLHAAACCAYIDIVRLLIAHNADLLAVNADGNMPYDICDDEQTLDLIESEMAARGITQEMIDERRQQPEREMLNDMKILHQRGLPLDQRNSVDKSTFVSFSGEREIYLHIAAANGYYDVAAFLLRCNVSPALRDIDLWQPIHAAASWNQPDLIELLCEYGADINAKTGAGESPLELTEDEPTQQVIRTIAQTEARRRRGPGGGYFGVRDSRRQSRKRKKFESPQQPPSTLENPFSARGAIRRQSLRDRSGMSLARLEAQREGSDLIRSYNSKEDLSSNTADDSLNVGSSSYLNNPTASASASSSALHGTPHQQQRRESPPKRALMARSASHQKQKQQMSPDEWLKKLEADSAGFRDNDGEDGELQSELKGGQRMKSGGGGGARGQQEMNGGPTATFGGASKQQLAMGSGPNRRRKQGCCSVL*"

  # Assemble all CDS records with the :phase, :reverse and :complement options.
  dna = @gff.assemble(@contigsequence, parent.start, cds_list,
                      :phase => true, :reverse => true, :complement => true)
  dna.should == expected_dna

  # Translate the full record list; the expected protein ends in '*'.
  protein = @gff.assembleAA(@contigsequence, parent.start, cds_list)
  protein.should == expected_protein

  # Reference translation (EMBOSS output, truncated):
  # >EMBOSS_001_1
  # MDHHALVEELPEIEKLTPQERIALARERRAEQLRQNAAREAQLPMPAQRRPRLRFTPDVA
  # LLEATCAIDNNERIVRLLLRYGACVNAKDTELWTPLHAAACCAYIDIVRLLIAHNADLLA
  # VNADGNMPYDICDDEQTLDLIESEMAARGITQEMIDERRQQPEREMLNDMKILHQRGLPL
  # DQRNSVDKSTFVSFSGEREIYLHIAAANGYYDVAAFLLRCNVSPALRDIDLWQPIHAA...
end
|
246
|
+
# Placeholder example: `it` is called without a block, so no assertions run
# here (RSpec reports block-less examples as pending).
# TODO: the description is truncated ("into" what?) - complete it when the
# example is implemented.
it "should assemble exons into"
|
247
|
+
# Placeholder example: `it` is called without a block, so no assertions run
# here (RSpec reports block-less examples as pending).
# TODO: the description is truncated ("into" what?) - complete it when the
# example is implemented.
it "should assemble the gene into"
|
248
|
+
# >MhA1_Contig1133:27463..29904
|
249
|
+
# tcacaaaacagagcaacatccttgtttgcgccgtctattgggtccagagcccattgctaa
|
250
|
+
# ttgttgttttgaagctccaccaaatgttgctgttggaccaccattcatttctattaaacg
|
251
|
+
# aaaaataaatataaaattcaaaaaaaatattttaattcaccttgctgacctctcgctcct
|
252
|
+
# ccaccaccaccactcttcattctttgtcctcctttaagttcagattgtaattcaccatct
|
253
|
+
# tctccatcattatctcgaaaacctgcagaatctgcttctaattttttcagccattcatct
|
254
|
+
# ggagacatttgttgtttttgtttttgatgagaagcacttctagccattaatgcacgttta
|
255
|
+
# ggtggagattcacgacgttgttgttgatgtggagttccgtgtaatgctgaagaggaagca
|
256
|
+
# ctagccgaggctgttggattgttgagatatgaagaacttccaacatttaaagaatcatcc
|
257
|
+
# taaaaattaagaaaaaattacaattttaaaaaacaaaccgctgtattagaagaaaggtct
|
258
|
+
# tctttactattataacttctaataaggtcagaaccctctctttgtgcttccaaacgagct
|
259
|
+
# aatgacattccactacgatctcgcaatgattgtcgtctaattgcacctctagctgagaaa
|
260
|
+
# ggattttctaatgttgaaggtggttgttgaggagattcaaacttttttcttctagataaa
|
261
|
+
# aagatgaaaagaaaatataaatttaaaattactttcggctttgtcgtcgagaatcacgaa
|
262
|
+
# caccaaagtaaccaccacctggaccacgccgtctccttgcttctgtctgagcgattgttc
|
263
|
+
# taattacttgttgggttggttcatcttcagttaattcttaatttaaaaaaatatatagac
|
264
|
+
# agaaaatagttaggaaggcgataagttaaaagtatgtcaaaattctagcctttcttaaaa
|
265
|
+
# tagccatgtaaaaataaaaaagcctcccgacgtggtcacatacatattttgagaaaaatt
|
266
|
+
# aaaggtcttaagaaaatgggtgtgaaccgaaaaatcctgatattgagaaatttaattcct
|
267
|
+
# tattgttagttcctctcccctcatacttaatagcaaatatttataaattataagattcaa
|
268
|
+
# taaaataaacctaaagggctttccccagctccagtttttgcatttatatcagccccatat
|
269
|
+
# tcgcataaaagctcgattaagtctggttgattccaagaagcagctgcatgaattggttgc
|
270
|
+
# cacaaatctatatctctcaatgctggagaaacattacaacgaagaaggaaagcagcaaca
|
271
|
+
# tcataataaccattagctgctgctatatgtaactaaaaaggaataaaactgtaataatcc
|
272
|
+
# tttcataataatctttacataaatttccctttctccagaaaaacttacaaaagtagattt
|
273
|
+
# atcaacagaatttctttgatctaaaggtaatcctctttgatgtaaaattttcatatcatt
|
274
|
+
# taacatttccctttctggttgttgtcttctttcatcaatcatttcttgtgtaattcctct
|
275
|
+
# agcagccatttcagattcaataaggtcaagggtttgttcatcatcacaaatatcataagg
|
276
|
+
# catattaccatctgcatttactgctagtaaatctgcgttgcttaaaaagaaatttttaat
|
277
|
+
# atgaaagaagttttcgataaaatatgacgcaaaaataaaaatcgatgacctaaaaactgg
|
278
|
+
# atgtttacccattaaactaaaccacaaaacttacgaacttttagaagacgaattaattat
|
279
|
+
# taaatgaaaatacgtaaagattaaaaatacaaaaaatttaaaaggaagtcccgtttgttt
|
280
|
+
# aaaataaatatttctgtaaaacttaagaaatttatttaaaaaaaaaattaaaaaaaaact
|
281
|
+
# aactgtgcaataagcaatcgaacaatatcaatataagcacaacatgcagctgcgtgcaat
|
282
|
+
# ggtgtccaaagttcagtgtctttggcattaacacaagctccgtacctaagcagaagacga
|
283
|
+
# acaattctttcattattgtcaatggcacactaaaaataaatttaaaaaaattaatttggt
|
284
|
+
# ttttgcctgatgtaaaggtgttaatccatcctcattatgtgaatcagcattgacaccttc
|
285
|
+
# cattaaaagtctttcaactaaaaaataaataaattcatttaaaatcaataatctttatgt
|
286
|
+
# ataaccttcttgggtatcacccctaactgttgcctcaagtaaagcaacatctggtgtaaa
|
287
|
+
# tcgaagacgaggccggcgctgtgcaggcattggcaattgagcctcccgtgcagcattctg
|
288
|
+
# tcgaagttgttcagcacggcgttctctagctaatgcaatacgttcttgaggagttaattt
|
289
|
+
# ttcaatttctggtaattcctccaccaatgcatgatggtccat
|
290
|
+
end
|
291
|
+
|