bio-gff3 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,268 @@
1
+ # LRU based Hash by Robert Klemme
2
+ #
3
+ # Copied from http://github.com/rklemme/muppet-laboratories
4
+ #
5
+ # License: unclear (none is stated in the upstream repository); permission still needs to be requested from the author
6
+ # Added by Pjotr Prins (pjotr.prins@thebird.nl)
7
+
8
+ require 'enumerator'
9
+
10
# Hash with LRU expiry policy. There are at most max_size elements in a
# LRUHash. When adding more elements old elements are removed according
# to LRU policy.
#
# Lookup goes through an ordinary Hash (key => Node); in addition all nodes
# are chained in a doubly linked list between two sentinel nodes. The node
# directly after @head is the most recently used entry, the node directly
# before @tail is the entry that is evicted next.
#
# Reading an entry through #[], #fetch, #values_at, #assoc, #rassoc or #key
# counts as a use and moves the entry to the front. #has_key?, #has_value?
# and the iterators do not reorder anything.
class LRUHash
  include Enumerable

  # Maximum number of entries (a positive Integer, see #max_size=).
  attr_reader :max_size

  # default::      value returned by #[] for a missing key when no
  #                default_proc is set
  # default_proc:: invoked as default_proc[hash, key] by #[] for a missing key
  # release_proc:: if set, invoked as release_proc[key, value] every time an
  #                entry is removed (eviction, #delete, #delete_if, #clear,
  #                shrinking via #max_size=)
  attr_accessor :default, :default_proc, :release_proc

  # max_size::      anything whose to_i is a positive Integer
  # default_value:: value #[] returns for missing keys
  # block::         optional default proc, called with (hash, key)
  #
  # Raises ArgumentError if max_size is not usable.
  def initialize(max_size, default_value = nil, &block)
    @max_size     = normalize_max(max_size)
    @default      = default_value
    @default_proc = block
    @release_proc = nil

    @h    = {}
    @head = Node.new
    # the second sentinel is linked directly behind @head
    @tail = front(Node.new)
  end

  # Yields [key, value] for every entry, most recently used first.
  # Without a block an Enumerator is returned.
  def each_pair
    return enum_for(:each_pair) unless block_given?

    each_node { |n| yield [n.key, n.value] }
  end

  alias each each_pair

  # Yields every key, most recently used first.
  # Without a block an Enumerator is returned.
  def each_key
    return enum_for(:each_key) unless block_given?

    each_node { |n| yield n.key }
  end

  # Yields every value, most recently used first.
  # Without a block an Enumerator is returned.
  def each_value
    return enum_for(:each_value) unless block_given?

    each_node { |n| yield n.value }
  end

  # Number of entries currently stored.
  def size
    @h.size
  end

  # True if nothing is stored (the two sentinels are adjacent).
  def empty?
    @head.succ.equal? @tail
  end

  # Returns the value for key and marks the entry as most recently used.
  # For a missing key the block is called with the key; without a block
  # KeyError is raised.
  def fetch(key, &b)
    n = @h[key]
    return front(n).value if n

    (b || FETCH)[key]
  end

  # Like #fetch, but a missing key yields default_proc[self, key] if a
  # default proc is set and #default otherwise.
  def [](key)
    fetch(key) { |k| @default_proc ? @default_proc[self, k] : default }
  end

  # All keys, in the order of the underlying Hash (not in LRU order).
  def keys
    @h.keys
  end

  # All values, in the order of the underlying Hash (not in LRU order).
  def values
    @h.values.map { |n| n.value }
  end

  # True if key is present. Does not count as a use of the entry.
  def has_key?(key)
    @h.has_key? key
  end

  alias key? has_key?
  alias member? has_key?
  alias include? has_key?

  # True if some entry holds a value that is eql? to value.
  # Does not count as a use of the entry.
  def has_value?(value)
    any? { |_key, v| value.eql? v }
  end

  alias value? has_value?

  # Values for the given keys, looked up via #[] (so defaults apply and
  # every hit is moved to the front).
  def values_at(*key_list)
    key_list.map { |k| self[k] }
  end

  # Returns [key, value] for key and marks the entry as most recently used;
  # nil if the key is missing.
  def assoc(key)
    n = @h[key]
    return nil unless n

    front(n)
    [n.key, n.value]
  end

  # Returns [key, value] of the first entry (in LRU order) whose value is
  # eql? to value and marks it as most recently used; nil if there is none.
  def rassoc(value)
    each_node do |n|
      next unless value.eql? n.value

      front(n)
      return [n.key, n.value]
    end
    nil
  end

  # Key of the first entry whose value is eql? to value (see #rassoc),
  # nil if there is none.
  def key(value)
    pair = rassoc(value)
    pair && pair.first
  end

  # Stores value under key and makes the entry the most recently used one.
  # If the hash is full and key is new, the least recently used entry is
  # removed first (release_proc is invoked for it). Returns value.
  def store(key, value)
    # same optimization as in Hash
    key = key.dup.freeze if String === key && !key.frozen?

    n = @h[key]

    unless n
      if size == max_size
        # reuse node to optimize memory usage
        n = delete_oldest
        n.key = key
        n.value = value
      else
        n = Node.new key, value
      end

      @h[key] = n
    end

    front(n).value = value
  end

  alias []= store

  # Removes the entry for key and returns its value; nil if key is missing.
  def delete(key)
    n = @h[key]
    n && remove_node(n).value
  end

  # Removes every entry for which the block, called with (key, value),
  # returns a true value. Returns self. Without a block an Enumerator is
  # returned, like the each_* iterators of this class do.
  def delete_if
    return enum_for(:delete_if) unless block_given?

    each_node do |n|
      remove_node n if yield n.key, n.value
    end
  end

  # Changes the capacity. Least recently used entries are removed until the
  # hash fits the new limit. Raises ArgumentError for an unusable limit.
  def max_size=(limit)
    limit = normalize_max(limit)

    delete_oldest while size > limit

    @max_size = limit
  end

  # Removes all entries one by one, oldest first, so release_proc sees each
  # of them. Returns self.
  def clear
    delete_oldest until empty?
    self
  end

  # "{k1=>v1, k2=>v2}" in LRU order, using to_s of keys and values.
  # The result is assembled with join/interpolation instead of appending to
  # a string literal, so it also works when string literals are frozen.
  def to_s
    "{#{map { |k, v| "#{k}=>#{v}" }.join(', ')}}"
  end

  alias inspect to_s

  # Fallback used by #fetch when the key is missing and no block was given.
  FETCH = Proc.new {|k| raise KeyError, 'key not found'}

  # A single node in the doubly linked LRU list of nodes
  Node = Struct.new :key, :value, :pred, :succ do
    # Takes the node out of the list, joining its neighbours. Returns self.
    def unlink
      pred.succ = succ if pred
      succ.pred = pred if succ
      self.succ = self.pred = nil
      self
    end

    # Moves the node directly behind node. Returns self.
    def insert_after(node)
      raise 'Cannot insert after self' if equal? node
      # already in the right place
      return self if node.succ.equal? self

      unlink

      self.succ = node.succ
      self.pred = node

      node.succ.pred = self if node.succ
      node.succ = self

      self
    end
  end

  private

  # iterate nodes from most to least recently used; the successor is read
  # before yielding so the block may remove the current node
  def each_node
    n = @head.succ

    until n.equal? @tail
      succ = n.succ
      yield n
      n = succ
    end

    self
  end

  # move node to front
  def front(node)
    node.insert_after(@head)
  end

  # remove the node and invoke release_proc
  # if set
  def remove_node(node)
    n = @h.delete(node.key)
    n.unlink
    release_proc[n.key, n.value] if release_proc
    n
  end

  # remove the oldest node returning the node
  def delete_oldest
    n = @tail.pred
    raise "Cannot delete from empty hash" if @head.equal? n
    remove_node n
  end

  # Normalize the argument in order to be usable as max_size
  # criterion is that n.to_i must be an Integer and it must
  # be larger than zero.
  def normalize_max(n)
    # objects without to_i are rejected with the same error as bad numbers
    raise ArgumentError, format('Invalid max_size: %p', n) unless n.respond_to?(:to_i)

    n = n.to_i
    raise ArgumentError, format('Invalid max_size: %p', n) unless Integer === n && n > 0
    n
  end
end
@@ -0,0 +1,73 @@
1
+ # RSpec for BioRuby-GFF3-Plugin. Run with something like:
2
+ #
3
+ # ruby -I ../bioruby/lib/ ~/.gems/bin/spec spec/gff3_assemble2_spec.rb
4
+ #
5
+ # Copyright (C) 2010 Pjotr Prins <pjotr.prins@thebird.nl>
6
+ #
7
+ $: << "../lib"
8
+
9
+ require 'bio/db/gff/gffdb'
10
+
11
+ include Bio::GFFbrowser
12
+
13
# Input files of this spec; the paths are relative to the directory the
# spec is started from.
FASTAFILE2="test/data/gff/MhA1_Contig125.fa"
GFF3FILE2="test/data/gff/MhA1_Contig125.gff3"

# CDS id => expected amino acid sequence. The spec below generates one
# example per entry and compares the assembled translation against it.
# The Hash is frozen because it is a fixture that is only ever read.
PROTEINS = {
  "cds:MhA1_Contig125.frz3.gene2" =>
    "MNDLVNQFKSAALAVGQYLTPVLRESKFKETGVLTPEEFVAAGDHLVHLCPTWSWAKASDSNGQTTFLITKQSALVTQRCAQIMGYDEILKEKIIKDESAETGDEQNEWVDTHHFDFETNCAPKDFEEEENKVEDIKENNLNEEENCEEEEEGEPIDLDEYLSSGLLEEEDPARFVLQNKSLKETKDDSTSNNLLRTRRYDLHITYDKYYQVPRFWLVGYDENGSPLAVDKMKEDFSQEHADKTITLESHPHISGLTLATIHPCRHAPVMKRLIEQFQESGKELLVIDYLFVFLKFVQAVIPTVEYDYTRSIHF*",
  "cds:MhA1_Contig125.frz3.gene3" =>
    "MERRKVSNTDPFEAAEGMLRWNSDIIKDKEIKQFKGLKKPLKLSENQNDEYDVDPFEAVTDWLPLNKNVDKT*",
  "cds:MhA1_Contig125.frz3.gene4" =>
    "MKSTKMSATEIVSYHLYSLHTLSSFCLTENPENIFIKDQNFQDFFLFCERVREQFNEAEELKTPLNTKISQTDSTNIQNKKDEPSISIGPCVNDLCPKGFECIENICFKSMEMPKTERVLSIGPCVNAKCPEGFSCYEDDRQCYAN*",
  "cds:MhA1_Contig125.frz3.gene5" =>
    "MRLDIFLVIAFSLGVAVNCGVVKRSQNSYGDEAGAGGAAGAAPAAAAAPAGGEAAAPSGGETGGAAAPAEAAPAAAPAPEAAPAAAPAPEQAAPAVAQPAPAAAPSQASGYRKKRSQNSYGDEAGHAAGAAPAEAAAPAAGGEAAAPSGGETGGAAAPAETAPAAAPEAAPAAAPAPEQAAPAVAQPAPAAAPSQASGYRKKRSQNSYGDEAGAGGAAGAAPAAAAAPAGGEAAAPAAGGCTEGCAAGGESAAPAAAPAPEAAPAAAPAPEQAAPAVAQPAPAAAPSQASGYRKKRSQNSYGDEAGHAAGAAPAESAAPAAGGEAAAPSGGETGGAAAPAETAPAAAPEAAPAAAPEQAAPAVAQPAPAAAPSQASGY*",
  "cds:MhA1_Contig125.frz3.gene6" =>
    "MDNAENEEKNDKENHEKPIKFEINNQKQFFKKEDEVKECEEESKNVNEFEEDQGTNEVSEVLRLFRRGNVWAFALQNLDLMRAYVILSCLAIAVVMLANFLRNSRFFDFCLK*",
  "cds:MhA1_Contig125.frz3.gene7" =>
    "MLFCLLHPMDHNTGPLARKSSTLCSLLLLSIAALLVLAVPGQANSEEVGFGNHTKEKDGDEVTVNIDSVQAPDDLTYAVYEKRFKDVCEFVITKDDIELLYKGKGCTVELLTGENQDITFKTGVKDIGCVRNDCDKASLYSSVGEVEPGLSQSVTDGKTEFELRISGSEFNMNFEEDAPFNPQKNRCAPKQDHIVKPETWRIKNGELKDKHLLVFHLLPKTATREYTKEGKISKEQPPEEAPKCKLFIRFKRPYYEFLYVGPITTTVTTTTTTTTTPSSGLVGQGPTPKTGTHQGNTPKVQGKGSEKESDNTMMIVIIVIVVVVVVLVIGVVLIFILKNKGSKEDELQKVKQTTTKANKSSAVTL*",
  "cds:MhA1_Contig125.frz3.gene8" =>
    "MRQRESVILNKTENQTQIFEKLLNLYNSPKDVVNLRNNPEQLIQLGIDSKQFSAILEMMFGARRRNSLRGDYREARRFRNRREYSAWWDAGEVNNWRINSRHPSRHGTVEYWRCAFAVGRFFTCPSRIRITFGFGDRYVIVANARNHPHNHNRQNNAGDNNPNTVRRALPMEANERLTARTVHIGPRPSTSAPNQPTTKGQAAPPRASVSTTSANAAATPTTSASSTVQKGTAAPSTSAAPSTSAAPSTSAASRPLKPPGFATAATSATNSQQAAAKPASNQQPAPTATTSQTSASAPGTSSKPPPTTSPAPAATPAPATSQPGTSTVKSAPASTPTPLKPPAATEKQTSQPPSAAPGTTATIKPVLVTNIPGLPPGIPTSATGSGTINVSLTALDAFLAGHPRPASTSAPASSQTTPGPASQPSSAPVTQNKGKEEKKEDKKEEKKEGKKEEKKEGKKEEKRGPG*",
  "cds:MhA1_Contig125.frz3.gene9" =>
    "MHGATIGNRLRATRRSRDAQMMAAAESVARLSRRHSHQKAIRRVLPPPPALNSSRDSQPINPFCSDPSSIQPVIAKGVCVRSVGVFKSALPPSTPFPSTSTAPNIPSDNTFVPHLNNSTPLHNNHHRTLGGSENCLNYQQQYIGGSYSARSQQQHPPPPAPSSCCISPFKPLEILGNSNGTTDSSSGGCNSARAAMHRQFTGSSNGEEEFTVEELQEFAQAFKMFDKDGNGTMSIKELGVAMRTLGLNPTEDELLNMVNEYDVDGNGIDFSEFCKMMKEMNKETDQELIRLAFRVFDKDGNGYITAQEFRHFMTIDYEEFVNAVAPIVNDGAKEDAPFFEKEQPTSFGQPITSGPPLASGKAKHF*",
  "cds:MhA1_Contig125.frz3.gene21" =>
    "MDVKPPPSAPQDIKEAIKESNMSTWRPFLIGNRMRTTSEDSAESFDAYDKSFDAYDVGNKKERRLSITEQFFGSSMPGRLRSNSTTEYEGHEHEPTFKKVDLKQFMKHQRKILGDDEWQ*",
  "cds:MhA1_Contig125.frz3.gene22" =>
    "MAKALISGFVSSGFISKSNISICTRSEATAKSWRLQGFTSAYSKDVFYSEVKKPRAIILIAVKPQIFPSFINEVKANEWFYFGVPGILCISIMSGISLQHFDKEMKSVGFDGHSMRLMPNVNCAVSTGTLVLSADPETPQELVTLVSVLSSYVGKCIRVDEAHFNAASSISGCGPAFIALVIEALADGGVVAGLSRELANQLAADTVKGTGHLFMTKMASVSPTSDNPSPAQLKDQVCSPAGTTIEGVRELEKHGVRSAFIEAIQASTRRAFELSQ*"
}.freeze
37
+
38
# Parses the Contig125 GFF3/FASTA pair once and then checks, for every
# entry of PROTEINS, that the CDS records assemble into the expected
# amino acid sequence.
describe GFFdb, "Assemble CDS (Contig125)" do
  before :all do
    # gffdb = Bio::GFFbrowser::GFFdb.new(GFF3FILE2, :fasta_filename => FASTAFILE2,:cache_components => :cache_none, :cache_records => :cache_none)
    @gff = Bio::GFFbrowser::GFFdb.new(GFF3FILE2, :fasta_filename => FASTAFILE2).assembler
    @gff.parse
    @contigsequence = @gff.sequencelist["MhA1_Contig125"]

    # index container component and CDS records by CDS id
    @componentlist = {}
    @cdslist = {}
    @gff.each_CDS do |cds_id, records, container|
      @componentlist[cds_id] = container
      @cdslist[cds_id] = records
    end
  end

  it "should have the single contig" do
    @gff.sequencelist.size.should == 1
    contig = @gff.sequencelist["MhA1_Contig125"]
    contig.should_not == nil
    contig.size.should == 53702
  end

  # one generated example per expected protein
  PROTEINS.each do |cds_id, expected|
    it "should translate gene #{cds_id}" do
      records = @cdslist[cds_id]
      container = @componentlist[cds_id]
      records[0].seqname.should == 'MhA1_Contig125'
      # ntseq = @gff.assemble(@contigsequence,component.start,recs,:codonize=>true)
      # p [name, ntseq]
      translated = @gff.assembleAA(@contigsequence, container.start, records)
      translated.should == expected
    end
  end
end
71
+
72
+
73
+
@@ -0,0 +1,62 @@
1
+ # RSpec for BioRuby-GFF3-Plugin. Run with something like:
2
+ #
3
+ # ruby -I ../bioruby/lib/ ~/.gems/bin/spec spec/gff3_assemble3_spec.rb
4
+ #
5
+ # Copyright (C) 2010 Pjotr Prins <pjotr.prins@thebird.nl>
6
+ #
7
+ $: << "../lib"
8
+
9
+ require 'bio/db/gff/gffdb'
10
+
11
+ include Bio::GFFbrowser
12
+
13
# GFF3 input of this spec (no separate FASTA file is passed to GFFdb here).
GFF3FILE3="test/data/gff/test-cds.gff3"

# Extra CDS assembly checks on test-cds.gff3. Each example looks up one CDS
# id and compares the assembled nucleotide and amino acid sequences with
# literal expectations.
describe GFFdb, "Assemble CDS (extra checks)" do
  before :all do
    gffdb = Bio::GFFbrowser::GFFdb.new(GFF3FILE3)
    @gff = gffdb.assembler
    @gff.parse
  end

  # Indexes container components and CDS records by CDS id. Both examples
  # used to carry an identical copy of this loop; it is called at the same
  # point in each example as before.
  def collect_cds
    @componentlist = {}
    @cdslist = {}
    @gff.each_CDS do | id, reclist, component |
      @componentlist[id] = component
      @cdslist[id] = reclist
    end
  end

  it "should translate gene MhA1_Contig1040.frz3.gene29" do
    @contigsequence = @gff.sequencelist["MhA1_Contig1040"]
    collect_cds
    # NOTE(review): the example title says gene29 but the id looked up here
    # ends in ".gene" - confirm against the ids in test-cds.gff3
    name = "cds:MhA1_Contig1040.frz3.gene"
    recs = @cdslist[name]
    component = @componentlist[name]
    ntseq = @gff.assemble(@contigsequence,component.start,recs,:raw=>true)
    ntseq.should == "TTAATTAATTTGCCTAGAAAAACAAAGGCATAACATGCTTGCAGTCATCATACGGTAAGAGAGAAACCAACGATATGTTAATAATGTTGATGGGGGAATATCCTCATTAGAATTCTTTTTTGGGTGAATTGAAATTGCCATATTATTAGTATTATTAGAAAATATTAAATTTGTTGATAA"
    ntseq = @gff.assemble(@contigsequence,component.start,recs,:codonize=>true)
    ntseq.should == "TTATCAACAAATTTAATATTTTCTAATAATACTAATAATATGGCAATTTCAATTCACCCAAAAAAGAATTCTAATGAGGATATTCCCCCATCAACATTATTAACATATCGTTGGTTTCTCTCTTACCGTATGATGACTGCAAGCATGTTATGCCTTTGTTTTTCTAGGCAAATTAATTAA"
    aaseq = @gff.assembleAA(@contigsequence,component.start,recs)
    aaseq.should == "LSTNLIFSNNTNNMAISIHPKKNSNEDIPPSTLLTYRWFLSYRMMTASMLCLCFSRQIN*"
  end

  it "should translate gene MhA1_Contig2992.frz3.gene1" do
    @contigsequence = @gff.sequencelist["MhA1_Contig2992"]
    collect_cds
    name = "cds:MhA1_Contig2992.frz3.gene1"
    recs = @cdslist[name]
    component = @componentlist[name]
    # ntseq = @gff.assemble(@contigsequence,component.start,recs,:raw=>true)
    # ntseq.should == ""
    ntseq = @gff.assemble(@contigsequence,component.start,recs,:codonize=>true)
    ntseq.should == "AAAATTAATAAAAAAATAAATGATAATTCTTTTAATATTCAATCTGATTCGAATGAAAATTTGTTTAATGATGGAATTAATTCTGAACAAAATGAAGACAATATAGCAACAAAAAAAGGCAACAAAAAATTCGGTAAAAATCAAAAAGAAGGAAATAAAGAGTTGGATATTCAAAGTGAAGGTTTTGATAATAATGAAATACCTTCAAAAGAAAGCAAAAAACAAATAAGTAATTTTGGGGATAATGAAAGTGAATATGAAAAAGAAGAGGATAATAGAAAAAAGAAAGGGAAAAAAGGAATGATAGAAAAGTATGAATTAGGAAGGAATAAAGGAAGGGATAAAAATGAAAGAAATAAGGCTTCTGAAAGGTTTGATGAGCAGAATCAAGACAGAAATAATCAACGTGATAGTTTTGATTCTGGCAATAATGATAAATCACAAAGAGGCTTAGATAGCGGCACATTAGATGGAACAAATAATTTAAAAAGATCGAATGATGATCAATTACCAGAATTTTTGAAAACGGCCAGTCTCTCAGAGCGTCAGAAATTTCTTCAACTTGAAGCAGAAAATGACAGGTCCAAGTCTTCTATACGAAGAGATAAACAGAATTGGGCTGATCAACAAGGGCAGAGAATTTCTGATCTTTATAAACAATTTCAACAATCTTTACAACAAAAAGAAAAACAATTTAAAAGTGAACGTCAACGAAATGTTCAAATTAAATTAAGCAGAAATGCACAGAATGTTGATAAAAGAATTCAGGATCTTCTGAATAATCCTGATATTGCTGAAAGAGCTTTAATTCTTCAAATTGAACAAATCCTCGGCGGTACAGACGATAGTATTCGTCAGGAATTACAAAGACAAATATCTGTTATTGGACCATTAGATGGAAATATACCGCCAAATCTTACATAG"
    aaseq = @gff.assembleAA(@contigsequence,component.start,recs)
    aaseq.should == "KINKKINDNSFNIQSDSNENLFNDGINSEQNEDNIATKKGNKKFGKNQKEGNKELDIQSEGFDNNEIPSKESKKQISNFGDNESEYEKEEDNRKKKGKKGMIEKYELGRNKGRDKNERNKASERFDEQNQDRNNQRDSFDSGNNDKSQRGLDSGTLDGTNNLKRSNDDQLPEFLKTASLSERQKFLQLEAENDRSKSSIRRDKQNWADQQGQRISDLYKQFQQSLQQKEKQFKSERQRNVQIKLSRNAQNVDKRIQDLLNNPDIAERALILQIEQILGGTDDSIRQELQRQISVIGPLDGNIPPNLT*"
  end
end
60
+
61
+
62
+
@@ -0,0 +1,291 @@
1
+ # RSpec for BioRuby-GFF3-Plugin. Run with something like:
2
+ #
3
+ # ruby -I ../bioruby/lib/ ~/.gems/bin/spec spec/gff3_assemble_spec.rb
4
+ #
5
+ # Copyright (C) 2010 Pjotr Prins <pjotr.prins@thebird.nl>
6
+ #
7
+ $: << "../lib"
8
+
9
+ require 'bio/db/gff/gffdb'
10
+
11
+ include Bio::GFFbrowser
12
+
13
# Input files of this spec; the paths are relative to the directory the
# spec is started from.
FASTAFILE = "test/data/gff/MhA1_Contig1133.fa"
GFF3FILE  = "test/data/gff/MhA1_Contig1133.gff3"
15
+
16
+ describe GFFdb, "Assemble CDS" do
17
# Shared setup: parse the GFF3/FASTA pair once, keep the contig sequence and
# index container components and CDS records by CDS id.
before :all do
  @gff = Bio::GFFbrowser::GFFdb.new(GFF3FILE, :fasta_filename => FASTAFILE).assembler
  @gff.parse
  @contigsequence = @gff.sequencelist["MhA1_Contig1133"]

  @componentlist = {}
  @cdslist = {}
  @gff.each_CDS do |cds_id, records, container|
    @componentlist[cds_id] = container
    @cdslist[cds_id] = records
  end
end
29
+
30
# The parsed input holds exactly one sequence, the 33905 long contig.
it "should have the single contig" do
  @gff.sequencelist.size.should == 1
  contig = @gff.sequencelist["MhA1_Contig1133"]
  contig.should_not == nil
  contig.size.should == 33905
end
35
# The component recorded for gene4 spans 7838..8740.
it "should have a container component" do
  container = @componentlist['cds:MhA1_Contig1133.frz3.gene4']
  container.start.should == 7838
  container.end.should == 8740
end
40
# Coordinates, frame and sequence name of the first CDS record of gene4.
it "should have CDS 7838:7980" do
  first_cds = @cdslist['cds:MhA1_Contig1133.frz3.gene4'][0]
  first_cds.start.should == 7838
  first_cds.end.should == 7980
  first_cds.frame.should == 0
  first_cds.seqname.should == 'MhA1_Contig1133'
end
48
# Coordinates, frame, strand and sequence name of the second CDS record
# of gene4.
it "should have CDS 8065:8308" do
  second_cds = @cdslist['cds:MhA1_Contig1133.frz3.gene4'][1]
  second_cds.start.should == 8065
  second_cds.end.should == 8308
  second_cds.frame.should == 1
  second_cds.strand.should == '+'
  second_cds.seqname.should == 'MhA1_Contig1133'
end
57
+ # From Wormbase website http://www.wormbase.org/db/gb2/gbrowse/m_hapla/?name=MhA1_Contig1133%3A7838..8740
58
+ # >MhA1_Contig1133:7838..8740
59
+ # atgcgtcctttaacagatgaagaaactgaaaagtttttcaaaaaactttcaaattatatt
60
+ # ggtgacaatattaaacttttattggaaagagaagatggagaatatgtttttcgtttacat
61
+ # aaagacagagtttattattgcaggtttttttaaaattattttatatttaaattaggtctc
62
+ # aatctttataggggattttgtttttgttatttttttttggtttttag>tgaaaaattaatg
63
+ # cgacaagcagcatgtattggacgtaaacaattgggatcttttggaacttgtttgggtaaa
64
+ # ttcacaaaaggagggtctttctttcttcatataacatcattggattatttggcaccttat
65
+ # gctttagcaaaaatttggttaaaaccacaagctgaacaacaatttttatatggaaataat
66
+ # attgttaaatctggtgttggaagaatgagtgaagggattgaagaaaaacaagtaaatatt
67
+ # taattattttttttaaaatggattcctttacttctcaattaaatattaaaagcatatctg
68
+ # tagaagaggttatttatctttaaatcgaaatatacaggaataaataaaaatttaagaaat
69
+ # cataatttagaattctttttctggttatgttagattatttttaaatttttttgtaatttt
70
+ # tttttcgtaatttttttatgagcaaatcccttctctcttaaatattttaataaaaatcta
71
+ # attttataaattataattattttttagggtattattatttataatatgtcagatttacca
72
+ # ttgggttttggagtggctgcaaagggaacattatcttgtagaaaagtagatcctacagct
73
+ # ttagttgttttacatcaatcagatttgggtgaatatattcgaaatgaagagggattaatt
74
+
75
# Assembles only the first CDS record of gene4 and checks both the
# nucleotide sequence and its translation.
it "should translate CDS 7838:7980 (in frame 0, + strand)" do
  gene_id = 'cds:MhA1_Contig1133.frz3.gene4'
  records = @cdslist[gene_id]
  container = @componentlist[gene_id]
  first_cds = records[0]
  first_cds.seqname.should == 'MhA1_Contig1133'

  nt = @gff.assemble(@contigsequence, container.start, [first_cds])
  nt.size.should == 143
  nt.should == "ATGCGTCCTTTAACAGATGAAGAAACTGAAAAGTTTTTCAAAAAACTTTCAAATTATATTGGTGACAATATTAAACTTTTATTGGAAAGAGAAGATGGAGAATATGTTTTTCGTTTACATAAAGACAGAGTTTATTATTGCAG"

  aa = @gff.assembleAA(@contigsequence, container.start, [first_cds])
  aa.should == "MRPLTDEETEKFFKKLSNYIGDNIKLLLEREDGEYVFRLHKDRVYYC"
end
86
# Assembles only the second CDS record of gene4: first with :phase => false
# (244 bases), then with the default options, then translated.
it "should translate CDS 8065:8308 (in frame 1, + strand)" do
  gene_id = 'cds:MhA1_Contig1133.frz3.gene4'
  records = @cdslist[gene_id]
  container = @componentlist[gene_id]
  second_cds = records[1]

  nt = @gff.assemble(@contigsequence, container.start, [second_cds], :phase => false)
  nt.size.should == 244
  nt.should == "TGAAAAATTAATGCGACAAGCAGCATGTATTGGACGTAAACAATTGGGATCTTTTGGAACTTGTTTGGGTAAATTCACAAAAGGAGGGTCTTTCTTTCTTCATATAACATCATTGGATTATTTGGCACCTTATGCTTTAGCAAAAATTTGGTTAAAACCACAAGCTGAACAACAATTTTTATATGGAAATAATATTGTTAAATCTGGTGTTGGAAGAATGAGTGAAGGGATTGAAGAAAAACAA"

  nt = @gff.assemble(@contigsequence, container.start, [second_cds])
  nt.should == "GAAAAATTAATGCGACAAGCAGCATGTATTGGACGTAAACAATTGGGATCTTTTGGAACTTGTTTGGGTAAATTCACAAAAGGAGGGTCTTTCTTTCTTCATATAACATCATTGGATTATTTGGCACCTTATGCTTTAGCAAAAATTTGGTTAAAACCACAAGCTGAACAACAATTTTTATATGGAAATAATATTGTTAAATCTGGTGTTGGAAGAATGAGTGAAGGGATTGAAGAAAAACAA"

  aa = @gff.assembleAA(@contigsequence, container.start, [second_cds])
  # note it should handle the frame shift and direction!
  aa.should == "EKLMRQAACIGRKQLGSFGTCLGKFTKGGSFFLHITSLDYLAPYALAKIWLKPQAEQQFLYGNNIVKSGVGRMSEGIEEKQ"
end
99
# Assembles only the third CDS record of gene4 with :phase => false and
# checks the nucleotide sequence and the translation.
it "should translate CDS3 (in frame 0, + strand)" do
  gene_id = 'cds:MhA1_Contig1133.frz3.gene4'
  records = @cdslist[gene_id]
  container = @componentlist[gene_id]
  third_cds = records[2]

  nt = @gff.assemble(@contigsequence, container.start, [third_cds], :phase => false)
  nt.size.should == 156
  nt.should == "GGTATTATTATTTATAATATGTCAGATTTACCATTGGGTTTTGGAGTGGCTGCAAAGGGAACATTATCTTGTAGAAAAGTAGATCCTACAGCTTTAGTTGTTTTACATCAATCAGATTTGGGTGAATATATTCGAAATGAAGAGGGATTAATTTAA"

  aa = @gff.assembleAA(@contigsequence, container.start, [third_cds])
  # note it should handle the frame shift and direction!
  aa.should == "GIIIYNMSDLPLGFGVAAKGTLSCRKVDPTALVVLHQSDLGEYIRNEEGLI*"
end
110
# Assembles all CDS records of gene4: with :phase => false, with the
# default options, and translated into the protein sequence.
it "should assemble 3 CDSs for MhA1_Contig1133.frz3.gene4" do
  gene_id = 'cds:MhA1_Contig1133.frz3.gene4'
  records = @cdslist[gene_id]
  container = @componentlist[gene_id]

  nt = @gff.assemble(@contigsequence, container.start, records, :phase=>false)
  nt.size.should == 543
  nt.should == "ATGCGTCCTTTAACAGATGAAGAAACTGAAAAGTTTTTCAAAAAACTTTCAAATTATATTGGTGACAATATTAAACTTTTATTGGAAAGAGAAGATGGAGAATATGTTTTTCGTTTACATAAAGACAGAGTTTATTATTGCAGTGAAAAATTAATGCGACAAGCAGCATGTATTGGACGTAAACAATTGGGATCTTTTGGAACTTGTTTGGGTAAATTCACAAAAGGAGGGTCTTTCTTTCTTCATATAACATCATTGGATTATTTGGCACCTTATGCTTTAGCAAAAATTTGGTTAAAACCACAAGCTGAACAACAATTTTTATATGGAAATAATATTGTTAAATCTGGTGTTGGAAGAATGAGTGAAGGGATTGAAGAAAAACAAGGTATTATTATTTATAATATGTCAGATTTACCATTGGGTTTTGGAGTGGCTGCAAAGGGAACATTATCTTGTAGAAAAGTAGATCCTACAGCTTTAGTTGTTTTACATCAATCAGATTTGGGTGAATATATTCGAAATGAAGAGGGATTAATTTAA"

  nt = @gff.assemble(@contigsequence, container.start, records)
  nt.size.should == 543
  nt.should == "ATGCGTCCTTTAACAGATGAAGAAACTGAAAAGTTTTTCAAAAAACTTTCAAATTATATTGGTGACAATATTAAACTTTTATTGGAAAGAGAAGATGGAGAATATGTTTTTCGTTTACATAAAGACAGAGTTTATTATTGCAGTGAAAAATTAATGCGACAAGCAGCATGTATTGGACGTAAACAATTGGGATCTTTTGGAACTTGTTTGGGTAAATTCACAAAAGGAGGGTCTTTCTTTCTTCATATAACATCATTGGATTATTTGGCACCTTATGCTTTAGCAAAAATTTGGTTAAAACCACAAGCTGAACAACAATTTTTATATGGAAATAATATTGTTAAATCTGGTGTTGGAAGAATGAGTGAAGGGATTGAAGAAAAACAAGGTATTATTATTTATAATATGTCAGATTTACCATTGGGTTTTGGAGTGGCTGCAAAGGGAACATTATCTTGTAGAAAAGTAGATCCTACAGCTTTAGTTGTTTTACATCAATCAGATTTGGGTGAATATATTCGAAATGAAGAGGGATTAATTTAA"

  aa = @gff.assembleAA(@contigsequence, container.start, records)
  aa.should == "MRPLTDEETEKFFKKLSNYIGDNIKLLLEREDGEYVFRLHKDRVYYCSEKLMRQAACIGRKQLGSFGTCLGKFTKGGSFFLHITSLDYLAPYALAKIWLKPQAEQQFLYGNNIVKSGVGRMSEGIEEKQGIIIYNMSDLPLGFGVAAKGTLSCRKVDPTALVVLHQSDLGEYIRNEEGLI*"
end
122
+ # > class=Sequence position=MhA1_Contig1133:27463..29904 (- strand); shown in frame 1!
123
+ # ATGGACCATC ATGCATTGGT GGAGGAATTA CCAGAAATTG AAAAATTAAC TCCTCAAGAA CGTATTGCAT TAGCTAGAGA
124
+ # ACGCCGTGCT GAACAACTTC GACAGAATGC TGCACGGGAG GCTCAATTGC CAATGCCTGC ACAGCGCCGG CCTCGTCTTC
125
+ # GATTTACACC AGATGTTGCT TTACTTGAGG CAACAGTTAG GGGTGATACC CAAGAAGGTT ATACATAAAG ATTATTGATT
126
+ # TTAAATGAAT TTATTTATTT TTTAGTTGAA AGACTTTTAA TGGAAGGTGT CAATGCTGAT TCACATAATG AGGATGGATT
127
+ # AACACCTTTA CATCAGGCAA AAACCAAATT AATTTTTTTA AATTTATTTT TAGTGTGCCA TTGACAATAA TGAAAGAATT
128
+ # GTTCGTCTTC TGCTTAGGTA CGGAGCTTGT GTTAATGCCA AAGACACTGA ACTTTGGACA CCATTGCACG CAGCTGCATG
129
+ # TTGTGCTTAT ATTGATATTG TTCGATTGCT TATTGCACAG TTAGTTTTTT TTTAATTTTT TTTTTAAATA AATTTCTTAA
130
+ # GTTTTACAGA AATATTTATT TTAAACAAAC GGGACTTCCT TTTAAATTTT TTGTATTTTT AATCTTTACG TATTTTCATT
131
+ # TAATAATTAA TTCGTCTTCT AAAAGTTCGT AAGTTTTGTG GTTTAGTTTA ATGGGTAAAC ATCCAGTTTT TAGGTCATCG
132
+ # ATTTTTATTT TTGCGTCATA TTTTATCGAA AACTTCTTTC ATATTAAAAA TTTCTTTTTA AGCAACGCAG ATTTACTAGC
133
+ # AGTAAATGCA GATGGTAATA TGCCTTATGA TATTTGTGAT GATGAACAAA CCCTTGACCT TATTGAATCT GAAATGGCTG
134
+ # CTAGAGGAAT TACACAAGAA ATGATTGATG AAAGAAGACA ACAACCAGAA AGGGAAATGT TAAATGATAT GAAAATTTTA
135
+ # CATCAAAGAG GATTACCTTT AGATCAAAGA AATTCTGTTG ATAAATCTAC TTTTGTAAGT TTTTCTGGAG AAAGGGAAAT
136
+ # TTATGTAAAG ATTATTATGA AAGGATTATT ACAGTTTTAT TCCTTTTTAG TTACATATAG CAGCAGCTAA TGGTTATTAT
137
+ # GATGTTGCTG CTTTCCTTCT TCGTTGTAAT GTTTCTCCAG CATTGAGAGA TATAGATTTG TGGCAACCAA TTCATGCAGC
138
+ # TGCTTCTTGG AATCAACCAG ACTTAATCGA GCTTTTATGC GAATATGGGG CTGATATAAA TGCAAAAACT GGAGCTGGGG
139
+ # AAAGCCCTTT AGGTTTATTT TATTGAATCT TATAATTTAT AAATATTTGC TATTAAGTAT GAGGGGAGAG GAACTAACAA
140
+ # TAAGGAATTA AATTTCTCAA TATCAGGATT TTTCGGTTCA CACCCATTTT CTTAAGACCT TTAATTTTTC TCAAAATATG
141
+ # TATGTGACCA CGTCGGGAGG CTTTTTTATT TTTACATGGC TATTTTAAGA AAGGCTAGAA TTTTGACATA CTTTTAACTT
142
+ # ATCGCCTTCC TAACTATTTT CTGTCTATAT ATTTTTTTAA ATTAAGAATT AACTGAAGAT GAACCAACCC AACAAGTAAT
143
+ # TAGAACAATC GCTCAGACAG AAGCAAGGAG ACGGCGTGGT CCAGGTGGTG GTTACTTTGG TGTTCGTGAT TCTCGACGAC
144
+ # AAAGCCGAAA GTAATTTTAA ATTTATATTT TCTTTTCATC TTTTTATCTA GAAGAAAAAA GTTTGAATCT CCTCAACAAC
145
+ # CACCTTCAAC ATTAGAAAAT CCTTTCTCAG CTAGAGGTGC AATTAGACGA CAATCATTGC GAGATCGTAG TGGAATGTCA
146
+ # TTAGCTCGTT TGGAAGCACA AAGAGAGGGT TCTGACCTTA TTAGAAGTTA TAATAGTAAA GAAGACCTTT CTTCTAATAC
147
+ # AGCGGTTTGT TTTTTAAAAT TGTAATTTTT TCTTAATTTT TAGGATGATT CTTTAAATGT TGGAAGTTCT TCATATCTCA
148
+ # ACAATCCAAC AGCCTCGGCT AGTGCTTCCT CTTCAGCATT ACACGGAACT CCACATCAAC AACAACGTCG TGAATCTCCA
149
+ # CCTAAACGTG CATTAATGGC TAGAAGTGCT TCTCATCAAA AACAAAAACA ACAAATGTCT CCAGATGAAT GGCTGAAAAA
150
+ # ATTAGAAGCA GATTCTGCAG GTTTTCGAGA TAATGATGGA GAAGATGGTG AATTACAATC TGAACTTAAA GGAGGACAAA
151
+ # GAATGAAGAG TGGTGGTGGT GGAGGAGCGA GAGGTCAGCA AGGTGAATTA AAATATTTTT TTTGAATTTT ATATTTATTT
152
+ # TTCGTTTAAT AGAAATGAAT GGTGGTCCAA CAGCAACATT TGGTGGAGCT TCAAAACAAC AATTAGCAAT GGGCTCTGGA
153
+ # CCCAATAGAC GGCGCAAACA AGGATGTTGC TCTGTTTTGT GA
154
# Takes the record at index 5 of gene11 ('-' strand, frame 1) and checks
# the assembly with :phase/:reverse, with :complement added, and the
# resulting translation.
it "should assemble a reverse CDS in MhA1_Contig1133.frz3.gene11" do
  gene_id = 'cds:MhA1_Contig1133.frz3.gene11'
  records = @cdslist[gene_id]
  records.size.should == 8
  container = @componentlist[gene_id]
  # 193 bp from MhA1_Contig1133:27,981..28,173
  # >MhA1_Contig1133:27981..28173
  # cgctgtattagaagaaaggtcttctttactattataacttctaataaggtcagaaccctc
  # tctttgtgcttccaaacgagctaatgacattccactacgatctcgcaatgattgtcgtct
  # aattgcacctctagctgagaaaggattttctaatgttgaaggtggttgttgaggagattc
  # aaacttttttctt
  reverse_cds = records[5]
  reverse_cds.start.should == 27981
  reverse_cds.frame.should == 1
  reverse_cds.strand.should == '-'

  nt = @gff.assemble(@contigsequence, container.start, [reverse_cds], :phase=>true, :reverse=>true)
  nt.should == "TCTTTTTTCAAACTTAGAGGAGTTGTTGGTGGAAGTTGTAATCTTTTAGGAAAGAGTCGATCTCCACGTTAATCTGCTGTTAGTAACGCTCTAGCATCACCTTACAGTAATCGAGCAAACCTTCGTGTTTCTCTCCCAAGACTGGAATAATCTTCAATATTATCATTTCTTCTGGAAAGAAGATTATGTCGC"
  nt.size.should == 192

  nt = @gff.assemble(@contigsequence, container.start, [reverse_cds], :phase=>true, :reverse=>true, :complement=>true)
  nt.should == "AGAAAAAAGTTTGAATCTCCTCAACAACCACCTTCAACATTAGAAAATCCTTTCTCAGCTAGAGGTGCAATTAGACGACAATCATTGCGAGATCGTAGTGGAATGTCATTAGCTCGTTTGGAAGCACAAAGAGAGGGTTCTGACCTTATTAGAAGTTATAATAGTAAAGAAGACCTTTCTTCTAATACAGCG"
  nt.size.should == 192

  aa = @gff.assembleAA(@contigsequence, container.start, [reverse_cds])
  # note it should handle the frame shift and direction!
  # >EMBOSS_001_4
  # RKKFESPQQPPSTLENPFSARGAIRRQSLRDRSGMSLARLEAQREGSDLIRSYNSKEDLSSNTA
  aa.should == "RKKFESPQQPPSTLENPFSARGAIRRQSLRDRSGMSLARLEAQREGSDLIRSYNSKEDLSSNTA"
end
180
# Same record as in the previous example (index 5 of gene11), here cloned
# and assembled with :complement only, then translated.
it "should take the 6th CDS in MhA1_Contig1133.frz3.gene11 (which is 3rd on DNA)" do
  # >MhA1_Contig1133:27981..28173
  # cgctgtattagaagaaaggtcttctttactattataacttctaataaggtcagaaccctc
  # tctttgtgcttccaaacgagctaatgacattccactacgatctcgcaatgattgtcgtct
  # aattgcacctctagctgagaaaggattttctaatgttgaaggtggttgttgaggagattc
  # aaacttttttctt
  gene_id = 'cds:MhA1_Contig1133.frz3.gene11'
  records = @cdslist[gene_id]
  container = @componentlist[gene_id]
  sixth_cds = records[5].clone
  # p cds2
  sixth_cds.start.should == 27981
  sixth_cds.frame.should == 1
  sixth_cds.strand.should == '-'

  nt = @gff.assemble(@contigsequence, container.start, [sixth_cds], :complement=>true)
  nt.should == "GCGACATAATCTTCTTTCCAGAAGAAATGATAATATTGAAGATTATTCCAGTCTTGGGAGAGAAACACGAAGGTTTGCTCGATTACTGTAAGGTGATGCTAGAGCGTTACTAACAGCAGATTAACGTGGAGATCGACTCTTTCCTAAAAGATTACAACTTCCACCAACAACTCCTCTAAGTTTGAAAAAAGAA"

  aa = @gff.assembleAA(@contigsequence, container.start, [sixth_cds])
  # note it should handle the frame shift and direction!
  # >27981..28173_4 RKKFESPQQPPSTLENPFSARGAIRRQSLRDRSGMSLARLEAQREGSDLIRSYNSKEDLSSNTA
  aa.should == "RKKFESPQQPPSTLENPFSARGAIRRQSLRDRSGMSLARLEAQREGSDLIRSYNSKEDLSSNTA"
end
200
# First record of gene11 ('-' strand, frame 0): raw assembly, codonized
# assembly and translation.
it "should assemble the 1st reverse CDS in MhA1_Contig1133.frz3.gene11" do
  gene_id = 'cds:MhA1_Contig1133.frz3.gene11'
  records = @cdslist[gene_id]
  container = @componentlist[gene_id]
  first_cds = records[0].clone
  first_cds.start.should == 29710
  first_cds.frame.should == 0
  first_cds.strand.should == '-'

  nt = @gff.assemble(@contigsequence, container.start, [first_cds], :raw=>true)
  nt.size.should == 195
  nt.should == "TGTTGCCTCAAGTAAAGCAACATCTGGTGTAAATCGAAGACGAGGCCGGCGCTGTGCAGGCATTGGCAATTGAGCCTCCCGTGCAGCATTCTGTCGAAGTTGTTCAGCACGGCGTTCTCTAGCTAATGCAATACGTTCTTGAGGAGTTAATTTTTCAATTTCTGGTAATTCCTCCACCAATGCATGATGGTCCAT"

  nt = @gff.assemble(@contigsequence, container.start, [first_cds], :codonize=>true)
  nt.should == "ATGGACCATCATGCATTGGTGGAGGAATTACCAGAAATTGAAAAATTAACTCCTCAAGAACGTATTGCATTAGCTAGAGAACGCCGTGCTGAACAACTTCGACAGAATGCTGCACGGGAGGCTCAATTGCCAATGCCTGCACAGCGCCGGCCTCGTCTTCGATTTACACCAGATGTTGCTTTACTTGAGGCAACA"

  aa = @gff.assembleAA(@contigsequence, container.start, [first_cds])
  aa.should == "MDHHALVEELPEIEKLTPQERIALARERRAEQLRQNAAREAQLPMPAQRRPRLRFTPDVALLEAT"
end
215
# Third record of gene11 ('-' strand, frame 1): raw assembly, codonized
# assembly and translation.
it "should assemble the 3rd reverse CDS in MhA1_Contig1133.frz3.gene11" do
  gene_id = 'cds:MhA1_Contig1133.frz3.gene11'
  records = @cdslist[gene_id]
  container = @componentlist[gene_id]
  third_cds = records[2].clone
  # CAACGCAG ATTTACTAGC AGTAAATGCA GATGGTAATA TGCCTTATGA TATTTGTGAT GATGAACAAA CCCTTGACCT TATTGAATCT GAAATGGCTG CTAGAGGAAT TACACAAGAA ATGATTGATG AAAGAAGACA ACAACCAGAA AGGGAAATGT TAAATGATAT GAAAATTTTA CATCAAAGAG GATTACCTTT AGATCAAAGA AATTCTGTTG ATAAATCTAC TTTTGTAAGT TTTTCTGGAG AAAGGGAAAT TTAT
  # p cds2
  third_cds.frame.should == 1
  third_cds.strand.should == '-'

  nt = @gff.assemble(@contigsequence, container.start, [third_cds], :raw=>true)
  nt.should == "ATAAATTTCCCTTTCTCCAGAAAAACTTACAAAAGTAGATTTATCAACAGAATTTCTTTGATCTAAAGGTAATCCTCTTTGATGTAAAATTTTCATATCATTTAACATTTCCCTTTCTGGTTGTTGTCTTCTTTCATCAATCATTTCTTGTGTAATTCCTCTAGCAGCCATTTCAGATTCAATAAGGTCAAGGGTTTGTTCATCATCACAAATATCATAAGGCATATTACCATCTGCATTTACTGCTAGTAAATCTGCGTTG"

  nt = @gff.assemble(@contigsequence, container.start, [third_cds], :codonize=>true)
  nt.should == "AACGCAGATTTACTAGCAGTAAATGCAGATGGTAATATGCCTTATGATATTTGTGATGATGAACAAACCCTTGACCTTATTGAATCTGAAATGGCTGCTAGAGGAATTACACAAGAAATGATTGATGAAAGAAGACAACAACCAGAAAGGGAAATGTTAAATGATATGAAAATTTTACATCAAAGAGGATTACCTTTAGATCAAAGAAATTCTGTTGATAAATCTACTTTTGTAAGTTTTTCTGGAGAAAGGGAAATTTAT"

  # cds1.frame = 1
  aa = @gff.assembleAA(@contigsequence, container.start, [third_cds])
  # note it should handle the frame shift and direction!
  aa.should == "NADLLAVNADGNMPYDICDDEQTLDLIESEMAARGITQEMIDERRQQPEREMLNDMKILHQRGLPLDQRNSVDKSTFVSFSGEREIY"
end
232
# Assembles all CDS records of gene11 with :phase, :reverse and :complement
# and checks the full nucleotide sequence and the full protein.
it "should assemble the protein sequence for MhA1_Contig1133.frz3.gene11" do
  gene_id = 'cds:MhA1_Contig1133.frz3.gene11'
  records = @cdslist[gene_id]
  container = @componentlist[gene_id]

  nt = @gff.assemble(@contigsequence, container.start, records, :phase=>true, :reverse=>true, :complement=>true)
  nt.should == "ATGGACCATCATGCATTGGTGGAGGAATTACCAGAAATTGAAAAATTAACTCCTCAAGAACGTATTGCATTAGCTAGAGAACGCCGTGCTGAACAACTTCGACAGAATGCTGCACGGGAGGCTCAATTGCCAATGCCTGCACAGCGCCGGCCTCGTCTTCGATTTACACCAGATGTTGCTTTACTTGAGGCAACATGTGCCATTGACAATAATGAAAGAATTGTTCGTCTTCTGCTTAGGTACGGAGCTTGTGTTAATGCCAAAGACACTGAACTTTGGACACCATTGCACGCAGCTGCATGTTGTGCTTATATTGATATTGTTCGATTGCTTATTGCACACAACGCAGATTTACTAGCAGTAAATGCAGATGGTAATATGCCTTATGATATTTGTGATGATGAACAAACCCTTGACCTTATTGAATCTGAAATGGCTGCTAGAGGAATTACACAAGAAATGATTGATGAAAGAAGACAACAACCAGAAAGGGAAATGTTAAATGATATGAAAATTTTACATCAAAGAGGATTACCTTTAGATCAAAGAAATTCTGTTGATAAATCTACTTTTGTAAGTTTTTCTGGAGAAAGGGAAATTTATTTACATATAGCAGCAGCTAATGGTTATTATGATGTTGCTGCTTTCCTTCTTCGTTGTAATGTTTCTCCAGCATTGAGAGATATAGATTTGTGGCAACCAATTCATGCAGCTGCTTCTTGGAATCAACCAGACTTAATCGAGCTTTTATGCGAATATGGGGCTGATATAAATGCAAAAACTGGAGCTGGGGAAAGCCCTTTAGAATTAACTGAAGATGAACCAACCCAACAAGTAATTAGAACAATCGCTCAGACAGAAGCAAGGAGACGGCGTGGTCCAGGTGGTGGTTACTTTGGTGTTCGTGATTCTCGACGACAAAGCCGAAAAAGAAAAAAGTTTGAATCTCCTCAACAACCACCTTCAACATTAGAAAATCCTTTCTCAGCTAGAGGTGCAATTAGACGACAATCATTGCGAGATCGTAGTGGAATGTCATTAGCTCGTTTGGAAGCACAAAGAGAGGGTTCTGACCTTATTAGAAGTTATAATAGTAAAGAAGACCTTTCTTCTAATACAGCGGATGATTCTTTAAATGTTGGAAGTTCTTCATATCTCAACAATCCAACAGCCTCGGCTAGTGCTTCCTCTTCAGCATTACACGGAACTCCACATCAACAACAACGTCGTGAATCTCCACCTAAACGTGCATTAATGGCTAGAAGTGCTTCTCATCAAAAACAAAAACAACAAATGTCTCCAGATGAATGGCTGAAAAAATTAGAAGCAGATTCTGCAGGTTTTCGAGATAATGATGGAGAAGATGGTGAATTACAATCTGAACTTAAAGGAGGACAAAGAATGAAGAGTGGTGGTGGTGGAGGAGCGAGAGGTCAGCAAGAAATGAATGGTGGTCCAACAGCAACATTTGGTGGAGCTTCAAAACAACAATTAGCAATGGGCTCTGGACCCAATAGACGGCGCAAACAAGGATGTTGCTCTGTTTTGTGA"

  aa = @gff.assembleAA(@contigsequence, container.start, records)
  aa.should == "MDHHALVEELPEIEKLTPQERIALARERRAEQLRQNAAREAQLPMPAQRRPRLRFTPDVALLEATCAIDNNERIVRLLLRYGACVNAKDTELWTPLHAAACCAYIDIVRLLIAHNADLLAVNADGNMPYDICDDEQTLDLIESEMAARGITQEMIDERRQQPEREMLNDMKILHQRGLPLDQRNSVDKSTFVSFSGEREIYLHIAAANGYYDVAAFLLRCNVSPALRDIDLWQPIHAAASWNQPDLIELLCEYGADINAKTGAGESPLELTEDEPTQQVIRTIAQTEARRRRGPGGGYFGVRDSRRQSRKRKKFESPQQPPSTLENPFSARGAIRRQSLRDRSGMSLARLEAQREGSDLIRSYNSKEDLSSNTADDSLNVGSSSYLNNPTASASASSSALHGTPHQQQRRESPPKRALMARSASHQKQKQQMSPDEWLKKLEADSAGFRDNDGEDGELQSELKGGQRMKSGGGGGARGQQEMNGGPTATFGGASKQQLAMGSGPNRRRKQGCCSVL*"
  # >EMBOSS_001_1
  # MDHHALVEELPEIEKLTPQERIALARERRAEQLRQNAAREAQLPMPAQRRPRLRFTPDVA
  # LLEATCAIDNNERIVRLLLRYGACVNAKDTELWTPLHAAACCAYIDIVRLLIAHNADLLA
  # VNADGNMPYDICDDEQTLDLIESEMAARGITQEMIDERRQQPEREMLNDMKILHQRGLPL
  # DQRNSVDKSTFVSFSGEREIYLHIAAANGYYDVAAFLLRCNVSPALRDIDLWQPIHAA...
end
246
+ # The two examples below have a description but no block; presumably they are
+ # placeholders for checks that still have to be written - TODO confirm.
+ it "should assemble exons into"
246
+ it "should assemble the gene into"
248
+ # >MhA1_Contig1133:27463..29904
249
+ # tcacaaaacagagcaacatccttgtttgcgccgtctattgggtccagagcccattgctaa
250
+ # ttgttgttttgaagctccaccaaatgttgctgttggaccaccattcatttctattaaacg
251
+ # aaaaataaatataaaattcaaaaaaaatattttaattcaccttgctgacctctcgctcct
252
+ # ccaccaccaccactcttcattctttgtcctcctttaagttcagattgtaattcaccatct
253
+ # tctccatcattatctcgaaaacctgcagaatctgcttctaattttttcagccattcatct
254
+ # ggagacatttgttgtttttgtttttgatgagaagcacttctagccattaatgcacgttta
255
+ # ggtggagattcacgacgttgttgttgatgtggagttccgtgtaatgctgaagaggaagca
256
+ # ctagccgaggctgttggattgttgagatatgaagaacttccaacatttaaagaatcatcc
257
+ # taaaaattaagaaaaaattacaattttaaaaaacaaaccgctgtattagaagaaaggtct
258
+ # tctttactattataacttctaataaggtcagaaccctctctttgtgcttccaaacgagct
259
+ # aatgacattccactacgatctcgcaatgattgtcgtctaattgcacctctagctgagaaa
260
+ # ggattttctaatgttgaaggtggttgttgaggagattcaaacttttttcttctagataaa
261
+ # aagatgaaaagaaaatataaatttaaaattactttcggctttgtcgtcgagaatcacgaa
262
+ # caccaaagtaaccaccacctggaccacgccgtctccttgcttctgtctgagcgattgttc
263
+ # taattacttgttgggttggttcatcttcagttaattcttaatttaaaaaaatatatagac
264
+ # agaaaatagttaggaaggcgataagttaaaagtatgtcaaaattctagcctttcttaaaa
265
+ # tagccatgtaaaaataaaaaagcctcccgacgtggtcacatacatattttgagaaaaatt
266
+ # aaaggtcttaagaaaatgggtgtgaaccgaaaaatcctgatattgagaaatttaattcct
267
+ # tattgttagttcctctcccctcatacttaatagcaaatatttataaattataagattcaa
268
+ # taaaataaacctaaagggctttccccagctccagtttttgcatttatatcagccccatat
269
+ # tcgcataaaagctcgattaagtctggttgattccaagaagcagctgcatgaattggttgc
270
+ # cacaaatctatatctctcaatgctggagaaacattacaacgaagaaggaaagcagcaaca
271
+ # tcataataaccattagctgctgctatatgtaactaaaaaggaataaaactgtaataatcc
272
+ # tttcataataatctttacataaatttccctttctccagaaaaacttacaaaagtagattt
273
+ # atcaacagaatttctttgatctaaaggtaatcctctttgatgtaaaattttcatatcatt
274
+ # taacatttccctttctggttgttgtcttctttcatcaatcatttcttgtgtaattcctct
275
+ # agcagccatttcagattcaataaggtcaagggtttgttcatcatcacaaatatcataagg
276
+ # catattaccatctgcatttactgctagtaaatctgcgttgcttaaaaagaaatttttaat
277
+ # atgaaagaagttttcgataaaatatgacgcaaaaataaaaatcgatgacctaaaaactgg
278
+ # atgtttacccattaaactaaaccacaaaacttacgaacttttagaagacgaattaattat
279
+ # taaatgaaaatacgtaaagattaaaaatacaaaaaatttaaaaggaagtcccgtttgttt
280
+ # aaaataaatatttctgtaaaacttaagaaatttatttaaaaaaaaaattaaaaaaaaact
281
+ # aactgtgcaataagcaatcgaacaatatcaatataagcacaacatgcagctgcgtgcaat
282
+ # ggtgtccaaagttcagtgtctttggcattaacacaagctccgtacctaagcagaagacga
283
+ # acaattctttcattattgtcaatggcacactaaaaataaatttaaaaaaattaatttggt
284
+ # ttttgcctgatgtaaaggtgttaatccatcctcattatgtgaatcagcattgacaccttc
285
+ # cattaaaagtctttcaactaaaaaataaataaattcatttaaaatcaataatctttatgt
286
+ # ataaccttcttgggtatcacccctaactgttgcctcaagtaaagcaacatctggtgtaaa
287
+ # tcgaagacgaggccggcgctgtgcaggcattggcaattgagcctcccgtgcagcattctg
288
+ # tcgaagttgttcagcacggcgttctctagctaatgcaatacgttcttgaggagttaattt
289
+ # ttcaatttctggtaattcctccaccaatgcatgatggtccat
290
+ end
291
+