bio-gff3 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,268 @@
1
+ # LRU based Hash by Robert Klemme
2
+ #
3
+ # Copied from http://github.com/rklemme/muppet-laboratories
4
+ #
5
+ # License: unclear, not in repository, need to ask Permission
6
+ # Added by Pjotr Prins (pjotr.prins@thebird.nl)
7
+
8
+ require 'enumerator'
9
+
10
# Hash with LRU expiry policy. There are at most max_size elements in a
# LRUHash. When adding more elements old elements are removed according
# to LRU policy.
#
# Entries live in two structures that are kept in sync:
# * @h maps each key to its Node, and
# * a doubly linked list between the sentinels @head and @tail that holds
#   the nodes ordered from most recently used (right after @head) to least
#   recently used (right before @tail).
#
# Optional hooks:
# * default / default_proc - consulted by #[] when the key is missing;
#   default_proc is called with (hash, key).
# * release_proc - called with (key, value) every time an entry is removed
#   (explicit delete, eviction, clear or shrinking max_size).
class LRUHash
  include Enumerable

  attr_reader :max_size
  attr_accessor :default, :default_proc, :release_proc

  # max_size::      upper bound on the number of entries; converted with
  #                 to_i and must be greater than zero (ArgumentError otherwise)
  # default_value:: value returned by #[] for missing keys when no block is given
  # block::         optional default proc, called with (hash, key) on a miss
  def initialize(max_size, default_value = nil, &block)
    @max_size = normalize_max(max_size)
    @default = default_value
    @default_proc = block

    @h = {}
    # two sentinel nodes; an empty list is simply @head <-> @tail
    @head = Node.new
    @tail = front(Node.new)
  end

  # Yields each [key, value] pair, most recently used first.
  # Returns an enumerator when no block is given. Does not change recency.
  def each_pair
    return enum_for(:each_pair) unless block_given?

    each_node { |n| yield [n.key, n.value] }
  end

  alias each each_pair

  # Yields each key, most recently used first.
  # Returns an enumerator when no block is given.
  def each_key
    return enum_for(:each_key) unless block_given?

    each_node { |n| yield n.key }
  end

  # Yields each value, most recently used first.
  # Returns an enumerator when no block is given.
  def each_value
    return enum_for(:each_value) unless block_given?

    each_node { |n| yield n.value }
  end

  # Number of entries currently stored.
  def size
    @h.size
  end

  # True when the linked list holds nothing but the two sentinels.
  def empty?
    @head.succ.equal? @tail
  end

  # Returns the value stored under key and marks the entry as most recently
  # used. On a miss the given block is called with the key and its result
  # returned; without a block KeyError is raised (see FETCH).
  def fetch(key, &b)
    n = @h[key]
    return front(n).value if n

    (b || FETCH)[key]
  end

  # Like #fetch, but a miss falls back to default_proc (called with
  # (self, key)) or, if that is not set, to default. A miss stores nothing.
  def [](key)
    fetch(key) { |k| @default_proc ? @default_proc[self, k] : default }
  end

  # Keys in the order of the backing Hash (not recency order).
  def keys
    @h.keys
  end

  # Values in the order of the backing Hash (not recency order).
  def values
    @h.map { |_key, n| n.value }
  end

  # True if key is present. Does not change recency.
  def has_key?(key)
    @h.has_key? key
  end

  alias key? has_key?
  alias member? has_key?
  alias include? has_key?

  # True if some stored value is eql? to value. Does not change recency.
  def has_value?(value)
    each_node { |n| return true if value.eql? n.value }

    false
  end

  alias value? has_value?

  # Values for the given keys, looked up through #[] (so defaults apply and
  # found entries become most recently used).
  def values_at(*key_list)
    key_list.map { |k| self[k] }
  end

  # Returns [key, value] for key and marks the entry as most recently used,
  # or nil when the key is absent.
  def assoc(key)
    n = @h[key]
    return unless n

    front(n)
    [n.key, n.value]
  end

  # Returns [key, value] of the first entry (in recency order) whose value
  # is eql? to value and marks it as most recently used; nil if none matches.
  def rassoc(value)
    each_node do |n|
      if value.eql? n.value
        front(n)
        return [n.key, n.value]
      end
    end
    nil
  end

  # Key of the first entry whose value is eql? to value, or nil.
  # Implemented via #rassoc, so a hit also becomes most recently used.
  def key(value)
    pair = rassoc(value)
    pair && pair.first
  end

  # Stores value under key, makes the entry the most recently used one and
  # returns value. If the key is new and the hash is full, the least
  # recently used entry is evicted first (release_proc fires for it).
  def store(key, value)
    # same optimization as in Hash: unfrozen String keys are copied and frozen
    key = key.dup.freeze if String === key && !key.frozen?

    n = @h[key]

    unless n
      if size == max_size
        # reuse the evicted node to optimize memory usage
        n = delete_oldest
        n.key = key
        n.value = value
      else
        n = Node.new key, value
      end

      @h[key] = n
    end

    front(n).value = value
  end

  alias []= store

  # Removes the entry for key and returns its value (release_proc fires);
  # returns nil when the key is absent.
  def delete(key)
    n = @h[key]
    n && remove_node(n).value
  end

  # Removes every entry for which the block, called with (key, value),
  # returns a true value; release_proc fires for each removed entry.
  # Returns self, or an enumerator when no block is given (matching the
  # each_* iterators of this class).
  def delete_if
    return enum_for(:delete_if) unless block_given?

    each_node { |n| remove_node(n) if yield(n.key, n.value) }
  end

  # Changes the size limit, evicting least recently used entries until the
  # hash fits. Raises ArgumentError for limits that are not greater than zero.
  def max_size=(limit)
    limit = normalize_max(limit)

    delete_oldest while size > limit

    @max_size = limit
  end

  # Removes all entries, oldest first, firing release_proc for each.
  # Returns self.
  def clear
    delete_oldest until empty?

    self
  end

  # Renders the entries in recency order as "{k=>v, k2=>v2}" using to_s on
  # keys and values; "{}" when empty. Builds a fresh string so it also works
  # when string literals are frozen.
  def to_s
    pairs = map { |k, v| "#{k}=>#{v}" }
    "{#{pairs.join(', ')}}"
  end

  alias inspect to_s

  # Fallback used by #fetch when the key is missing and no block was given.
  FETCH = Proc.new { |_k| raise KeyError, 'key not found' }

  # A single node in the doubly linked LRU list of nodes
  Node = Struct.new :key, :value, :pred, :succ do
    # Detaches this node from its neighbours (if any) and returns self.
    def unlink
      pred.succ = succ if pred
      succ.pred = pred if succ
      self.succ = self.pred = nil
      self
    end

    # Moves this node so that it directly follows node; returns self.
    # A no-op when it already is node's successor.
    def insert_after(node)
      raise 'Cannot insert after self' if equal? node
      return self if node.succ.equal? self

      unlink

      self.succ = node.succ
      self.pred = node

      node.succ.pred = self if node.succ
      node.succ = self

      self
    end
  end

  private

  # iterate nodes from most to least recently used; returns self
  def each_node
    n = @head.succ

    until n.equal? @tail
      # remember the successor first so the block may unlink or move n
      succ = n.succ
      yield n
      n = succ
    end

    self
  end

  # move node to front (most recently used position); returns the node
  def front(node)
    node.insert_after(@head)
  end

  # remove the node from both the hash and the list and invoke
  # release_proc with (key, value) if set; returns the removed node
  def remove_node(node)
    n = @h.delete(node.key)
    n.unlink
    release_proc[n.key, n.value] if release_proc
    n
  end

  # remove the oldest (least recently used) node returning the node
  def delete_oldest
    n = @tail.pred
    raise "Cannot delete from empty hash" if @head.equal? n
    remove_node n
  end

  # Normalize the argument in order to be usable as max_size:
  # n.to_i must be an Integer larger than zero, otherwise
  # ArgumentError is raised.
  def normalize_max(n)
    n = n.to_i
    raise ArgumentError, 'Invalid max_size: %p' % n unless Integer === n && n > 0
    n
  end
end
@@ -0,0 +1,73 @@
1
+ # RSpec for BioRuby-GFF3-Plugin. Run with something like:
2
+ #
3
+ # ruby -I ../bioruby/lib/ ~/.gems/bin/spec spec/gff3_assemble2_spec.rb
4
+ #
5
+ # Copyright (C) 2010 Pjotr Prins <pjotr.prins@thebird.nl>
6
+ #
7
+ $: << "../lib"
8
+
9
+ require 'bio/db/gff/gffdb'
10
+
11
+ include Bio::GFFbrowser
12
+
13
+ FASTAFILE2="test/data/gff/MhA1_Contig125.fa"
14
+ GFF3FILE2="test/data/gff/MhA1_Contig125.gff3"
15
+
16
+ PROTEINS = {
17
+ "cds:MhA1_Contig125.frz3.gene2" =>
18
+ "MNDLVNQFKSAALAVGQYLTPVLRESKFKETGVLTPEEFVAAGDHLVHLCPTWSWAKASDSNGQTTFLITKQSALVTQRCAQIMGYDEILKEKIIKDESAETGDEQNEWVDTHHFDFETNCAPKDFEEEENKVEDIKENNLNEEENCEEEEEGEPIDLDEYLSSGLLEEEDPARFVLQNKSLKETKDDSTSNNLLRTRRYDLHITYDKYYQVPRFWLVGYDENGSPLAVDKMKEDFSQEHADKTITLESHPHISGLTLATIHPCRHAPVMKRLIEQFQESGKELLVIDYLFVFLKFVQAVIPTVEYDYTRSIHF*",
19
+ "cds:MhA1_Contig125.frz3.gene3" =>
20
+ "MERRKVSNTDPFEAAEGMLRWNSDIIKDKEIKQFKGLKKPLKLSENQNDEYDVDPFEAVTDWLPLNKNVDKT*",
21
+ "cds:MhA1_Contig125.frz3.gene4" =>
22
+ "MKSTKMSATEIVSYHLYSLHTLSSFCLTENPENIFIKDQNFQDFFLFCERVREQFNEAEELKTPLNTKISQTDSTNIQNKKDEPSISIGPCVNDLCPKGFECIENICFKSMEMPKTERVLSIGPCVNAKCPEGFSCYEDDRQCYAN*",
23
+ "cds:MhA1_Contig125.frz3.gene5" =>
24
+ "MRLDIFLVIAFSLGVAVNCGVVKRSQNSYGDEAGAGGAAGAAPAAAAAPAGGEAAAPSGGETGGAAAPAEAAPAAAPAPEAAPAAAPAPEQAAPAVAQPAPAAAPSQASGYRKKRSQNSYGDEAGHAAGAAPAEAAAPAAGGEAAAPSGGETGGAAAPAETAPAAAPEAAPAAAPAPEQAAPAVAQPAPAAAPSQASGYRKKRSQNSYGDEAGAGGAAGAAPAAAAAPAGGEAAAPAAGGCTEGCAAGGESAAPAAAPAPEAAPAAAPAPEQAAPAVAQPAPAAAPSQASGYRKKRSQNSYGDEAGHAAGAAPAESAAPAAGGEAAAPSGGETGGAAAPAETAPAAAPEAAPAAAPEQAAPAVAQPAPAAAPSQASGY*",
25
+ "cds:MhA1_Contig125.frz3.gene6" =>
26
+ "MDNAENEEKNDKENHEKPIKFEINNQKQFFKKEDEVKECEEESKNVNEFEEDQGTNEVSEVLRLFRRGNVWAFALQNLDLMRAYVILSCLAIAVVMLANFLRNSRFFDFCLK*",
27
+ "cds:MhA1_Contig125.frz3.gene7" =>
28
+ "MLFCLLHPMDHNTGPLARKSSTLCSLLLLSIAALLVLAVPGQANSEEVGFGNHTKEKDGDEVTVNIDSVQAPDDLTYAVYEKRFKDVCEFVITKDDIELLYKGKGCTVELLTGENQDITFKTGVKDIGCVRNDCDKASLYSSVGEVEPGLSQSVTDGKTEFELRISGSEFNMNFEEDAPFNPQKNRCAPKQDHIVKPETWRIKNGELKDKHLLVFHLLPKTATREYTKEGKISKEQPPEEAPKCKLFIRFKRPYYEFLYVGPITTTVTTTTTTTTTPSSGLVGQGPTPKTGTHQGNTPKVQGKGSEKESDNTMMIVIIVIVVVVVVLVIGVVLIFILKNKGSKEDELQKVKQTTTKANKSSAVTL*",
29
+ "cds:MhA1_Contig125.frz3.gene8" =>
30
+ "MRQRESVILNKTENQTQIFEKLLNLYNSPKDVVNLRNNPEQLIQLGIDSKQFSAILEMMFGARRRNSLRGDYREARRFRNRREYSAWWDAGEVNNWRINSRHPSRHGTVEYWRCAFAVGRFFTCPSRIRITFGFGDRYVIVANARNHPHNHNRQNNAGDNNPNTVRRALPMEANERLTARTVHIGPRPSTSAPNQPTTKGQAAPPRASVSTTSANAAATPTTSASSTVQKGTAAPSTSAAPSTSAAPSTSAASRPLKPPGFATAATSATNSQQAAAKPASNQQPAPTATTSQTSASAPGTSSKPPPTTSPAPAATPAPATSQPGTSTVKSAPASTPTPLKPPAATEKQTSQPPSAAPGTTATIKPVLVTNIPGLPPGIPTSATGSGTINVSLTALDAFLAGHPRPASTSAPASSQTTPGPASQPSSAPVTQNKGKEEKKEDKKEEKKEGKKEEKKEGKKEEKRGPG*",
31
+ "cds:MhA1_Contig125.frz3.gene9" =>
32
+ "MHGATIGNRLRATRRSRDAQMMAAAESVARLSRRHSHQKAIRRVLPPPPALNSSRDSQPINPFCSDPSSIQPVIAKGVCVRSVGVFKSALPPSTPFPSTSTAPNIPSDNTFVPHLNNSTPLHNNHHRTLGGSENCLNYQQQYIGGSYSARSQQQHPPPPAPSSCCISPFKPLEILGNSNGTTDSSSGGCNSARAAMHRQFTGSSNGEEEFTVEELQEFAQAFKMFDKDGNGTMSIKELGVAMRTLGLNPTEDELLNMVNEYDVDGNGIDFSEFCKMMKEMNKETDQELIRLAFRVFDKDGNGYITAQEFRHFMTIDYEEFVNAVAPIVNDGAKEDAPFFEKEQPTSFGQPITSGPPLASGKAKHF*",
33
+ "cds:MhA1_Contig125.frz3.gene21" =>
34
+ "MDVKPPPSAPQDIKEAIKESNMSTWRPFLIGNRMRTTSEDSAESFDAYDKSFDAYDVGNKKERRLSITEQFFGSSMPGRLRSNSTTEYEGHEHEPTFKKVDLKQFMKHQRKILGDDEWQ*",
35
+ "cds:MhA1_Contig125.frz3.gene22" =>
36
+ "MAKALISGFVSSGFISKSNISICTRSEATAKSWRLQGFTSAYSKDVFYSEVKKPRAIILIAVKPQIFPSFINEVKANEWFYFGVPGILCISIMSGISLQHFDKEMKSVGFDGHSMRLMPNVNCAVSTGTLVLSADPETPQELVTLVSVLSSYVGKCIRVDEAHFNAASSISGCGPAFIALVIEALADGGVVAGLSRELANQLAADTVKGTGHLFMTKMASVSPTSDNPSPAQLKDQVCSPAGTTIEGVRELEKHGVRSAFIEAIQASTRRAFELSQ*" }
37
+
38
describe GFFdb, "Assemble CDS (Contig125)" do
  # Parse the annotation once for the whole group and index the CDS record
  # lists and their container components by CDS id.
  before :all do
    # Variant with caching disabled, kept for reference:
    # gffdb = Bio::GFFbrowser::GFFdb.new(GFF3FILE2, :fasta_filename => FASTAFILE2,:cache_components => :cache_none, :cache_records => :cache_none)
    @gff = Bio::GFFbrowser::GFFdb.new(GFF3FILE2, :fasta_filename => FASTAFILE2).assembler
    @gff.parse
    @contigsequence = @gff.sequencelist["MhA1_Contig125"]
    @componentlist = {}
    @cdslist = {}
    @gff.each_CDS do |cds_id, records, container|
      @componentlist[cds_id] = container
      @cdslist[cds_id] = records
    end
  end

  it "should have the single contig" do
    contig_id = "MhA1_Contig125"
    @gff.sequencelist.size.should == 1
    @gff.sequencelist[contig_id].should_not == nil
    @gff.sequencelist[contig_id].size.should == 53702
  end

  # One example per entry of PROTEINS: the assembled, translated CDS must
  # equal the expected amino acid sequence.
  PROTEINS.each do |gene_id, expected_aa|
    it "should translate gene #{gene_id}" do
      records = @cdslist[gene_id]
      container = @componentlist[gene_id]
      first_cds = records[0]
      first_cds.seqname.should == 'MhA1_Contig125'
      # ntseq = @gff.assemble(@contigsequence,component.start,recs,:codonize=>true)
      # p [name, ntseq]
      translated = @gff.assembleAA(@contigsequence, container.start, records)
      translated.should == expected_aa
    end
  end
end
71
+
72
+
73
+
@@ -0,0 +1,62 @@
1
+ # RSpec for BioRuby-GFF3-Plugin. Run with something like:
2
+ #
3
+ # ruby -I ../bioruby/lib/ ~/.gems/bin/spec spec/gff3_assemble3_spec.rb
4
+ #
5
+ # Copyright (C) 2010 Pjotr Prins <pjotr.prins@thebird.nl>
6
+ #
7
+ $: << "../lib"
8
+
9
+ require 'bio/db/gff/gffdb'
10
+
11
+ include Bio::GFFbrowser
12
+
13
+ GFF3FILE3="test/data/gff/test-cds.gff3"
14
+
15
+ describe GFFdb, "Assemble CDS (extra checks)" do
16
# Build the assembler for the GFF3 file once and parse it before any example runs.
before :all do
  @gff = Bio::GFFbrowser::GFFdb.new(GFF3FILE3).assembler
  @gff.parse
end
21
+
22
# Assembles one CDS of MhA1_Contig1040 three ways (raw, codonized,
# translated) and compares each result with the expected sequence.
# NOTE(review): the example title says "gene29" while the id looked up below
# ends in ".gene" - confirm which one matches test/data/gff/test-cds.gff3.
it "should translate gene MhA1_Contig1040.frz3.gene29" do
  @contigsequence = @gff.sequencelist["MhA1_Contig1040"]
  @componentlist = {}
  @cdslist = {}
  # index the CDS record lists and their container components by CDS id
  @gff.each_CDS do | id, reclist, component |
    @componentlist[id] = component
    @cdslist[id] = reclist
  end
  name = "cds:MhA1_Contig1040.frz3.gene"
  recs = @cdslist[name]
  component = @componentlist[name]
  # (leftover debug output of recs removed so the spec run stays quiet)
  ntseq = @gff.assemble(@contigsequence,component.start,recs,:raw=>true)
  ntseq.should == "TTAATTAATTTGCCTAGAAAAACAAAGGCATAACATGCTTGCAGTCATCATACGGTAAGAGAGAAACCAACGATATGTTAATAATGTTGATGGGGGAATATCCTCATTAGAATTCTTTTTTGGGTGAATTGAAATTGCCATATTATTAGTATTATTAGAAAATATTAAATTTGTTGATAA"
  ntseq = @gff.assemble(@contigsequence,component.start,recs,:codonize=>true)
  ntseq.should == "TTATCAACAAATTTAATATTTTCTAATAATACTAATAATATGGCAATTTCAATTCACCCAAAAAAGAATTCTAATGAGGATATTCCCCCATCAACATTATTAACATATCGTTGGTTTCTCTCTTACCGTATGATGACTGCAAGCATGTTATGCCTTTGTTTTTCTAGGCAAATTAATTAA"
  aaseq = @gff.assembleAA(@contigsequence,component.start,recs)
  aaseq.should == "LSTNLIFSNNTNNMAISIHPKKNSNEDIPPSTLLTYRWFLSYRMMTASMLCLCFSRQIN*"
end
41
+ it "should translate gene MhA1_Contig2992.frz3.gene1" do
42
+ @contigsequence = @gff.sequencelist["MhA1_Contig2992"]
43
+ @componentlist = {}
44
+ @cdslist = {}
45
+ @gff.each_CDS do | id, reclist, component |
46
+ @componentlist[id] = component
47
+ @cdslist[id] = reclist
48
+ end
49
+ name = "cds:MhA1_Contig2992.frz3.gene1"
50
+ recs = @cdslist[name]
51
+ component = @componentlist[name]
52
+ # ntseq = @gff.assemble(@contigsequence,component.start,recs,:raw=>true)
53
+ # ntseq.should == ""
54
+ ntseq = @gff.assemble(@contigsequence,component.start,recs,:codonize=>true)
55
+ ntseq.should == "AAAATTAATAAAAAAATAAATGATAATTCTTTTAATATTCAATCTGATTCGAATGAAAATTTGTTTAATGATGGAATTAATTCTGAACAAAATGAAGACAATATAGCAACAAAAAAAGGCAACAAAAAATTCGGTAAAAATCAAAAAGAAGGAAATAAAGAGTTGGATATTCAAAGTGAAGGTTTTGATAATAATGAAATACCTTCAAAAGAAAGCAAAAAACAAATAAGTAATTTTGGGGATAATGAAAGTGAATATGAAAAAGAAGAGGATAATAGAAAAAAGAAAGGGAAAAAAGGAATGATAGAAAAGTATGAATTAGGAAGGAATAAAGGAAGGGATAAAAATGAAAGAAATAAGGCTTCTGAAAGGTTTGATGAGCAGAATCAAGACAGAAATAATCAACGTGATAGTTTTGATTCTGGCAATAATGATAAATCACAAAGAGGCTTAGATAGCGGCACATTAGATGGAACAAATAATTTAAAAAGATCGAATGATGATCAATTACCAGAATTTTTGAAAACGGCCAGTCTCTCAGAGCGTCAGAAATTTCTTCAACTTGAAGCAGAAAATGACAGGTCCAAGTCTTCTATACGAAGAGATAAACAGAATTGGGCTGATCAACAAGGGCAGAGAATTTCTGATCTTTATAAACAATTTCAACAATCTTTACAACAAAAAGAAAAACAATTTAAAAGTGAACGTCAACGAAATGTTCAAATTAAATTAAGCAGAAATGCACAGAATGTTGATAAAAGAATTCAGGATCTTCTGAATAATCCTGATATTGCTGAAAGAGCTTTAATTCTTCAAATTGAACAAATCCTCGGCGGTACAGACGATAGTATTCGTCAGGAATTACAAAGACAAATATCTGTTATTGGACCATTAGATGGAAATATACCGCCAAATCTTACATAG"
56
+ aaseq = @gff.assembleAA(@contigsequence,component.start,recs)
57
+ aaseq.should == "KINKKINDNSFNIQSDSNENLFNDGINSEQNEDNIATKKGNKKFGKNQKEGNKELDIQSEGFDNNEIPSKESKKQISNFGDNESEYEKEEDNRKKKGKKGMIEKYELGRNKGRDKNERNKASERFDEQNQDRNNQRDSFDSGNNDKSQRGLDSGTLDGTNNLKRSNDDQLPEFLKTASLSERQKFLQLEAENDRSKSSIRRDKQNWADQQGQRISDLYKQFQQSLQQKEKQFKSERQRNVQIKLSRNAQNVDKRIQDLLNNPDIAERALILQIEQILGGTDDSIRQELQRQISVIGPLDGNIPPNLT*"
58
+ end
59
+ end
60
+
61
+
62
+
@@ -0,0 +1,291 @@
1
+ # RSpec for BioRuby-GFF3-Plugin. Run with something like:
2
+ #
3
+ # ruby -I ../bioruby/lib/ ~/.gems/bin/spec spec/gff3_assemble_spec.rb
4
+ #
5
+ # Copyright (C) 2010 Pjotr Prins <pjotr.prins@thebird.nl>
6
+ #
7
+ $: << "../lib"
8
+
9
+ require 'bio/db/gff/gffdb'
10
+
11
+ include Bio::GFFbrowser
12
+
13
+ FASTAFILE="test/data/gff/MhA1_Contig1133.fa"
14
+ GFF3FILE="test/data/gff/MhA1_Contig1133.gff3"
15
+
16
+ describe GFFdb, "Assemble CDS" do
17
# Parse the GFF3 annotation together with its FASTA sequence once for the
# whole group and index the CDS record lists and their container
# components by CDS id.
before :all do
  @gff = Bio::GFFbrowser::GFFdb.new(GFF3FILE, :fasta_filename => FASTAFILE).assembler
  @gff.parse
  @contigsequence = @gff.sequencelist["MhA1_Contig1133"]
  @componentlist = {}
  @cdslist = {}
  @gff.each_CDS do |cds_id, records, container|
    @componentlist[cds_id] = container
    @cdslist[cds_id] = records
  end
end
29
+
30
# The parsed sequence list must hold exactly the one expected contig.
it "should have the single contig" do
  contig_id = "MhA1_Contig1133"
  @gff.sequencelist.size.should == 1
  @gff.sequencelist[contig_id].should_not == nil
  @gff.sequencelist[contig_id].size.should == 33905
end
35
# The container component recorded for gene4 must span 7838..8740.
it "should have a container component" do
  container = @componentlist['cds:MhA1_Contig1133.frz3.gene4']
  container.start.should == 7838
  container.end.should == 8740
end
40
# Checks coordinates, frame and sequence name of the first CDS record of gene4.
it "should have CDS 7838:7980" do
  first_cds = @cdslist['cds:MhA1_Contig1133.frz3.gene4'][0]
  first_cds.start.should == 7838
  first_cds.end.should == 7980
  first_cds.frame.should == 0
  first_cds.seqname.should == 'MhA1_Contig1133'
end
48
# Checks coordinates, frame, strand and sequence name of the second CDS
# record of gene4.
it "should have CDS 8065:8308" do
  second_cds = @cdslist['cds:MhA1_Contig1133.frz3.gene4'][1]
  second_cds.start.should == 8065
  second_cds.end.should == 8308
  second_cds.frame.should == 1
  second_cds.strand.should == '+'
  second_cds.seqname.should == 'MhA1_Contig1133'
end
57
+ # From Wormbase website http://www.wormbase.org/db/gb2/gbrowse/m_hapla/?name=MhA1_Contig1133%3A7838..8740
58
+ # >MhA1_Contig1133:7838..8740
59
+ # atgcgtcctttaacagatgaagaaactgaaaagtttttcaaaaaactttcaaattatatt
60
+ # ggtgacaatattaaacttttattggaaagagaagatggagaatatgtttttcgtttacat
61
+ # aaagacagagtttattattgcaggtttttttaaaattattttatatttaaattaggtctc
62
+ # aatctttataggggattttgtttttgttatttttttttggtttttag>tgaaaaattaatg
63
+ # cgacaagcagcatgtattggacgtaaacaattgggatcttttggaacttgtttgggtaaa
64
+ # ttcacaaaaggagggtctttctttcttcatataacatcattggattatttggcaccttat
65
+ # gctttagcaaaaatttggttaaaaccacaagctgaacaacaatttttatatggaaataat
66
+ # attgttaaatctggtgttggaagaatgagtgaagggattgaagaaaaacaagtaaatatt
67
+ # taattattttttttaaaatggattcctttacttctcaattaaatattaaaagcatatctg
68
+ # tagaagaggttatttatctttaaatcgaaatatacaggaataaataaaaatttaagaaat
69
+ # cataatttagaattctttttctggttatgttagattatttttaaatttttttgtaatttt
70
+ # tttttcgtaatttttttatgagcaaatcccttctctcttaaatattttaataaaaatcta
71
+ # attttataaattataattattttttagggtattattatttataatatgtcagatttacca
72
+ # ttgggttttggagtggctgcaaagggaacattatcttgtagaaaagtagatcctacagct
73
+ # ttagttgttttacatcaatcagatttgggtgaatatattcgaaatgaagagggattaatt
74
+
75
+ it "should translate CDS 7838:7980 (in frame 0, + strand)" do
76
+ recs = @cdslist['cds:MhA1_Contig1133.frz3.gene4']
77
+ component = @componentlist['cds:MhA1_Contig1133.frz3.gene4']
78
+ cds0 = recs[0]
79
+ cds0.seqname.should == 'MhA1_Contig1133'
80
+ seq = @gff.assemble(@contigsequence,component.start,[cds0])
81
+ seq.size.should == 143
82
+ seq.should == "ATGCGTCCTTTAACAGATGAAGAAACTGAAAAGTTTTTCAAAAAACTTTCAAATTATATTGGTGACAATATTAAACTTTTATTGGAAAGAGAAGATGGAGAATATGTTTTTCGTTTACATAAAGACAGAGTTTATTATTGCAG"
83
+ aaseq = @gff.assembleAA(@contigsequence,component.start,[cds0])
84
+ aaseq.should == "MRPLTDEETEKFFKKLSNYIGDNIKLLLEREDGEYVFRLHKDRVYYC"
85
+ end
86
+ it "should translate CDS 8065:8308 (in frame 1, + strand)" do
87
+ recs = @cdslist['cds:MhA1_Contig1133.frz3.gene4']
88
+ component = @componentlist['cds:MhA1_Contig1133.frz3.gene4']
89
+ cds1 = recs[1]
90
+ seq = @gff.assemble(@contigsequence,component.start,[cds1], :phase => false)
91
+ seq.size.should == 244
92
+ seq.should == "TGAAAAATTAATGCGACAAGCAGCATGTATTGGACGTAAACAATTGGGATCTTTTGGAACTTGTTTGGGTAAATTCACAAAAGGAGGGTCTTTCTTTCTTCATATAACATCATTGGATTATTTGGCACCTTATGCTTTAGCAAAAATTTGGTTAAAACCACAAGCTGAACAACAATTTTTATATGGAAATAATATTGTTAAATCTGGTGTTGGAAGAATGAGTGAAGGGATTGAAGAAAAACAA"
93
+ seq = @gff.assemble(@contigsequence,component.start,[cds1])
94
+ seq.should == "GAAAAATTAATGCGACAAGCAGCATGTATTGGACGTAAACAATTGGGATCTTTTGGAACTTGTTTGGGTAAATTCACAAAAGGAGGGTCTTTCTTTCTTCATATAACATCATTGGATTATTTGGCACCTTATGCTTTAGCAAAAATTTGGTTAAAACCACAAGCTGAACAACAATTTTTATATGGAAATAATATTGTTAAATCTGGTGTTGGAAGAATGAGTGAAGGGATTGAAGAAAAACAA"
95
+ aaseq = @gff.assembleAA(@contigsequence,component.start,[cds1])
96
+ # note it should handle the frame shift and direction!
97
+ aaseq.should == "EKLMRQAACIGRKQLGSFGTCLGKFTKGGSFFLHITSLDYLAPYALAKIWLKPQAEQQFLYGNNIVKSGVGRMSEGIEEKQ"
98
+ end
99
+ it "should translate CDS3 (in frame 0, + strand)" do
100
+ recs = @cdslist['cds:MhA1_Contig1133.frz3.gene4']
101
+ component = @componentlist['cds:MhA1_Contig1133.frz3.gene4']
102
+ cds3 = recs[2]
103
+ seq = @gff.assemble(@contigsequence,component.start,[cds3], :phase => false)
104
+ seq.size.should == 156
105
+ seq.should == "GGTATTATTATTTATAATATGTCAGATTTACCATTGGGTTTTGGAGTGGCTGCAAAGGGAACATTATCTTGTAGAAAAGTAGATCCTACAGCTTTAGTTGTTTTACATCAATCAGATTTGGGTGAATATATTCGAAATGAAGAGGGATTAATTTAA"
106
+ aaseq = @gff.assembleAA(@contigsequence,component.start,[cds3])
107
+ # note it should handle the frame shift and direction!
108
+ aaseq.should == "GIIIYNMSDLPLGFGVAAKGTLSCRKVDPTALVVLHQSDLGEYIRNEEGLI*"
109
+ end
110
+ it "should assemble 3 CDSs for MhA1_Contig1133.frz3.gene4" do
111
+ recs = @cdslist['cds:MhA1_Contig1133.frz3.gene4']
112
+ component = @componentlist['cds:MhA1_Contig1133.frz3.gene4']
113
+ seq = @gff.assemble(@contigsequence,component.start,recs, :phase=>false)
114
+ seq.size.should == 543
115
+ seq.should == "ATGCGTCCTTTAACAGATGAAGAAACTGAAAAGTTTTTCAAAAAACTTTCAAATTATATTGGTGACAATATTAAACTTTTATTGGAAAGAGAAGATGGAGAATATGTTTTTCGTTTACATAAAGACAGAGTTTATTATTGCAGTGAAAAATTAATGCGACAAGCAGCATGTATTGGACGTAAACAATTGGGATCTTTTGGAACTTGTTTGGGTAAATTCACAAAAGGAGGGTCTTTCTTTCTTCATATAACATCATTGGATTATTTGGCACCTTATGCTTTAGCAAAAATTTGGTTAAAACCACAAGCTGAACAACAATTTTTATATGGAAATAATATTGTTAAATCTGGTGTTGGAAGAATGAGTGAAGGGATTGAAGAAAAACAAGGTATTATTATTTATAATATGTCAGATTTACCATTGGGTTTTGGAGTGGCTGCAAAGGGAACATTATCTTGTAGAAAAGTAGATCCTACAGCTTTAGTTGTTTTACATCAATCAGATTTGGGTGAATATATTCGAAATGAAGAGGGATTAATTTAA"
116
+ seq = @gff.assemble(@contigsequence,component.start,recs)
117
+ seq.size.should == 543
118
+ seq.should == "ATGCGTCCTTTAACAGATGAAGAAACTGAAAAGTTTTTCAAAAAACTTTCAAATTATATTGGTGACAATATTAAACTTTTATTGGAAAGAGAAGATGGAGAATATGTTTTTCGTTTACATAAAGACAGAGTTTATTATTGCAGTGAAAAATTAATGCGACAAGCAGCATGTATTGGACGTAAACAATTGGGATCTTTTGGAACTTGTTTGGGTAAATTCACAAAAGGAGGGTCTTTCTTTCTTCATATAACATCATTGGATTATTTGGCACCTTATGCTTTAGCAAAAATTTGGTTAAAACCACAAGCTGAACAACAATTTTTATATGGAAATAATATTGTTAAATCTGGTGTTGGAAGAATGAGTGAAGGGATTGAAGAAAAACAAGGTATTATTATTTATAATATGTCAGATTTACCATTGGGTTTTGGAGTGGCTGCAAAGGGAACATTATCTTGTAGAAAAGTAGATCCTACAGCTTTAGTTGTTTTACATCAATCAGATTTGGGTGAATATATTCGAAATGAAGAGGGATTAATTTAA"
119
+ aaseq = @gff.assembleAA(@contigsequence,component.start,recs)
120
+ aaseq.should == "MRPLTDEETEKFFKKLSNYIGDNIKLLLEREDGEYVFRLHKDRVYYCSEKLMRQAACIGRKQLGSFGTCLGKFTKGGSFFLHITSLDYLAPYALAKIWLKPQAEQQFLYGNNIVKSGVGRMSEGIEEKQGIIIYNMSDLPLGFGVAAKGTLSCRKVDPTALVVLHQSDLGEYIRNEEGLI*"
121
+ end
122
+ # > class=Sequence position=MhA1_Contig1133:27463..29904 (- strand); shown in frame 1!
123
+ # ATGGACCATC ATGCATTGGT GGAGGAATTA CCAGAAATTG AAAAATTAAC TCCTCAAGAA CGTATTGCAT TAGCTAGAGA
124
+ # ACGCCGTGCT GAACAACTTC GACAGAATGC TGCACGGGAG GCTCAATTGC CAATGCCTGC ACAGCGCCGG CCTCGTCTTC
125
+ # GATTTACACC AGATGTTGCT TTACTTGAGG CAACAGTTAG GGGTGATACC CAAGAAGGTT ATACATAAAG ATTATTGATT
126
+ # TTAAATGAAT TTATTTATTT TTTAGTTGAA AGACTTTTAA TGGAAGGTGT CAATGCTGAT TCACATAATG AGGATGGATT
127
+ # AACACCTTTA CATCAGGCAA AAACCAAATT AATTTTTTTA AATTTATTTT TAGTGTGCCA TTGACAATAA TGAAAGAATT
128
+ # GTTCGTCTTC TGCTTAGGTA CGGAGCTTGT GTTAATGCCA AAGACACTGA ACTTTGGACA CCATTGCACG CAGCTGCATG
129
+ # TTGTGCTTAT ATTGATATTG TTCGATTGCT TATTGCACAG TTAGTTTTTT TTTAATTTTT TTTTTAAATA AATTTCTTAA
130
+ # GTTTTACAGA AATATTTATT TTAAACAAAC GGGACTTCCT TTTAAATTTT TTGTATTTTT AATCTTTACG TATTTTCATT
131
+ # TAATAATTAA TTCGTCTTCT AAAAGTTCGT AAGTTTTGTG GTTTAGTTTA ATGGGTAAAC ATCCAGTTTT TAGGTCATCG
132
+ # ATTTTTATTT TTGCGTCATA TTTTATCGAA AACTTCTTTC ATATTAAAAA TTTCTTTTTA AGCAACGCAG ATTTACTAGC
133
+ # AGTAAATGCA GATGGTAATA TGCCTTATGA TATTTGTGAT GATGAACAAA CCCTTGACCT TATTGAATCT GAAATGGCTG
134
+ # CTAGAGGAAT TACACAAGAA ATGATTGATG AAAGAAGACA ACAACCAGAA AGGGAAATGT TAAATGATAT GAAAATTTTA
135
+ # CATCAAAGAG GATTACCTTT AGATCAAAGA AATTCTGTTG ATAAATCTAC TTTTGTAAGT TTTTCTGGAG AAAGGGAAAT
136
+ # TTATGTAAAG ATTATTATGA AAGGATTATT ACAGTTTTAT TCCTTTTTAG TTACATATAG CAGCAGCTAA TGGTTATTAT
137
+ # GATGTTGCTG CTTTCCTTCT TCGTTGTAAT GTTTCTCCAG CATTGAGAGA TATAGATTTG TGGCAACCAA TTCATGCAGC
138
+ # TGCTTCTTGG AATCAACCAG ACTTAATCGA GCTTTTATGC GAATATGGGG CTGATATAAA TGCAAAAACT GGAGCTGGGG
139
+ # AAAGCCCTTT AGGTTTATTT TATTGAATCT TATAATTTAT AAATATTTGC TATTAAGTAT GAGGGGAGAG GAACTAACAA
140
+ # TAAGGAATTA AATTTCTCAA TATCAGGATT TTTCGGTTCA CACCCATTTT CTTAAGACCT TTAATTTTTC TCAAAATATG
141
+ # TATGTGACCA CGTCGGGAGG CTTTTTTATT TTTACATGGC TATTTTAAGA AAGGCTAGAA TTTTGACATA CTTTTAACTT
142
+ # ATCGCCTTCC TAACTATTTT CTGTCTATAT ATTTTTTTAA ATTAAGAATT AACTGAAGAT GAACCAACCC AACAAGTAAT
143
+ # TAGAACAATC GCTCAGACAG AAGCAAGGAG ACGGCGTGGT CCAGGTGGTG GTTACTTTGG TGTTCGTGAT TCTCGACGAC
144
+ # AAAGCCGAAA GTAATTTTAA ATTTATATTT TCTTTTCATC TTTTTATCTA GAAGAAAAAA GTTTGAATCT CCTCAACAAC
145
+ # CACCTTCAAC ATTAGAAAAT CCTTTCTCAG CTAGAGGTGC AATTAGACGA CAATCATTGC GAGATCGTAG TGGAATGTCA
146
+ # TTAGCTCGTT TGGAAGCACA AAGAGAGGGT TCTGACCTTA TTAGAAGTTA TAATAGTAAA GAAGACCTTT CTTCTAATAC
147
+ # AGCGGTTTGT TTTTTAAAAT TGTAATTTTT TCTTAATTTT TAGGATGATT CTTTAAATGT TGGAAGTTCT TCATATCTCA
148
+ # ACAATCCAAC AGCCTCGGCT AGTGCTTCCT CTTCAGCATT ACACGGAACT CCACATCAAC AACAACGTCG TGAATCTCCA
149
+ # CCTAAACGTG CATTAATGGC TAGAAGTGCT TCTCATCAAA AACAAAAACA ACAAATGTCT CCAGATGAAT GGCTGAAAAA
150
+ # ATTAGAAGCA GATTCTGCAG GTTTTCGAGA TAATGATGGA GAAGATGGTG AATTACAATC TGAACTTAAA GGAGGACAAA
151
+ # GAATGAAGAG TGGTGGTGGT GGAGGAGCGA GAGGTCAGCA AGGTGAATTA AAATATTTTT TTTGAATTTT ATATTTATTT
152
+ # TTCGTTTAAT AGAAATGAAT GGTGGTCCAA CAGCAACATT TGGTGGAGCT TCAAAACAAC AATTAGCAAT GGGCTCTGGA
153
+ # CCCAATAGAC GGCGCAAACA AGGATGTTGC TCTGTTTTGT GA
154
+ it "should assemble a reverse CDS in MhA1_Contig1133.frz3.gene11" do
155
+ recs = @cdslist['cds:MhA1_Contig1133.frz3.gene11']
156
+ recs.size.should == 8
157
+ component = @componentlist['cds:MhA1_Contig1133.frz3.gene11']
158
+ # 193 bp from MhA1_Contig1133:27,981..28,173
159
+ # >MhA1_Contig1133:27981..28173
160
+ # cgctgtattagaagaaaggtcttctttactattataacttctaataaggtcagaaccctc
161
+ # tctttgtgcttccaaacgagctaatgacattccactacgatctcgcaatgattgtcgtct
162
+ # aattgcacctctagctgagaaaggattttctaatgttgaaggtggttgttgaggagattc
163
+ # aaacttttttctt
164
+ cds1 = recs[5]
165
+ cds1.start.should == 27981
166
+ cds1.frame.should == 1
167
+ cds1.strand.should == '-'
168
+ seq = @gff.assemble(@contigsequence,component.start,[cds1],:phase=>true,:reverse=>true)
169
+ seq.should == "TCTTTTTTCAAACTTAGAGGAGTTGTTGGTGGAAGTTGTAATCTTTTAGGAAAGAGTCGATCTCCACGTTAATCTGCTGTTAGTAACGCTCTAGCATCACCTTACAGTAATCGAGCAAACCTTCGTGTTTCTCTCCCAAGACTGGAATAATCTTCAATATTATCATTTCTTCTGGAAAGAAGATTATGTCGC"
170
+ seq.size.should == 192
171
+ seq = @gff.assemble(@contigsequence,component.start,[cds1],:phase=>true,:reverse=>true,:complement=>true)
172
+ seq.should == "AGAAAAAAGTTTGAATCTCCTCAACAACCACCTTCAACATTAGAAAATCCTTTCTCAGCTAGAGGTGCAATTAGACGACAATCATTGCGAGATCGTAGTGGAATGTCATTAGCTCGTTTGGAAGCACAAAGAGAGGGTTCTGACCTTATTAGAAGTTATAATAGTAAAGAAGACCTTTCTTCTAATACAGCG"
173
+ seq.size.should == 192
174
+ aaseq = @gff.assembleAA(@contigsequence,component.start,[cds1])
175
+ # note it should handle the frame shift and direction!
176
+ # >EMBOSS_001_4
177
+ # RKKFESPQQPPSTLENPFSARGAIRRQSLRDRSGMSLARLEAQREGSDLIRSYNSKEDLSSNTA
178
+ aaseq.should == "RKKFESPQQPPSTLENPFSARGAIRRQSLRDRSGMSLARLEAQREGSDLIRSYNSKEDLSSNTA"
179
+ end
180
+ it "should take the 6th CDS in MhA1_Contig1133.frz3.gene11 (which is 3rd on DNA)" do
181
+ # >MhA1_Contig1133:27981..28173
182
+ # cgctgtattagaagaaaggtcttctttactattataacttctaataaggtcagaaccctc
183
+ # tctttgtgcttccaaacgagctaatgacattccactacgatctcgcaatgattgtcgtct
184
+ # aattgcacctctagctgagaaaggattttctaatgttgaaggtggttgttgaggagattc
185
+ # aaacttttttctt
186
+ recs = @cdslist['cds:MhA1_Contig1133.frz3.gene11']
187
+ component = @componentlist['cds:MhA1_Contig1133.frz3.gene11']
188
+ cds2 = recs[5].clone
189
+ # p cds2
190
+ cds2.start.should == 27981
191
+ cds2.frame.should == 1
192
+ cds2.strand.should == '-'
193
+ seq = @gff.assemble(@contigsequence,component.start,[cds2],:complement=>true)
194
+ seq.should == "GCGACATAATCTTCTTTCCAGAAGAAATGATAATATTGAAGATTATTCCAGTCTTGGGAGAGAAACACGAAGGTTTGCTCGATTACTGTAAGGTGATGCTAGAGCGTTACTAACAGCAGATTAACGTGGAGATCGACTCTTTCCTAAAAGATTACAACTTCCACCAACAACTCCTCTAAGTTTGAAAAAAGAA"
195
+ aaseq = @gff.assembleAA(@contigsequence,component.start,[cds2])
196
+ # note it should handle the frame shift and direction!
197
+ # >27981..28173_4 RKKFESPQQPPSTLENPFSARGAIRRQSLRDRSGMSLARLEAQREGSDLIRSYNSKEDLSSNTA
198
+ aaseq.should == "RKKFESPQQPPSTLENPFSARGAIRRQSLRDRSGMSLARLEAQREGSDLIRSYNSKEDLSSNTA"
199
+ end
200
+ it "should assemble the 1st reverse CDS in MhA1_Contig1133.frz3.gene11" do
201
+ recs = @cdslist['cds:MhA1_Contig1133.frz3.gene11']
202
+ component = @componentlist['cds:MhA1_Contig1133.frz3.gene11']
203
+ cds1 = recs[0].clone
204
+ cds1.start.should == 29710
205
+ cds1.frame.should == 0
206
+ cds1.strand.should == '-'
207
+ seq = @gff.assemble(@contigsequence,component.start,[cds1],:raw=>true)
208
+ seq.size.should == 195
209
+ seq.should == "TGTTGCCTCAAGTAAAGCAACATCTGGTGTAAATCGAAGACGAGGCCGGCGCTGTGCAGGCATTGGCAATTGAGCCTCCCGTGCAGCATTCTGTCGAAGTTGTTCAGCACGGCGTTCTCTAGCTAATGCAATACGTTCTTGAGGAGTTAATTTTTCAATTTCTGGTAATTCCTCCACCAATGCATGATGGTCCAT"
210
+ seq = @gff.assemble(@contigsequence,component.start,[cds1],:codonize=>true)
211
+ seq.should == "ATGGACCATCATGCATTGGTGGAGGAATTACCAGAAATTGAAAAATTAACTCCTCAAGAACGTATTGCATTAGCTAGAGAACGCCGTGCTGAACAACTTCGACAGAATGCTGCACGGGAGGCTCAATTGCCAATGCCTGCACAGCGCCGGCCTCGTCTTCGATTTACACCAGATGTTGCTTTACTTGAGGCAACA"
212
+ aaseq = @gff.assembleAA(@contigsequence,component.start,[cds1])
213
+ aaseq.should == "MDHHALVEELPEIEKLTPQERIALARERRAEQLRQNAAREAQLPMPAQRRPRLRFTPDVALLEAT"
214
+ end
215
+ it "should assemble the 3rd reverse CDS in MhA1_Contig1133.frz3.gene11" do
216
+ recs = @cdslist['cds:MhA1_Contig1133.frz3.gene11']
217
+ component = @componentlist['cds:MhA1_Contig1133.frz3.gene11']
218
+ cds2 = recs[2].clone
219
+ # CAACGCAG ATTTACTAGC AGTAAATGCA GATGGTAATA TGCCTTATGA TATTTGTGAT GATGAACAAA CCCTTGACCT TATTGAATCT GAAATGGCTG CTAGAGGAAT TACACAAGAA ATGATTGATG AAAGAAGACA ACAACCAGAA AGGGAAATGT TAAATGATAT GAAAATTTTA CATCAAAGAG GATTACCTTT AGATCAAAGA AATTCTGTTG ATAAATCTAC TTTTGTAAGT TTTTCTGGAG AAAGGGAAAT TTAT
220
+ # p cds2
221
+ cds2.frame.should == 1
222
+ cds2.strand.should == '-'
223
+ seq = @gff.assemble(@contigsequence,component.start,[cds2], :raw=>true)
224
+ seq.should == "ATAAATTTCCCTTTCTCCAGAAAAACTTACAAAAGTAGATTTATCAACAGAATTTCTTTGATCTAAAGGTAATCCTCTTTGATGTAAAATTTTCATATCATTTAACATTTCCCTTTCTGGTTGTTGTCTTCTTTCATCAATCATTTCTTGTGTAATTCCTCTAGCAGCCATTTCAGATTCAATAAGGTCAAGGGTTTGTTCATCATCACAAATATCATAAGGCATATTACCATCTGCATTTACTGCTAGTAAATCTGCGTTG"
225
+ seq = @gff.assemble(@contigsequence,component.start,[cds2], :codonize=>true)
226
+ seq.should == "AACGCAGATTTACTAGCAGTAAATGCAGATGGTAATATGCCTTATGATATTTGTGATGATGAACAAACCCTTGACCTTATTGAATCTGAAATGGCTGCTAGAGGAATTACACAAGAAATGATTGATGAAAGAAGACAACAACCAGAAAGGGAAATGTTAAATGATATGAAAATTTTACATCAAAGAGGATTACCTTTAGATCAAAGAAATTCTGTTGATAAATCTACTTTTGTAAGTTTTTCTGGAGAAAGGGAAATTTAT"
227
+ # cds1.frame = 1
228
+ aaseq = @gff.assembleAA(@contigsequence,component.start,[cds2])
229
+ # note it should handle the frame shift and direction!
230
+ aaseq.should == "NADLLAVNADGNMPYDICDDEQTLDLIESEMAARGITQEMIDERRQQPEREMLNDMKILHQRGLPLDQRNSVDKSTFVSFSGEREIY"
231
+ end
232
+ it "should assemble the protein sequence for MhA1_Contig1133.frz3.gene11" do
233
+ recs = @cdslist['cds:MhA1_Contig1133.frz3.gene11']
234
+ component = @componentlist['cds:MhA1_Contig1133.frz3.gene11']
235
+ seq = @gff.assemble(@contigsequence,component.start,recs, :phase=>true, :reverse=>true, :complement=>true)
236
+ seq.should == "ATGGACCATCATGCATTGGTGGAGGAATTACCAGAAATTGAAAAATTAACTCCTCAAGAACGTATTGCATTAGCTAGAGAACGCCGTGCTGAACAACTTCGACAGAATGCTGCACGGGAGGCTCAATTGCCAATGCCTGCACAGCGCCGGCCTCGTCTTCGATTTACACCAGATGTTGCTTTACTTGAGGCAACATGTGCCATTGACAATAATGAAAGAATTGTTCGTCTTCTGCTTAGGTACGGAGCTTGTGTTAATGCCAAAGACACTGAACTTTGGACACCATTGCACGCAGCTGCATGTTGTGCTTATATTGATATTGTTCGATTGCTTATTGCACACAACGCAGATTTACTAGCAGTAAATGCAGATGGTAATATGCCTTATGATATTTGTGATGATGAACAAACCCTTGACCTTATTGAATCTGAAATGGCTGCTAGAGGAATTACACAAGAAATGATTGATGAAAGAAGACAACAACCAGAAAGGGAAATGTTAAATGATATGAAAATTTTACATCAAAGAGGATTACCTTTAGATCAAAGAAATTCTGTTGATAAATCTACTTTTGTAAGTTTTTCTGGAGAAAGGGAAATTTATTTACATATAGCAGCAGCTAATGGTTATTATGATGTTGCTGCTTTCCTTCTTCGTTGTAATGTTTCTCCAGCATTGAGAGATATAGATTTGTGGCAACCAATTCATGCAGCTGCTTCTTGGAATCAACCAGACTTAATCGAGCTTTTATGCGAATATGGGGCTGATATAAATGCAAAAACTGGAGCTGGGGAAAGCCCTTTAGAATTAACTGAAGATGAACCAACCCAACAAGTAATTAGAACAATCGCTCAGACAGAAGCAAGGAGACGGCGTGGTCCAGGTGGTGGTTACTTTGGTGTTCGTGATTCTCGACGACAAAGCCGAAAAAGAAAAAAGTTTGAATCTCCTCAACAACCACCTTCAACATTAGAAAATCCTTTCTCAGCTAGAGGTGCAATTAGACGACAATCATTGCGAGATCGTAGTGGAATGTCATTAGCTCGTTTGGAAGCACAAAGAGAGGGTTCTGACCTTATTAGAAGTTATAATAGTAAAGAAGACCTTTCTTCTAATACAGCGGATGATTCTTTAAATGTTGGAAGTTCTTCATATCTCAACAATCCAACAGCCTCGGCTAGTGCTTCCTCTTCAGCATTACACGGAACTCCACATCAACAACAACGTCGTGAATCTCCACCTAAACGTGCATTAATGGCTAGAAGTGCTTCTCATCAAAAACAAAAACAACAAATGTCTCCAGATGAATGGCTGAAAAAATTAGAAGCAGATTCTGCAGGTTTTCGAGATAATGATGGAGAAGATGGTGAATTACAATCTGAACTTAAAGGAGGACAAAGAATGAAGAGTGGTGGTGGTGGAGGAGCGAGAGGTCAGCAAGAAATGAATGGTGGTCCAACAGCAACATTTGGTGGAGCTTCAAAACAACAATTAGCAATGGGCTCTGGACCCAATAGACGGCGCAAACAAGGATGTTGCTCTGTTTTGTGA"
237
+ aaseq = @gff.assembleAA(@contigsequence,component.start,recs)
238
+ aaseq.should == "MDHHALVEELPEIEKLTPQERIALARERRAEQLRQNAAREAQLPMPAQRRPRLRFTPDVALLEATCAIDNNERIVRLLLRYGACVNAKDTELWTPLHAAACCAYIDIVRLLIAHNADLLAVNADGNMPYDICDDEQTLDLIESEMAARGITQEMIDERRQQPEREMLNDMKILHQRGLPLDQRNSVDKSTFVSFSGEREIYLHIAAANGYYDVAAFLLRCNVSPALRDIDLWQPIHAAASWNQPDLIELLCEYGADINAKTGAGESPLELTEDEPTQQVIRTIAQTEARRRRGPGGGYFGVRDSRRQSRKRKKFESPQQPPSTLENPFSARGAIRRQSLRDRSGMSLARLEAQREGSDLIRSYNSKEDLSSNTADDSLNVGSSSYLNNPTASASASSSALHGTPHQQQRRESPPKRALMARSASHQKQKQQMSPDEWLKKLEADSAGFRDNDGEDGELQSELKGGQRMKSGGGGGARGQQEMNGGPTATFGGASKQQLAMGSGPNRRRKQGCCSVL*"
239
+ # >EMBOSS_001_1
240
+ # MDHHALVEELPEIEKLTPQERIALARERRAEQLRQNAAREAQLPMPAQRRPRLRFTPDVA
241
+ # LLEATCAIDNNERIVRLLLRYGACVNAKDTELWTPLHAAACCAYIDIVRLLIAHNADLLA
242
+ # VNADGNMPYDICDDEQTLDLIESEMAARGITQEMIDERRQQPEREMLNDMKILHQRGLPL
243
+ # DQRNSVDKSTFVSFSGEREIYLHIAAANGYYDVAAFLLRCNVSPALRDIDLWQPIHAA...
244
+
245
+ end
246
+ it "should assemble exons into"
247
+ it "should assemble the gene into"
248
+ # >MhA1_Contig1133:27463..29904
249
+ # tcacaaaacagagcaacatccttgtttgcgccgtctattgggtccagagcccattgctaa
250
+ # ttgttgttttgaagctccaccaaatgttgctgttggaccaccattcatttctattaaacg
251
+ # aaaaataaatataaaattcaaaaaaaatattttaattcaccttgctgacctctcgctcct
252
+ # ccaccaccaccactcttcattctttgtcctcctttaagttcagattgtaattcaccatct
253
+ # tctccatcattatctcgaaaacctgcagaatctgcttctaattttttcagccattcatct
254
+ # ggagacatttgttgtttttgtttttgatgagaagcacttctagccattaatgcacgttta
255
+ # ggtggagattcacgacgttgttgttgatgtggagttccgtgtaatgctgaagaggaagca
256
+ # ctagccgaggctgttggattgttgagatatgaagaacttccaacatttaaagaatcatcc
257
+ # taaaaattaagaaaaaattacaattttaaaaaacaaaccgctgtattagaagaaaggtct
258
+ # tctttactattataacttctaataaggtcagaaccctctctttgtgcttccaaacgagct
259
+ # aatgacattccactacgatctcgcaatgattgtcgtctaattgcacctctagctgagaaa
260
+ # ggattttctaatgttgaaggtggttgttgaggagattcaaacttttttcttctagataaa
261
+ # aagatgaaaagaaaatataaatttaaaattactttcggctttgtcgtcgagaatcacgaa
262
+ # caccaaagtaaccaccacctggaccacgccgtctccttgcttctgtctgagcgattgttc
263
+ # taattacttgttgggttggttcatcttcagttaattcttaatttaaaaaaatatatagac
264
+ # agaaaatagttaggaaggcgataagttaaaagtatgtcaaaattctagcctttcttaaaa
265
+ # tagccatgtaaaaataaaaaagcctcccgacgtggtcacatacatattttgagaaaaatt
266
+ # aaaggtcttaagaaaatgggtgtgaaccgaaaaatcctgatattgagaaatttaattcct
267
+ # tattgttagttcctctcccctcatacttaatagcaaatatttataaattataagattcaa
268
+ # taaaataaacctaaagggctttccccagctccagtttttgcatttatatcagccccatat
269
+ # tcgcataaaagctcgattaagtctggttgattccaagaagcagctgcatgaattggttgc
270
+ # cacaaatctatatctctcaatgctggagaaacattacaacgaagaaggaaagcagcaaca
271
+ # tcataataaccattagctgctgctatatgtaactaaaaaggaataaaactgtaataatcc
272
+ # tttcataataatctttacataaatttccctttctccagaaaaacttacaaaagtagattt
273
+ # atcaacagaatttctttgatctaaaggtaatcctctttgatgtaaaattttcatatcatt
274
+ # taacatttccctttctggttgttgtcttctttcatcaatcatttcttgtgtaattcctct
275
+ # agcagccatttcagattcaataaggtcaagggtttgttcatcatcacaaatatcataagg
276
+ # catattaccatctgcatttactgctagtaaatctgcgttgcttaaaaagaaatttttaat
277
+ # atgaaagaagttttcgataaaatatgacgcaaaaataaaaatcgatgacctaaaaactgg
278
+ # atgtttacccattaaactaaaccacaaaacttacgaacttttagaagacgaattaattat
279
+ # taaatgaaaatacgtaaagattaaaaatacaaaaaatttaaaaggaagtcccgtttgttt
280
+ # aaaataaatatttctgtaaaacttaagaaatttatttaaaaaaaaaattaaaaaaaaact
281
+ # aactgtgcaataagcaatcgaacaatatcaatataagcacaacatgcagctgcgtgcaat
282
+ # ggtgtccaaagttcagtgtctttggcattaacacaagctccgtacctaagcagaagacga
283
+ # acaattctttcattattgtcaatggcacactaaaaataaatttaaaaaaattaatttggt
284
+ # ttttgcctgatgtaaaggtgttaatccatcctcattatgtgaatcagcattgacaccttc
285
+ # cattaaaagtctttcaactaaaaaataaataaattcatttaaaatcaataatctttatgt
286
+ # ataaccttcttgggtatcacccctaactgttgcctcaagtaaagcaacatctggtgtaaa
287
+ # tcgaagacgaggccggcgctgtgcaggcattggcaattgagcctcccgtgcagcattctg
288
+ # tcgaagttgttcagcacggcgttctctagctaatgcaatacgttcttgaggagttaattt
289
+ # ttcaatttctggtaattcctccaccaatgcatgatggtccat
290
+ end
291
+