bio-bigbio 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,140 @@
1
+ # ORF predictor class
2
+ #
3
+
4
+ require 'bigbio/sequence/translate'
5
+
6
# Thin wrapper around a raw sequence value; ORFnucleotides and
# ORFaminoacids both derive from it.
class ORFsequence
  attr_reader :seq
  attr_writer :seq

  # sequence:: the sequence to store, kept as given
  def initialize(sequence)
    @seq = sequence
  end
end
12
+
13
# Nucleotide side of an ORF: holds the complete sequence the ORF was found
# in plus the start/stop offsets that delimit the ORF inside it.
class ORFnucleotides < ORFsequence
  attr_reader :start, :stop

  # sequence:: full nucleotide sequence
  # start::    index of the first ORF nucleotide
  # stop::     index just past the last ORF nucleotide
  def initialize(sequence, start, stop)
    super(sequence)
    @start, @stop = start, stop
  end

  # Only the ORF region, i.e. positions start through stop-1 (inclusive)
  def seq
    last = @stop - 1
    @seq[@start..last]
  end

  # The complete, unsliced sequence
  def fullseq
    @seq
  end

  # String form is the ORF region
  def to_s
    seq
  end
end
35
+
36
# Amino acid side of an ORF; adds no behaviour of its own on top of
# ORFsequence.
class ORFaminoacids < ORFsequence; end
39
+
40
# One predicted ORF: pairs the nucleotide region with its amino acid
# translation and carries an id, a description and the reading frame.
class ORF
  attr_reader :id, :descr, :nt, :aa, :frame

  # num::   zero-based ORF counter; the id suffix is num+1
  # type::  label placed at the front of the description
  # id::    identifier of the source sequence
  # descr:: description of the source sequence, appended to the ORF header
  # nt::    full nucleotide sequence
  # frame:: reading frame (1..3 or -1..-3)
  # start:: nucleotide offset of the ORF before frame adjustment
  # aa::    amino acid sequence of the ORF
  def initialize(num, type, id, descr, nt, frame, start, aa)
    @id = "#{id}_#{num + 1}"
    @frame = frame
    # ---- shift start so that it lines up with the frame
    orf_start = start + frame.abs - 1
    # ---- never let stop run past the end of the sequence
    orf_stop = [orf_start + aa.size * 3, nt.size].min
    # ---- negative frames are read from the reverse complement
    strand = frame < 0 ? Bio::Sequence::NA.new(nt).reverse_complement.to_s.upcase : nt
    signed_frame = frame > 0 ? "+#{frame}" : frame.to_s
    @descr = "[#{type} #{signed_frame} #{orf_start} - #{orf_stop}; #{orf_stop - orf_start}/#{strand.size}] " + descr
    @nt = ORFnucleotides.new(strand, orf_start, orf_stop)
    @aa = ORFaminoacids.new(aa)
  end

  # Order by frame first; within the same frame by nucleotide sequence
  def <=>(other)
    return frame <=> other.frame unless frame == other.frame
    nt.seq <=> other.nt.seq
  end

  # Build a paired FASTA record (nucleotide + amino acid) sharing this
  # ORF's id and description
  def to_fastarec
    protein = FastaRecord.new(@id, @descr, @aa.seq)
    dna = FastaRecord.new(@id, @descr, @nt.seq)
    FastaPairedRecord.new(dna, protein)
  end
end
79
+
80
# Predicts ORFs in a nucleotide sequence by translating all six reading
# frames and cutting each translation at its STOP codons ('*').
class PredictORF

  include Bio::Big::FrameCodonHelpers

  # id::        identifier of the sequence; ORF ids are derived from it
  # descr::     description of the sequence; appended to each ORF header
  # seq::       nucleotide sequence; all white space is removed
  # trn_table:: translation table handed to Nucleotide::Translate
  def initialize(id, descr, seq, trn_table)
    @id = id
    @descr = descr
    @seq = seq.gsub(/\s/, '')
    @trn_table = trn_table
    @startcodons = # FIXME: this should be linked properly
      if trn_table.nil? || trn_table == 0
        # presumably supplied by the included FrameCodonHelpers - confirm
        START_CODONS
      else # prokaryote
        ['ATG', 'TTG', 'CTG', 'AUG', 'UUG', 'CUG']
      end
  end

  # Return a list of predicted ORFs with at least :minsize AA's. The ORF's
  # are between STOP codons (so sequences without a proper START codon
  # are included). Empty stretches between adjacent STOPs are never
  # returned, even with minsize 0.
  def stopstop(minsize = 30)
    type = "XX"
    orfs = []
    translator = Nucleotide::Translate.new(@trn_table)
    translator.aa_6_frames(@seq).each do |aa_frame|
      frame = aa_frame[:frame]
      aa_start = 0 # offset, in amino acids, of the candidate in this frame
      aa_frame[:sequence].split(/\*/).each do |candidate|
        if !candidate.empty? && candidate.size >= minsize
          # orfs.size doubles as the running ORF number across all frames
          orfs.push ORF.new(orfs.size, type, @id, @descr, @seq, frame, aa_start * 3, candidate)
        end
        aa_start += candidate.size + 1 # +1 steps over the STOP itself
      end
    end
    orfs
  end

  # Return a list of predicted ORFs with at least :minsize AA's that begin
  # with a START codon (ATG, TTG, CTG and AUG, UUG and CUG for now, a later
  # version should use the EMBOSS translation table). These are the
  # +stopstop+ ORFs whose first three nucleotides are a known START codon.
  def startstop(minsize = 30)
    stopstop(minsize).select do |orf|
      @startcodons.include?(orf.nt.seq[0..2].upcase)
    end
  end

  # Return the longest ORF that has a START codon (see +startstop+),
  # measured by the length of its nucleotide sequence.
  # Returns nil if none is found.
  #
  # Previously this returned the first START-STOP ORF in frame order,
  # which is not necessarily the longest one.
  def longest_startstop(minsize = 0)
    startstop(minsize).max_by { |orf| orf.nt.seq.size }
  end

end
@@ -0,0 +1,52 @@
1
+
2
module Nucleotide

  module TranslationTable
  end

  # Translates a nucleotide sequence into amino acid sequences, one per
  # reading frame, through Bio::Big::TranslationAdapter.
  class Translate

    include Bio::Big::TranslationAdapter

    # Table can be either an id (integer) or a Biolib::Emboss TrnTable.
    # nil is treated as table id 0; a numeric id is resolved through
    # TranslationAdapter.translation_table, anything else is used as is.
    def initialize(table)
      table = 0 if table.nil?
      @trn_table =
        if table.kind_of?(Numeric)
          Bio::Big::TranslationAdapter.translation_table(table)
        else
          table
        end
    end

    # Return all six reading frames as an Array - ordered as
    # frames [1,2,3,-1,-2,-3] - of Hashes
    # {:frame => frame, :sequence => AAsequence}.
    #
    # Note that the nucleotide sequence does not get modified.
    def aa_6_frames(seq)
      translate_frames(seq, [1, 2, 3, -1, -2, -3])
    end

    # Return all forward reading frames as an Array - ordered as
    # frames [1,2,3] - of Hashes {:frame => frame, :sequence => AAsequence}.
    def aa_forward_frames(seq)
      translate_frames(seq, [1, 2, 3])
    end

    private

    # Translate +seq+ (white space removed from a copy) in each of +frames+,
    # keeping the order of +frames+ in the result.
    def translate_frames(seq, frames)
      stripped = seq.gsub(/\s/, '')
      frames.map do |frame|
        aa = Bio::Big::TranslationAdapter.translate(@trn_table, frame, stripped)
        { :frame => frame, :sequence => aa }
      end
    end
  end
end
52
+
data/lib/bigbio.rb ADDED
@@ -0,0 +1,38 @@
1
# BigBio libraries

require 'bigbio/environment'

# Use a bioruby-logger checkout located beside this project when there is
# one; otherwise activate the installed bio-logger gem.
project_root = File.dirname(File.dirname(__FILE__))
local_logger_lib = File.join(project_root, '..', 'bioruby-logger', 'lib')
if File.directory?(local_logger_lib)
  $LOAD_PATH << local_logger_lib
  $stderr.print "bio-logger loaded directly\n"
else
  require "rubygems"
  gem "bio-logger"
end
require 'bio-logger'

# Register the 'bigbio' logger with the shared environment
bigbio_log = Bio::Log::LoggerPlus.new('bigbio')
Bio::Big::Environment.instance.log = bigbio_log

# BioLib is optional: flag it in the environment when it loads, otherwise
# send a warning to stderr and carry on without it.
begin
  require 'biolib/emboss'
  Bio::Big::Environment.instance.biolib = true
rescue LoadError
  bigbio_log.outputters = Bio::Log::Outputter.stderr
  bigbio_log.warn "BioLib functionality not loaded"
end

require 'bio'
require 'bigbio/adapters/translate'
require 'bigbio/db/emitters/fasta_emitter'
require 'bigbio/db/emitters/orf_emitter'

# Loaded on first reference only
autoload :FastaReader, 'bigbio/db/fasta'
autoload :FastaWriter, 'bigbio/db/fasta'
autoload :FastaPairedReader, 'bigbio/db/fasta'
autoload :FastaPairedWriter, 'bigbio/db/fasta'
autoload :BlastClust, 'bigbio/db/blast'
autoload :PredictORF, 'bigbio/sequence/predictorf'
@@ -0,0 +1,265 @@
1
+
2
+ require 'rspec'
3
+
4
+ $: << "../lib"
5
+
6
+ require 'bigbio'
7
+
8
+ describe Bio::Big::FastaEmitter, "when using the emitter" do
9
+ include Bio::Big
10
+
11
+ it "should emit small parts" do
12
+ s = ""
13
+ FastaEmitter.new("test/data/fasta/nt.fa",10).emit_seq do | part, index, tag, seq |
14
+ # p [index, part, tag, seq]
15
+ s += seq
16
+ if index == 95 and part == :tail
17
+ s.should == "ATCATTAGCAACACCAGCTTCCTCTCTCTCGCTTCAAAGTTCACTACTCGTGGATCTCGTCTTCAGTGTACAGTATCAAGGGCTCGATCTGCGGTGGATGAGACATCAGATTCAGGAGCTTTTCAAAGAACTGCATCGACATCCGTAACTTCGTTTCAAAAGATTCCAATTCTCAGTTTCAGCTGAATCTGGTAGATACCATCTTTACATATCGTATGCTTGTCATGGGCTTCTAGATGCCTTTCATACTTAAAGATCAAAGGACTTGACGATGCAATAAGCTTCTCGTCTGTAAAACCC"
18
+ end
19
+ s = "" if part == :tail
20
+ end
21
+ end
22
+
23
+ it "should emit large parts" do
24
+ FastaEmitter.new("test/data/fasta/nt.fa").emit_seq do | part, index, tag, seq |
25
+ # p [index, part, tag, seq]
26
+ if index == 95
27
+ seq.should == "ATCATTAGCAACACCAGCTTCCTCTCTCTCGCTTCAAAGTTCACTACTCGTGGATCTCGTCTTCAGTGTACAGTATCAAGGGCTCGATCTGCGGTGGATGAGACATCAGATTCAGGAGCTTTTCAAAGAACTGCATCGACATCCGTAACTTCGTTTCAAAAGATTCCAATTCTCAGTTTCAGCTGAATCTGGTAGATACCATCTTTACATATCGTATGCTTGTCATGGGCTTCTAGATGCCTTTCATACTTAAAGATCAAAGGACTTGACGATGCAATAAGCTTCTCGTCTGTAAAACCC"
28
+ end
29
+ end
30
+ end
31
+ end
32
+
33
+ describe Bio::Big::ShortFrameState, "when using the ShortFrameState" do
34
+
35
+ include Bio::Big
36
+
37
+ it "should find an ORF" do
38
+ fr = ShortFrameState.new "atggattaaatgtaatggatttaatgtaaa",0,0
39
+ orfs = fr.get_stopstop_orfs
40
+ orfs.map{ | orf | orf.pos }.should == [ 3, 5 ]
41
+ orfs.map{ | orf | orf.to_seq }.should == ["ATGTAA", "TGGATTTAA"]
42
+ orfs = fr.get_startstop_orfs
43
+ orfs.map{ | orf | orf.pos }.should == [ 0, 3 ]
44
+ orfs.map{ | orf | orf.to_seq }.should == ["ATGGATTAA","ATGTAA"]
45
+ end
46
+ it "should handle min_size" do
47
+ fr = ShortFrameState.new "atggattaaatgtaatggatttaatgtaaa",0,9
48
+ orfs = fr.get_stopstop_orfs
49
+ orfs.map{ | orf | orf.to_seq }.should == [ "TGGATTTAA"]
50
+ orfs.map{ | orf | orf.pos }.should == [ 5 ]
51
+ fr.get_startstop_orfs.should == []
52
+ end
53
+ it "should find ORFs in" do
54
+ fr = ShortFrameState.new "atgttttaaatgtaatgttgttaaatgttttaaatgtaatgttgttaa",0,0
55
+ orfs = fr.get_stopstop_orfs
56
+ orfs.map{ | orf | orf.to_seq }.should == ["ATGTAA", "TGTTGTTAA", "ATGTTTTAA", "ATGTAA", "TGTTGTTAA"]
57
+ orfs.map{ | orf | orf.pos }.should == [3, 5, 8, 11, 13]
58
+ orfs = fr.get_startstop_orfs
59
+ orfs.map{ | orf | orf.to_seq }.should == ["ATGTTTTAA", "ATGTAA", "ATGTTTTAA", "ATGTAA"]
60
+ orfs.map{ | orf | orf.pos }.should == [ 0, 3, 8, 11]
61
+ end
62
+
63
+ it "should match results of EMBOSS getorf" do
64
+ s = "AG GTTCGNACGGTCATCGNATNAAGTCTTGNATATCG TAA TTNCGCGTGCCGCCTTCTTTCTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGTCAGTGTGTGTATTTTG GGG AAAACTTTG TGA GCAAAGAGCGAGAAAATGAGCGGANCGG TAA GAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATTGGGATGGTATGGC TAA GGTCCTTGTCACAGATGAGGCTCGTAGAG".gsub(/ /,'')
65
+ # >_3 [3 - 167] #0
66
+ # 1st GTTCGNACGGTCATCGNATNAAGTCTTGNATATCGTAATTNCGCGTGCCGCCTTCTTTCT
67
+ # CCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGTCA
68
+ # GTGTGTGTATTTTGGGGAAAACTTTGTGAGCAAAGAGCGAGAAAA
69
+ # >_4 [171 - 179] #0
70
+ # OK GCGGANCGG
71
+ # >_5 [183 - 239] #0
72
+ # OK GAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATTGGGATGGTATGGC
73
+ # >_6 [243 - 257] #0
74
+ # OK- GGTCCTTGTCACAGA
75
+ # >_7 [261 - 266] #0
76
+ # OK- GGCTCGTAG
77
+ # >_8 [1 - 270] # 1
78
+ # whole! AGGTTCGNACGGTCATCGNATNAAGTCTTGNATATCGTAATTNCGCGTGCCGCCTTCTTT
79
+ # CTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGT
80
+ # CAGTGTGTGTATTTTGGGGAAAACTTTGTGAGCAAAGAGCGAGAAAATGAGCGGANCGGT
81
+ # AAGAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATTGGGATGGTATGGCT
82
+ # AAGGTCCTTGTCACAGATGAGGCTCGTAGA
83
+ # >_1 [2 - 37] #2
84
+ # 1st- GGTTCGNACGGTCATCGNATNAAGTCTTGNATATCG
85
+ # >_2 [41 - 148] #2
86
+ # OK- TTNCGCGTGCCGCCTTCTTTCTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCT
87
+ # TCTTCCTCTTCGATATTCGTCAGTGTGTGTATTTTGGGGAAAACTTTG
88
+ # >_9 [152 - 271] #2
89
+ # last- CAAAGAGCGAGAAAATGAGCGGANCGGTAAGAAAATCGCGGATGTGGCTTTCAAAGCTT
90
+ # CAAGGACTATCGATTGGGATGGTATGGCTAAGGTCCTTGTCACAGATGAGGCTCGTAGAG
91
+
92
+ # Frame 0
93
+ minsize = 0
94
+ fr = ShortFrameState.new s,0,minsize
95
+ orfs = fr.get_stopstop_orfs
96
+ orfs.map{ | orf | orf.to_seq }.should == []
97
+
98
+ # Frame 1
99
+ fr = ShortFrameState.new s[1..-1],0,minsize
100
+ os = fr.get_stopstop_orfs
101
+ os.map{ | orf | orf.to_seq }.should == ["TTNCGCGTGCCGCCTTCTTTCTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGTCAGTGTGTGTATTTTGGGGAAAACTTTGTGA"]
102
+ orfs += os
103
+ # Frame 2
104
+ fr = ShortFrameState.new s[2..-1],0,minsize
105
+ os = fr.get_stopstop_orfs
106
+ os.map{ | orf | orf.to_seq }.should == ["GCGGANCGGTAA", "GAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATTGGGATGGTATGGCTAA", "GGTCCTTGTCACAGATGA", "GGCTCGTAG"]
107
+ orfs += os
108
+ orfs.size.should == 5
109
+
110
+ # >_1 [235 - 270]
111
+ # Last: ATGGCTAAGGTCCTTGTCACAGATGAGGCTCGTAGA
112
+ # >_2 [167 - 271]
113
+ # Last: ATGAGCGGANCGGTAAGAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATT
114
+ # GGGATGGTATGGCTAAGGTCCTTGTCACAGATGAGGCTCGTAGAG
115
+
116
+ # Frame 0
117
+ minsize = 0
118
+ fr = ShortFrameState.new s,0,minsize
119
+ orfs = fr.get_startstop_orfs
120
+
121
+ # Frame 1
122
+ fr = ShortFrameState.new s[1..-1],0,minsize
123
+ orfs += fr.get_startstop_orfs
124
+ # Frame 2
125
+ fr = ShortFrameState.new s[2..-1],0,minsize
126
+ orfs += fr.get_startstop_orfs
127
+ orfs.map{ | orf | orf.to_seq }.should == []
128
+ orfs.size.should == 0
129
+ end
130
+ end
131
+
132
+ describe Bio::Big::ShortFrameState, "when combining frames" do
133
+ include Bio::Big
134
# Feeds two consecutive chunks (s1, then s2 to its right) and checks that
# ORFs found in the first frame and in the combined frame add up.
it "should combine a forward frame" do
  s1 = "atggattaaatgtaata"
  s2 = "atggatttaatgtaaa"
  fr = ShortFrameState.new s1,0,0
  fr.ntseq_pos.should == 0
  orfs = fr.get_stopstop_orfs
  # This used to be a bare `orfs.size == 1`, which asserts nothing.
  # s1 holds exactly one STOP-STOP ORF ("ATGTAA", see the final expectation).
  orfs.size.should == 1
  fr3 = FrameCodonHelpers::CreateShortFrame.create_right(fr,orfs,s2)
  fr3.ntseq_pos.should == 15
  fr3.codons.to_seq.should == "TAATGGATTTAATGTAAA"
  norfs = fr3.get_stopstop_orfs
  orfs += norfs
  orfs.map{ | orf | orf.to_seq }.should == ["ATGTAA", "TGGATTTAA"]
  orfs.map{ | orf | orf.track_ntseq_pos }.should == [9,18]
end
149
+
150
# The first chunk ends before a second STOP is seen, so it yields no ORF
# on its own; all three ORFs come from the combined frame.
it "should combine a forward frame without ORFs in first seq" do
  s1 = "atggattaaatgta"
  #     ......---===xx
  s2 = "atggatttaattattataaa"
  #     x======xxx======xxx.
  fr = ShortFrameState.new s1,0,0
  fr.ntseq_pos.should == 0
  orfs = fr.get_stopstop_orfs
  # This used to be a bare `orfs.size == 0`, which asserts nothing.
  orfs.size.should == 0
  fr3 = FrameCodonHelpers::CreateShortFrame.create_right(fr,orfs,s2)
  fr3.ntseq_pos.should == 0
  fr3.codons.to_seq.should == "ATGGATTAAATGTAATGGATTTAATTATTATAA"
  norfs = fr3.get_stopstop_orfs
  orfs = norfs
  orfs.map{ | orf | orf.to_seq }.should == ["ATGTAA","TGGATTTAA","TTATTATAA"]
  orfs.map{ | orf | orf.track_ntseq_pos }.should == [9,9+6,9+6+9]
end
167
+
168
# Same first chunk as the previous example (no ORF on its own), combined
# with a shorter right-hand chunk; both ORFs come from the combined frame.
it "should combine a forward frame without ORFs in first seq" do
  s1 = "atggattaaatgta"
  #     ......---===xx
  s2 = "atggatttaatgtaaa"
  #     x======xxx
  fr = ShortFrameState.new s1,0,0
  fr.ntseq_pos.should == 0
  orfs = fr.get_stopstop_orfs
  # This used to be a bare `orfs.size == 0`, which asserts nothing.
  orfs.size.should == 0
  fr3 = FrameCodonHelpers::CreateShortFrame.create_right(fr,orfs,s2)
  # p fr3
  fr3.ntseq_pos.should == 0 # on the combined sequences
  fr3.codons.to_seq.should == "ATGGATTAAATGTAATGGATTTAATGTAAA"
  norfs = fr3.get_stopstop_orfs
  orfs += norfs
  orfs.map{ | orf | orf.to_seq }.should == ["ATGTAA", "TGGATTTAA"]
  orfs.map{ | orf | orf.track_ntseq_pos }.should == [9,9+6]
end
186
+
187
+ it "should combine a reverse frame" do
188
+ # Reversed (real locations on contig):
189
+ #
190
+ # | 3 21 B |
191
+ # ttaaatgtaatttaggtaaatttat atgtaaattaggta (reversed)
192
+ # ...^--============xxx^=======xxx
193
+ # ^ ^
194
+ # Actual feed:
195
+ #
196
+ # s2= s1=
197
+ # 18 0 (ntseq_pos)
198
+ # "atggattaaatgta" "tatttaaatggatttaatgtaaatt"
199
+ # ......xxx===== ~===xx^============--^...
200
+ # 0 1 2 3 0 1 2 3
201
+
202
+ s2 = "tatttaaatggatttaatgtaaatt"
203
+ # ~===xx^============--^...
204
+ s1 = "atggattaaatgta"
205
+ # ......xxx=====
206
+ # now move the other way, as sequences get emitted on the left
207
+ fr = ShortReversedFrameState.new s2,0,0
208
+ # p fr
209
+ fr.codons.to_seq.should == "ATTTAAATGGATTTAATGTAAATT"
210
+ fr.ntseq_pos.should == 0
211
+ orfs = fr.get_stopstop_orfs
212
+ orfs.map{ | orf | orf.to_seq }.should == ["ATGGATTTAATGTAA"]
213
+ orfs.first.pos.should == 2 # in codons
214
+ fr3 = FrameCodonHelpers::CreateShortFrame.create_left(fr,orfs,s1)
215
+ fr3.ntseq_pos.should == 18 # 6 codons
216
+ fr3.codons.to_seq.should == "ATGGATTAAATGTATATTTAA"
217
+ norfs = fr3.get_stopstop_orfs
218
+ orfs += norfs
219
+ orfs.map{ | orf | orf.to_seq }.should == ["ATGGATTTAATGTAA", "ATGTATATTTAA"]
220
+ orfs.map{ | orf | orf.pos }.should == [2,3]
221
+ orfs.map{ | orf | orf.track_ntseq_pos }.should == [6,18+9]
222
+ end
223
+
224
+ end
225
+
226
+ describe Bio::Big::OrfEmitter, "when using the ORF emitter" do
227
+ include Bio::Big
228
+
229
+ it "should emit STOP-STOP ORFs in all frames" do
230
+ f = FastaEmitter.new("test/data/fasta/nt.fa")
231
+ seqs = []
232
+ OrfEmitter.new(f,:stopstop)::emit_seq do | frame, index, tag, pos, seq |
233
+ break if index != 0
234
+ if frame == 0 and index == 0 and pos == 39
235
+ seq.should == "TTNCGCGTGCCGCCTTCTTTCTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGTCAGTGTGTGTATTTTGGGGAAAACTTTGTGA"
236
+ # p [frame,index, tag, pos, seq]
237
+ end
238
+ if index == 0
239
+ seqs.push seq
240
+ end
241
+ end
242
+ seqs.join(';')[50..350].should == "TNCGCGTGCCGCCTTCTTTCTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGTCAGTGTGTGTATTTTGGGGAAAACTTTGTGA;GCAAAGAGCGAGAAAATGAGCGGANCGGTAAGAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATTGGGATGGTATGGCTAAGGTCCTTGTCACAGATGAGGCTCGTAGAGAGTTCTCTAACCTTCGTCGTGCTTTCGATGAGGTTAACACACAGCTCCAGACCAAATTTAGTCAGGACCT"
243
+ end
244
+ it "should emit STOP-STOP ORFs in all frames using a shorter emitter" do
245
+ f = FastaEmitter.new("test/data/fasta/nt.fa",150)
246
+ seqs = []
247
+ OrfEmitter.new(f,:stopstop)::emit_seq do | frame, index, tag, pos, seq |
248
+ break if index != 0
249
+ if frame == 0 and index == 0 and pos == 39
250
+ seq.should == "TTNCGCGTGCCGCCTTCTTTCTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGTCAGTGTGTGTATTTTGGGGAAAACTTTGTGA"
251
+ # p [frame,index, tag, pos, seq]
252
+ end
253
+ if index == 0
254
+ seqs.push seq
255
+ end
256
+ end
257
+ # seqs.join(';')[50..350].should == "TNCGCGTGCCGCCTTCTTTCTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGTCAGTGTGTGTATTTTGGGGAAAACTTTGTGA;GCAAAGAGCGAGAAAATGAGCGGANCGGTAAGAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATTGGGATGGTATGGCTAAGGTCCTTGTCACAGATGAGGCTCGTAGAGAGTTCTCTAACCTTCGTCGTGCTTTCGATGAGGTTAACACACAGCTCCAGACCAAATTTAGTCAGGACCT"
258
+ end
259
+ if false
260
+ it "should emit START-STOP ORFs in all frames"
261
+ it "should emit ORFs on any filter"
262
+ it "should emit ORFs using a minimum size"
263
+ it "should emit ORFs with adjoining sequences"
264
+ end
265
+ end
@@ -0,0 +1,199 @@
1
+
2
+ $: << "../lib"
3
+ ENV['DATA'] = '../test/data/EMBOSS'
4
+
5
+ require 'bigbio'
6
+
7
+ # Note that PredictORF, at this point, leaves trailing X's for the AA sequence
8
+
9
+ describe PredictORF, " when using a short simple nucleotide sequence" do
10
+ before :all do
11
+ # initialize
12
+ id = 'test'
13
+ descr = 'Test'
14
+ # sequence = 'AGCTGAATCTGGTAGATACCATCTTTAA'
15
+ sequence = 'AGCTGAATCTGG'
16
+ # trn_table = Biolib::Emboss.ajTrnNewI(1)
17
+ trn_table = Bio::Big::TranslationAdapter.translation_table(1)
18
+
19
+ @predictorf = PredictORF.new(id,descr,sequence,trn_table)
20
+ @orflist = @predictorf.stopstop(0)
21
+ # @orflist.each do | orf | p [orf.descr,orf] end
22
+ end
23
+
24
+ it "stopstop(0) should render six reading frames and seven ORF" do
25
+ # >EMBOSS_001_1
26
+ # S*IW
27
+ # >EMBOSS_001_2
28
+ # AESG
29
+ # >EMBOSS_001_3
30
+ # LNLX
31
+ # >EMBOSS_001_4
32
+ # PDSA
33
+ # >EMBOSS_001_5
34
+ # RFSX
35
+ # >EMBOSS_001_6
36
+ # QIQL
37
+ @orflist[0].aa.seq.should == "S"
38
+ @orflist[3].aa.seq[0..2].should == "LNL"
39
+ @orflist[4].aa.seq[0..2].should == "PDS"
40
+ @orflist.size.should == 7
41
+ end
42
+
43
+ # frame +1 - 4 codons S*IW
44
+ it "should give a first valid +1 frame" do
45
+ orf = @orflist[1]
46
+ orf.frame.should == 1
47
+ orf.nt.start.should == 6
48
+ orf.aa.seq.should == "IW"
49
+ orf.nt.seq.should == "ATCTGG"
50
+ end
51
+
52
+ # frame +1 - 4 codons S*IW
53
+ it "should give a second valid +1 frame" do
54
+ orf = @orflist[0]
55
+ orf.frame.should equal 1
56
+ orf.nt.start.should equal 0
57
+ orf.aa.seq.should == "S"
58
+ orf.nt.seq.should == "AGC"
59
+ end
60
+
61
+ # frame +2 - 3 codons AES
62
+ it "should give a valid +2 frame" do
63
+ orf = @orflist[2]
64
+ # pp @orflist
65
+ # pp orf
66
+ orf.nt.seq[0..8].should == "GCTGAATCT"
67
+ orf.frame.should == 2
68
+ orf.nt.start.should == 1
69
+ # orf.nt.stop.should == 12 - EMBOS differs
70
+ orf.aa.seq[0..2].should == "AES"
71
+ end
72
+
73
+ # frame +3 - 3 codons LNL
74
+ it "should give a valid +3 frame" do
75
+ orf = @orflist[3]
76
+ orf.frame.should == 3
77
+ orf.nt.start.should == 2
78
+ # orf.nt.stop.should == 12
79
+ orf.aa.seq[0..3].should == "LNL"
80
+ orf.nt.seq[0..9].should == "CTGAATCTG"
81
+ end
82
+
83
+ # frame -1 - 4 codons PDSA
84
+ it "should give a valid -1 frame" do
85
+ orf = @orflist[4]
86
+ orf.frame.should == -1
87
+ orf.nt.start.should == 0
88
+ orf.nt.stop.should == 12
89
+ orf.aa.seq.should == "PDSA"
90
+ orf.nt.seq.should == "CCAGATTCAGCT"
91
+ end
92
+
93
+ # frame -2 - 3 codons RFSX
94
# Title corrected: this example checks frame -2 (the following example is
# the one that covers frame -3).
it "should give a valid -2 frame" do
  orf = @orflist[5]
  orf.frame.should == -2
  orf.nt.start.should == 1
  # orf.nt.stop.should == 12
  orf.aa.seq[0..2].should == "RFS"
  orf.nt.seq[0..8].should == "CAGATTCAG"
end
102
+
103
+ # frame -3 - 3 codons QIQL
104
+ it "should give a valid -3 frame" do
105
+ orf = @orflist[6]
106
+ orf.frame.should == -3
107
+ orf.nt.start.should == 2
108
+ # orf.nt.stop.should == 12
109
+ orf.aa.seq[0..2].should == "QIQ"
110
+ orf.nt.seq[0..8].should == "AGATTCAGC"
111
+ end
112
+ end
113
+
114
+ describe PredictORF, " when using a more complicated nucleotide sequence" do
115
+ before :all do
116
+ # initialize
117
+ id = "PUT-157a-Arabidopsis_thaliana-126"
118
+ descr = "PlantGDB Arabidopsis_thaliana Jan_15_2007"
119
+ sequence = "ATCATTAGCAACACCAGCTTCCTCTCTCTCGCTTCAAAGTTCACTACTCGTGGATCTCGT
120
+ CTTCAGTGTACAGTATCAAGGGCTCGATCTGCGGTGGATGAGACATCAGATTCAGGAGCT
121
+ TTTCAAAGAACTGCATCGACATCCGTAACTTCGTTTCAAAAGATTCCAATTCTCAGTTTC
122
+ AGCTGAATCTGGTAGATACCATCTTTACATATCGTATGCTTGTCATGGGCTTCTAGATGC
123
+ CTTTCATACTTAAAGATCAAAGGACTTGACGATGCAATAAGCTTCTCGTCTGTAAAACCC"
124
+ # @trn_table = Biolib::Emboss.ajTrnNewI(1)
125
+ @trn_table = Bio::Big::TranslationAdapter.translation_table(1)
126
+ @predictorf = PredictORF.new(id,descr,sequence,@trn_table)
127
+ @orflist = @predictorf.stopstop(0)
128
+ # @orflist.each_with_index do | orf,i | p [i,orf.descr,orf.aa.seq,orf.nt.seq] end
129
+ # >EMBOSS_001_1
130
+ # IISNTSFLSLASKFTTRGSRLQCTVSRARSAVDETSDSGAFQRTASTSVTSFQKIPILSF
131
+ # S*IW*IPSLHIVCLSWASRCLSYLKIKGLDDAISFSSVKP
132
+ # >EMBOSS_001_2
133
+ # SLATPASSLSLQSSLLVDLVFSVQYQGLDLRWMRHQIQELFKELHRHP*LRFKRFQFSVS
134
+ # AESGRYHLYISYACHGLLDAFHT*RSKDLTMQ*ASRL*N
135
+ # >EMBOSS_001_3
136
+ # H*QHQLPLSRFKVHYSWISSSVYSIKGSICGG*DIRFRSFSKNCIDIRNFVSKDSNSQFQ
137
+ # LNLVDTIFTYRMLVMGF*MPFILKDQRT*RCNKLLVCKT
138
+ # >EMBOSS_001_4
139
+ # GFYRREAYCIVKSFDL*V*KASRSP*QAYDM*RWYLPDSAETENWNLLKRSYGCRCSSLK
140
+ # SS*I*CLIHRRSSP*YCTLKTRSTSSEL*SEREEAGVAND
141
+ # >EMBOSS_001_5
142
+ # VLQTRSLLHRQVL*SLSMKGI*KPMTSIRYVKMVSTRFS*N*ELESFETKLRMSMQFFEK
143
+ # LLNLMSHPPQIEPLILYTEDEIHE**TLKRERGSWCC**X
144
+ # >EMBOSS_001_6
145
+ # GFTDEKLIASSSPLIFKYERHLEAHDKHTICKDGIYQIQLKLRIGIF*NEVTDVDAVL*K
146
+ # APESDVSSTADRALDTVH*RRDPRVVNFEARERKLVLLMX
147
+ end
148
+ it "stopstop(0) should render ORFs" do
149
+ @orflist[0].aa.seq[0..3].should == "IISN"
150
+ @orflist[13].aa.seq[0..3].should == "GFYR"
151
+ @orflist[22].aa.seq[0..3].should == "VLQT"
152
+ end
153
# Title corrected to match the expectation (it used to mention 33 reading
# frames and seven ORFs).
it "stopstop(0) should render 32 ORFs" do
  @orflist.size.should == 32
end
156
# Title corrected: the call below uses a minimum size of 5, not 30.
it "startstop(5) should render ORFs starting with a start codon" do
  orflist = @predictorf.startstop(5)
  # orflist.each do | orf | p [orf.descr,orf] end
  orflist[0].aa.seq.should == "MPFILKDQRT"
  orflist.size.should == 1
end
162
# Every ORF must carry at least one nucleotide. The previous expectation
# (size >= 0) could never fail, so it did not test what the title says.
it "should never return an empty sequence" do
  orflist = @predictorf.stopstop(0)
  orflist.each do | orf |
    orf.nt.seq.size.should > 0
  end
end
168
+
169
# Title corrected to match the call and expectation: minsize is given in
# amino acids (44 AA = 132 nt) and four ORFs are expected.
it "should return 4 ORFs when the minsize is 44 AA (132 nt)" do
  orflist = @predictorf.stopstop(44)
  orflist.size.should == 4
end
173
+
174
# Title corrected to match the call and expectation: minsize is given in
# amino acids (45 AA = 135 nt) and three ORFs are expected.
it "should return 3 ORFs when the minsize is 45 AA (135 nt)" do
  orflist = @predictorf.stopstop(45)
  orflist.size.should == 3
end
178
+
179
+ it "should have -1 frame" do
180
+ sequence = "ATTAGCAACACCAGCTTCCTCTCTCTCGCTTCAAAGTTCACTACTCGTGGATCTCGT"
181
+ # >EMBOSS_001_4
182
+ # TRSTSSEL*SEREEAGVAN
183
+ predictorf = PredictORF.new('test','TEST',sequence,@trn_table)
184
+ orflist = predictorf.stopstop(0)
185
+ # orflist.each_with_index do | orf,i | p [i,orf.descr,orf.aa.seq,orf.nt.seq] end
186
+ orflist[2].aa.seq[0..18].should == "QHQLPLSRFKVHYSWIS"
187
+ end
188
+
189
+ it "should correctly handle a sequence starting with a STOP codon" do
190
+ sequence = "ATTAGCAACACCAGCTTCCTCTCTCTCGCTTCAAAGTTCACTACTCGTGGATCTCGT"
191
+ # >EMBOSS_001_3
192
+ # *QHQLPLSRFKVHYSWIS
193
+ predictorf = PredictORF.new('test','TEST',sequence,@trn_table)
194
+ orflist = predictorf.stopstop(0)
195
+ # orflist.each_with_index do | orf,i | p [i,orf.descr,orf.aa.seq,orf.nt.seq] end
196
+ orflist[2].aa.seq[0..18].should == "QHQLPLSRFKVHYSWIS"
197
+ end
198
+
199
+ end