bio-bigbio 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,140 @@
1
+ # ORF predictor class
2
+ #
3
+
4
+ require 'bigbio/sequence/translate'
5
+
6
# Minimal container for a sequence; the shared base of the ORF helper
# classes. The sequence is stored exactly as it is passed in.
class ORFsequence
  attr_reader :seq
  attr_writer :seq

  # sequence:: the sequence object to keep
  def initialize(sequence)
    @seq = sequence
  end
end
12
+
13
# Helper class for storing the nucleotide side of an ORF: the full
# nucleotide sequence plus the start/stop window that makes up the ORF.
class ORFnucleotides < ORFsequence
  attr_reader :start, :stop

  # sequence:: full nucleotide sequence the ORF lives on
  # start::    0-based index of the first nucleotide of the ORF
  # stop::     0-based index one past the last nucleotide of the ORF
  def initialize(sequence, start, stop)
    super(sequence)
    @start = start
    @stop = stop
  end

  # Returns only the ORF part of the stored sequence.
  #
  # An exclusive range is used on purpose: with the inclusive form
  # start..stop-1 a stop of 0 becomes start..-1, which selects everything
  # up to the end of the sequence instead of nothing.
  def seq
    @seq[@start...@stop]
  end

  # Returns the complete sequence that was passed to the constructor.
  def fullseq
    @seq
  end

  # The string form of the object is the ORF part of the sequence.
  def to_s
    seq
  end

end
35
+
36
# Helper class for storing the amino acid sequence of an ORF. It adds no
# behaviour of its own on top of ORFsequence.
class ORFaminoacids < ORFsequence; end
39
+
40
# A single predicted ORF: an id, a description line, the nucleotide window
# (ORFnucleotides), the amino acid sequence (ORFaminoacids) and the frame.
class ORF
  attr_reader :id, :descr, :nt, :aa, :frame

  # num::   0-based ORF counter; the id suffix is num + 1
  # type::  label placed at the start of the description
  # id::    identifier prefix
  # descr:: description text appended after the generated location info
  # nt::    full nucleotide sequence
  # frame:: reading frame; a negative frame makes the stored nucleotide
  #         sequence the upcased reverse complement
  # start:: nucleotide offset within the frame, before frame adjustment
  # aa::    amino acid sequence of the ORF
  def initialize(num, type, id, descr, nt, frame, start, aa)
    @frame = frame
    @id = "#{id}_#{num + 1}"
    # ---- adjust start to match frame
    first = start + frame.abs - 1
    # ---- stop should not go beyond sequence
    last = [first + aa.size * 3, nt.size].min
    # ---- if frame < 0 it should reverse complement
    strand = frame < 0 ? Bio::Sequence::NA.new(nt).reverse_complement.to_s.upcase : nt
    signed_frame = frame > 0 ? "+#{frame}" : frame.to_s
    @descr = "[#{type} #{signed_frame} #{first} - #{last}; #{last - first}/#{strand.size}] " + descr
    @nt = ORFnucleotides.new(strand, first, last)
    @aa = ORFaminoacids.new(aa)
  end

  # Orders ORFs by frame first; within one frame by the ORF nucleotides.
  def <=>(other)
    return frame <=> other.frame unless frame == other.frame

    nt.seq <=> other.nt.seq
  end

  # Returns a FastaPairedRecord built from a nucleotide and an amino acid
  # FastaRecord, both carrying this ORF's id and description.
  def to_fastarec
    protein = FastaRecord.new(@id, @descr, @aa.seq)
    dna = FastaRecord.new(@id, @descr, @nt.seq)
    FastaPairedRecord.new(dna, protein)
  end
end
79
+
80
# ORF predictor: translates a nucleotide sequence in six reading frames
# (via Nucleotide::Translate) and reports the stretches between STOP
# codons as ORF objects.
class PredictORF

  include Bio::Big::FrameCodonHelpers

  # id::        identifier prefix for the generated ORF ids
  # descr::     description appended to each generated ORF description
  # seq::       nucleotide sequence; all white space is removed
  # trn_table:: translation table, handed to Nucleotide::Translate as-is;
  #             nil or 0 selects START_CODONS, anything else the
  #             hard-coded prokaryote start codon list
  def initialize(id, descr, seq, trn_table)
    @id = id
    @descr = descr
    @seq = seq.gsub(/\s/, '')
    @trn_table = trn_table
    # FIXME: this should be linked properly
    @startcodons =
      if trn_table.nil? || trn_table == 0
        START_CODONS
      else # prokaryote
        ['ATG', 'TTG', 'CTG', 'AUG', 'UUG', 'CUG']
      end
  end

  # Return a list of predicted ORFs with :minsize AA's. The ORF's
  # are between STOP codons (so sequences without a proper START codon
  # are included). Empty candidates are never returned, even when
  # minsize is 0.
  def stopstop(minsize = 30)
    type = "XX"
    orfs = []
    translate = Nucleotide::Translate.new(@trn_table)
    translate.aa_6_frames(@seq).each do |aa_frame|
      frame = aa_frame[:frame]
      # position (in amino acids) of the current candidate within the frame
      aa_start = 0
      aa_frame[:sequence].split(/\*/).each do |candidate|
        if !candidate.empty? && candidate.size >= minsize
          # orfs.size is the running 0-based ORF counter
          orfs.push ORF.new(orfs.size, type, @id, @descr, @seq, frame, aa_start * 3, candidate)
        end
        # skip the candidate plus the STOP that terminated it
        aa_start += candidate.size + 1
      end
    end
    orfs
  end

  # Return a list of predicted ORFs with :minsize AA's. The ORF's
  # are between START and STOP codons (ATG, TTG, CTG and AUG, UUG and CUG for
  # now, a later version should use the EMBOSS translation table).
  # Only stop-stop ORFs whose first three nucleotides are a start codon
  # are kept.
  def startstop(minsize = 30)
    stopstop(minsize).select do |orf|
      @startcodons.include?(orf.nt.seq[0..2].upcase)
    end
  end

  # Return the longest ORF that has a START codon (see +startstop+),
  # measured by the number of amino acids.
  # Returns nil if none is found.
  #
  # The previous implementation returned the first ORF found, which is
  # the longest only by coincidence.
  def longest_startstop(minsize = 0)
    startstop(minsize).max_by { |orf| orf.aa.seq.size }
  end

end
@@ -0,0 +1,52 @@
1
+
2
module Nucleotide

  module TranslationTable
  end

  # Translates nucleotide sequences into amino acid sequences per reading
  # frame by delegating to Bio::Big::TranslationAdapter.
  class Translate

    include Bio::Big::TranslationAdapter

    # Table can be either an id (integer) or a Biolib::Emboss TrnTable.
    # nil is treated as table id 0; a numeric id is resolved through
    # Bio::Big::TranslationAdapter.translation_table, anything else is
    # stored unchanged.
    def initialize(table)
      table = 0 if table == nil
      @trn_table =
        if table.kind_of?(Numeric)
          Bio::Big::TranslationAdapter.translation_table(table)
        else
          table
        end
    end

    # Return all six reading frames as an Array - ordered as frames
    # [1,2,3,-1,-2,-3] - of Hashes {:frame => frame, :sequence => AAsequence}.
    #
    # White space is removed from a copy; the nucleotide sequence that was
    # passed in does not get modified.
    def aa_6_frames(seq)
      translate_frames(seq, [1, 2, 3, -1, -2, -3])
    end

    # Return all forward reading frames as an Array - ordered as frames
    # [1,2,3] - of Hashes {:frame => frame, :sequence => AAsequence}.
    def aa_forward_frames(seq)
      translate_frames(seq, [1, 2, 3])
    end

    private

    # Strips white space from seq and translates it once per listed frame.
    def translate_frames(seq, frames)
      stripped = seq.gsub(/\s/, '')
      frames.map do |frame|
        protein = Bio::Big::TranslationAdapter.translate(@trn_table, frame, stripped)
        { :frame => frame, :sequence => protein }
      end
    end
  end
end
52
+
data/lib/bigbio.rb ADDED
@@ -0,0 +1,38 @@
1
+ # BigBio libraries
2
+
3
+ require 'bigbio/environment'
4
+
5
# Use a bio-logger checkout located next to this source tree when one is
# present; otherwise activate the installed bio-logger gem.
rootpath = File.dirname(File.dirname(__FILE__))
local_logger_lib = File.join(rootpath, '..', 'bioruby-logger', 'lib')
if File.directory?(local_logger_lib)
  $: << local_logger_lib
  $stderr.print("bio-logger loaded directly\n")
else
  require "rubygems"
  gem "bio-logger"
end
require 'bio-logger'

# Create the library logger and register it with the environment.
logger = Bio::Log::LoggerPlus.new('bigbio')
Bio::Big::Environment.instance.log = logger

# BioLib is optional: flag it when it loads, warn on stderr when it does not.
begin
  require 'biolib/emboss'
  Bio::Big::Environment.instance.biolib = true
rescue LoadError
  logger.outputters = Bio::Log::Outputter.stderr
  logger.warn("BioLib functionality not loaded")
end
27
+
28
+ require 'bio'
29
+ require 'bigbio/adapters/translate'
30
+ require 'bigbio/db/emitters/fasta_emitter'
31
+ require 'bigbio/db/emitters/orf_emitter'
32
+
33
+ autoload :FastaReader, 'bigbio/db/fasta'
34
+ autoload :FastaWriter, 'bigbio/db/fasta'
35
+ autoload :FastaPairedReader, 'bigbio/db/fasta'
36
+ autoload :FastaPairedWriter, 'bigbio/db/fasta'
37
+ autoload :BlastClust, 'bigbio/db/blast'
38
+ autoload :PredictORF, 'bigbio/sequence/predictorf'
@@ -0,0 +1,265 @@
1
+
2
+ require 'rspec'
3
+
4
+ $: << "../lib"
5
+
6
+ require 'bigbio'
7
+
8
describe Bio::Big::FastaEmitter, "when using the emitter" do
  include Bio::Big

  # Sequence expected for the record with index 95 in test/data/fasta/nt.fa
  record_95 = "ATCATTAGCAACACCAGCTTCCTCTCTCTCGCTTCAAAGTTCACTACTCGTGGATCTCGTCTTCAGTGTACAGTATCAAGGGCTCGATCTGCGGTGGATGAGACATCAGATTCAGGAGCTTTTCAAAGAACTGCATCGACATCCGTAACTTCGTTTCAAAAGATTCCAATTCTCAGTTTCAGCTGAATCTGGTAGATACCATCTTTACATATCGTATGCTTGTCATGGGCTTCTAGATGCCTTTCATACTTAAAGATCAAAGGACTTGACGATGCAATAAGCTTCTCGTCTGTAAAACCC"

  # With a part size of 10 the emitted parts are glued together until the
  # :tail part arrives, then compared and reset.
  it "should emit small parts" do
    collected = ""
    emitter = FastaEmitter.new("test/data/fasta/nt.fa", 10)
    emitter.emit_seq do |part, index, tag, seq|
      collected += seq
      if part == :tail
        collected.should == record_95 if index == 95
        collected = ""
      end
    end
  end

  # Without a part size the sequence of record 95 is compared as emitted.
  it "should emit large parts" do
    emitter = FastaEmitter.new("test/data/fasta/nt.fa")
    emitter.emit_seq do |part, index, tag, seq|
      seq.should == record_95 if index == 95
    end
  end
end
32
+
33
describe Bio::Big::ShortFrameState, "when using the ShortFrameState" do

  include Bio::Big

  # One frame, no minimum size: stop-stop ORFs first, then start-stop ORFs.
  it "should find an ORF" do
    state = ShortFrameState.new("atggattaaatgtaatggatttaatgtaaa", 0, 0)
    found = state.get_stopstop_orfs
    found.map(&:pos).should == [3, 5]
    found.map(&:to_seq).should == ["ATGTAA", "TGGATTTAA"]
    found = state.get_startstop_orfs
    found.map(&:pos).should == [0, 3]
    found.map(&:to_seq).should == ["ATGGATTAA", "ATGTAA"]
  end

  # Same sequence with a minimum size of 9.
  it "should handle min_size" do
    state = ShortFrameState.new("atggattaaatgtaatggatttaatgtaaa", 0, 9)
    found = state.get_stopstop_orfs
    found.map(&:to_seq).should == ["TGGATTTAA"]
    found.map(&:pos).should == [5]
    state.get_startstop_orfs.should == []
  end

  it "should find ORFs in" do
    state = ShortFrameState.new("atgttttaaatgtaatgttgttaaatgttttaaatgtaatgttgttaa", 0, 0)
    found = state.get_stopstop_orfs
    found.map(&:to_seq).should == ["ATGTAA", "TGTTGTTAA", "ATGTTTTAA", "ATGTAA", "TGTTGTTAA"]
    found.map(&:pos).should == [3, 5, 8, 11, 13]
    found = state.get_startstop_orfs
    found.map(&:to_seq).should == ["ATGTTTTAA", "ATGTAA", "ATGTTTTAA", "ATGTAA"]
    found.map(&:pos).should == [0, 3, 8, 11]
  end

  # Each forward frame is fed separately by dropping 0, 1 or 2 leading
  # nucleotides. The comments hold the EMBOSS getorf reference output.
  it "should match results of EMBOSS getorf" do
    s = "AG GTTCGNACGGTCATCGNATNAAGTCTTGNATATCG TAA TTNCGCGTGCCGCCTTCTTTCTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGTCAGTGTGTGTATTTTG GGG AAAACTTTG TGA GCAAAGAGCGAGAAAATGAGCGGANCGG TAA GAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATTGGGATGGTATGGC TAA GGTCCTTGTCACAGATGAGGCTCGTAGAG".gsub(/ /,'')
    # >_3 [3 - 167] #0
    # 1st GTTCGNACGGTCATCGNATNAAGTCTTGNATATCGTAATTNCGCGTGCCGCCTTCTTTCT
    # CCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGTCA
    # GTGTGTGTATTTTGGGGAAAACTTTGTGAGCAAAGAGCGAGAAAA
    # >_4 [171 - 179] #0
    # OK GCGGANCGG
    # >_5 [183 - 239] #0
    # OK GAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATTGGGATGGTATGGC
    # >_6 [243 - 257] #0
    # OK- GGTCCTTGTCACAGA
    # >_7 [261 - 266] #0
    # OK- GGCTCGTAG
    # >_8 [1 - 270] # 1
    # whole! AGGTTCGNACGGTCATCGNATNAAGTCTTGNATATCGTAATTNCGCGTGCCGCCTTCTTT
    # CTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGT
    # CAGTGTGTGTATTTTGGGGAAAACTTTGTGAGCAAAGAGCGAGAAAATGAGCGGANCGGT
    # AAGAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATTGGGATGGTATGGCT
    # AAGGTCCTTGTCACAGATGAGGCTCGTAGA
    # >_1 [2 - 37] #2
    # 1st- GGTTCGNACGGTCATCGNATNAAGTCTTGNATATCG
    # >_2 [41 - 148] #2
    # OK- TTNCGCGTGCCGCCTTCTTTCTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCT
    # TCTTCCTCTTCGATATTCGTCAGTGTGTGTATTTTGGGGAAAACTTTG
    # >_9 [152 - 271] #2
    # last- CAAAGAGCGAGAAAATGAGCGGANCGGTAAGAAAATCGCGGATGTGGCTTTCAAAGCTT
    # CAAGGACTATCGATTGGGATGGTATGGCTAAGGTCCTTGTCACAGATGAGGCTCGTAGAG

    # Frame 0
    minsize = 0
    state = ShortFrameState.new(s, 0, minsize)
    orfs = state.get_stopstop_orfs
    orfs.map(&:to_seq).should == []

    # Frame 1
    state = ShortFrameState.new(s[1..-1], 0, minsize)
    frame_orfs = state.get_stopstop_orfs
    frame_orfs.map(&:to_seq).should == ["TTNCGCGTGCCGCCTTCTTTCTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGTCAGTGTGTGTATTTTGGGGAAAACTTTGTGA"]
    orfs += frame_orfs
    # Frame 2
    state = ShortFrameState.new(s[2..-1], 0, minsize)
    frame_orfs = state.get_stopstop_orfs
    frame_orfs.map(&:to_seq).should == ["GCGGANCGGTAA", "GAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATTGGGATGGTATGGCTAA", "GGTCCTTGTCACAGATGA", "GGCTCGTAG"]
    orfs += frame_orfs
    orfs.size.should == 5

    # >_1 [235 - 270]
    # Last: ATGGCTAAGGTCCTTGTCACAGATGAGGCTCGTAGA
    # >_2 [167 - 271]
    # Last: ATGAGCGGANCGGTAAGAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATT
    # GGGATGGTATGGCTAAGGTCCTTGTCACAGATGAGGCTCGTAGAG

    # Frame 0
    minsize = 0
    state = ShortFrameState.new(s, 0, minsize)
    orfs = state.get_startstop_orfs

    # Frame 1
    state = ShortFrameState.new(s[1..-1], 0, minsize)
    orfs += state.get_startstop_orfs
    # Frame 2
    state = ShortFrameState.new(s[2..-1], 0, minsize)
    orfs += state.get_startstop_orfs
    orfs.map(&:to_seq).should == []
    orfs.size.should == 0
  end
end
131
+
132
# Specs for continuing a frame over two consecutive sequence chunks with
# FrameCodonHelpers::CreateShortFrame (create_right / create_left).
#
# The ORF counts taken from the first chunk are real expectations here;
# they used to be bare `orfs.size == N` comparisons that asserted nothing.
describe Bio::Big::ShortFrameState, "when combining frames" do
  include Bio::Big
  it "should combine a forward frame" do
    s1 = "atggattaaatgtaata"
    s2 = "atggatttaatgtaaa"
    fr = ShortFrameState.new s1,0,0
    fr.ntseq_pos.should == 0
    orfs = fr.get_stopstop_orfs
    orfs.size.should == 1 # in codons
    fr3 = FrameCodonHelpers::CreateShortFrame.create_right(fr,orfs,s2)
    fr3.ntseq_pos.should == 15
    fr3.codons.to_seq.should == "TAATGGATTTAATGTAAA"
    norfs = fr3.get_stopstop_orfs
    orfs += norfs
    orfs.map{ | orf | orf.to_seq }.should == ["ATGTAA", "TGGATTTAA"]
    orfs.map{ | orf | orf.track_ntseq_pos }.should == [9,18]
  end

  it "should combine a forward frame without ORFs in first seq" do
    s1 = "atggattaaatgta"
    # ......---===xx
    s2 = "atggatttaattattataaa"
    # x======xxx======xxx.
    fr = ShortFrameState.new s1,0,0
    fr.ntseq_pos.should == 0
    orfs = fr.get_stopstop_orfs
    orfs.size.should == 0 # in codons
    fr3 = FrameCodonHelpers::CreateShortFrame.create_right(fr,orfs,s2)
    fr3.ntseq_pos.should == 0
    fr3.codons.to_seq.should == "ATGGATTAAATGTAATGGATTTAATTATTATAA"
    norfs = fr3.get_stopstop_orfs
    orfs = norfs
    orfs.map{ | orf | orf.to_seq }.should == ["ATGTAA","TGGATTTAA","TTATTATAA"]
    orfs.map{ | orf | orf.track_ntseq_pos }.should == [9,9+6,9+6+9]
  end

  # Description made unique; it used to repeat the previous example's text.
  it "should combine a forward frame without ORFs in first seq, yielding two ORFs" do
    s1 = "atggattaaatgta"
    # ......---===xx
    s2 = "atggatttaatgtaaa"
    # x======xxx
    fr = ShortFrameState.new s1,0,0
    fr.ntseq_pos.should == 0
    orfs = fr.get_stopstop_orfs
    orfs.size.should == 0 # in codons
    fr3 = FrameCodonHelpers::CreateShortFrame.create_right(fr,orfs,s2)
    # p fr3
    fr3.ntseq_pos.should == 0 # on the combined sequences
    fr3.codons.to_seq.should == "ATGGATTAAATGTAATGGATTTAATGTAAA"
    norfs = fr3.get_stopstop_orfs
    orfs += norfs
    orfs.map{ | orf | orf.to_seq }.should == ["ATGTAA", "TGGATTTAA"]
    orfs.map{ | orf | orf.track_ntseq_pos }.should == [9,9+6]
  end

  it "should combine a reverse frame" do
    # Reversed (real locations on contig):
    #
    # | 3 21 B |
    # ttaaatgtaatttaggtaaatttat atgtaaattaggta (reversed)
    # ...^--============xxx^=======xxx
    # ^ ^
    # Actual feed:
    #
    # s2= s1=
    # 18 0 (ntseq_pos)
    # "atggattaaatgta" "tatttaaatggatttaatgtaaatt"
    # ......xxx===== ~===xx^============--^...
    # 0 1 2 3 0 1 2 3

    s2 = "tatttaaatggatttaatgtaaatt"
    # ~===xx^============--^...
    s1 = "atggattaaatgta"
    # ......xxx=====
    # now move the other way, as sequences get emitted on the left
    fr = ShortReversedFrameState.new s2,0,0
    # p fr
    fr.codons.to_seq.should == "ATTTAAATGGATTTAATGTAAATT"
    fr.ntseq_pos.should == 0
    orfs = fr.get_stopstop_orfs
    orfs.map{ | orf | orf.to_seq }.should == ["ATGGATTTAATGTAA"]
    orfs.first.pos.should == 2 # in codons
    fr3 = FrameCodonHelpers::CreateShortFrame.create_left(fr,orfs,s1)
    fr3.ntseq_pos.should == 18 # 6 codons
    fr3.codons.to_seq.should == "ATGGATTAAATGTATATTTAA"
    norfs = fr3.get_stopstop_orfs
    orfs += norfs
    orfs.map{ | orf | orf.to_seq }.should == ["ATGGATTTAATGTAA", "ATGTATATTTAA"]
    orfs.map{ | orf | orf.pos }.should == [2,3]
    orfs.map{ | orf | orf.track_ntseq_pos }.should == [6,18+9]
  end

end
225
+
226
describe Bio::Big::OrfEmitter, "when using the ORF emitter" do
  include Bio::Big

  # ORF expected at frame 0, position 39 of the first record (index 0)
  first_orf = "TTNCGCGTGCCGCCTTCTTTCTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGTCAGTGTGTGTATTTTGGGGAAAACTTTGTGA"

  # Only ORFs of the first record are looked at; emitting stops at the
  # first ORF that belongs to another record.
  it "should emit STOP-STOP ORFs in all frames" do
    source = FastaEmitter.new("test/data/fasta/nt.fa")
    collected = []
    OrfEmitter.new(source, :stopstop).emit_seq do |frame, index, tag, pos, seq|
      break unless index == 0
      # index is known to be 0 from here on
      seq.should == first_orf if frame == 0 && pos == 39
      collected.push(seq)
    end
    collected.join(';')[50..350].should == "TNCGCGTGCCGCCTTCTTTCTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGTCAGTGTGTGTATTTTGGGGAAAACTTTGTGA;GCAAAGAGCGAGAAAATGAGCGGANCGGTAAGAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATTGGGATGGTATGGCTAAGGTCCTTGTCACAGATGAGGCTCGTAGAGAGTTCTCTAACCTTCGTCGTGCTTTCGATGAGGTTAACACACAGCTCCAGACCAAATTTAGTCAGGACCT"
  end

  # Same walk with a FastaEmitter part size of 150. Only the frame 0 /
  # position 39 ORF is checked; the comparison of the joined ORFs
  # ([50..350]) made in the previous example is disabled for this one.
  it "should emit STOP-STOP ORFs in all frames using a shorter emitter" do
    source = FastaEmitter.new("test/data/fasta/nt.fa", 150)
    collected = []
    OrfEmitter.new(source, :stopstop).emit_seq do |frame, index, tag, pos, seq|
      break unless index == 0
      seq.should == first_orf if frame == 0 && pos == 39
      collected.push(seq)
    end
  end

  # Examples that are not written yet; kept switched off.
  if false
    it "should emit START-STOP ORFs in all frames"
    it "should emit ORFs on any filter"
    it "should emit ORFs using a minimum size"
    it "should emit ORFs with adjoining sequences"
  end
end
@@ -0,0 +1,199 @@
1
+
2
+ $: << "../lib"
3
+ ENV['DATA'] = '../test/data/EMBOSS'
4
+
5
+ require 'bigbio'
6
+
7
+ # Note that PredictORF, at this point, leaves trailing X's for the AA sequence
8
+
9
# Specs for PredictORF#stopstop on a 12 nt sequence. The expectations
# index into one ORF list that is built once in before :all.
describe PredictORF, " when using a short simple nucleotide sequence" do
  before :all do
    # initialize
    id = 'test'
    descr = 'Test'
    # sequence = 'AGCTGAATCTGGTAGATACCATCTTTAA'
    sequence = 'AGCTGAATCTGG'
    # trn_table = Biolib::Emboss.ajTrnNewI(1)
    trn_table = Bio::Big::TranslationAdapter.translation_table(1)

    @predictorf = PredictORF.new(id,descr,sequence,trn_table)
    @orflist = @predictorf.stopstop(0)
    # @orflist.each do | orf | p [orf.descr,orf] end
  end

  it "stopstop(0) should render six reading frames and seven ORF" do
    # >EMBOSS_001_1
    # S*IW
    # >EMBOSS_001_2
    # AESG
    # >EMBOSS_001_3
    # LNLX
    # >EMBOSS_001_4
    # PDSA
    # >EMBOSS_001_5
    # RFSX
    # >EMBOSS_001_6
    # QIQL
    @orflist[0].aa.seq.should == "S"
    @orflist[3].aa.seq[0..2].should == "LNL"
    @orflist[4].aa.seq[0..2].should == "PDS"
    @orflist.size.should == 7
  end

  # frame +1 - 4 codons S*IW; the part after the STOP is the second ORF
  it "should give a second valid +1 frame" do
    orf = @orflist[1]
    orf.frame.should == 1
    orf.nt.start.should == 6
    orf.aa.seq.should == "IW"
    orf.nt.seq.should == "ATCTGG"
  end

  # frame +1 - 4 codons S*IW; the part before the STOP is the first ORF
  it "should give a first valid +1 frame" do
    orf = @orflist[0]
    orf.frame.should equal 1
    orf.nt.start.should equal 0
    orf.aa.seq.should == "S"
    orf.nt.seq.should == "AGC"
  end

  # frame +2 - 3 codons AES
  it "should give a valid +2 frame" do
    orf = @orflist[2]
    # pp @orflist
    # pp orf
    orf.nt.seq[0..8].should == "GCTGAATCT"
    orf.frame.should == 2
    orf.nt.start.should == 1
    # orf.nt.stop.should == 12 - EMBOS differs
    orf.aa.seq[0..2].should == "AES"
  end

  # frame +3 - 3 codons LNL
  it "should give a valid +3 frame" do
    orf = @orflist[3]
    orf.frame.should == 3
    orf.nt.start.should == 2
    # orf.nt.stop.should == 12
    # Slices cover exactly the 3 amino acids / 9 nucleotides compared, as
    # in the other frame examples, so a trailing X cannot break the match.
    orf.aa.seq[0..2].should == "LNL"
    orf.nt.seq[0..8].should == "CTGAATCTG"
  end

  # frame -1 - 4 codons PDSA
  it "should give a valid -1 frame" do
    orf = @orflist[4]
    orf.frame.should == -1
    orf.nt.start.should == 0
    orf.nt.stop.should == 12
    orf.aa.seq.should == "PDSA"
    orf.nt.seq.should == "CCAGATTCAGCT"
  end

  # frame -2 - 3 codons RFSX
  it "should give a valid -2 frame" do
    orf = @orflist[5]
    orf.frame.should == -2
    orf.nt.start.should == 1
    # orf.nt.stop.should == 12
    orf.aa.seq[0..2].should == "RFS"
    orf.nt.seq[0..8].should == "CAGATTCAG"
  end

  # frame -3 - 3 codons QIQL
  it "should give a valid -3 frame" do
    orf = @orflist[6]
    orf.frame.should == -3
    orf.nt.start.should == 2
    # orf.nt.stop.should == 12
    orf.aa.seq[0..2].should == "QIQ"
    orf.nt.seq[0..8].should == "AGATTCAGC"
  end
end
113
+
114
# Specs for PredictORF on a 300 nt Arabidopsis sequence. Descriptions
# state the values that the examples actually assert.
describe PredictORF, " when using a more complicated nucleotide sequence" do
  before :all do
    # initialize
    id = "PUT-157a-Arabidopsis_thaliana-126"
    descr = "PlantGDB Arabidopsis_thaliana Jan_15_2007"
    # The line breaks inside the literal are white space, which PredictORF
    # removes from the sequence.
    sequence = "ATCATTAGCAACACCAGCTTCCTCTCTCTCGCTTCAAAGTTCACTACTCGTGGATCTCGT
CTTCAGTGTACAGTATCAAGGGCTCGATCTGCGGTGGATGAGACATCAGATTCAGGAGCT
TTTCAAAGAACTGCATCGACATCCGTAACTTCGTTTCAAAAGATTCCAATTCTCAGTTTC
AGCTGAATCTGGTAGATACCATCTTTACATATCGTATGCTTGTCATGGGCTTCTAGATGC
CTTTCATACTTAAAGATCAAAGGACTTGACGATGCAATAAGCTTCTCGTCTGTAAAACCC"
    # @trn_table = Biolib::Emboss.ajTrnNewI(1)
    @trn_table = Bio::Big::TranslationAdapter.translation_table(1)
    @predictorf = PredictORF.new(id,descr,sequence,@trn_table)
    @orflist = @predictorf.stopstop(0)
    # @orflist.each_with_index do | orf,i | p [i,orf.descr,orf.aa.seq,orf.nt.seq] end
    # >EMBOSS_001_1
    # IISNTSFLSLASKFTTRGSRLQCTVSRARSAVDETSDSGAFQRTASTSVTSFQKIPILSF
    # S*IW*IPSLHIVCLSWASRCLSYLKIKGLDDAISFSSVKP
    # >EMBOSS_001_2
    # SLATPASSLSLQSSLLVDLVFSVQYQGLDLRWMRHQIQELFKELHRHP*LRFKRFQFSVS
    # AESGRYHLYISYACHGLLDAFHT*RSKDLTMQ*ASRL*N
    # >EMBOSS_001_3
    # H*QHQLPLSRFKVHYSWISSSVYSIKGSICGG*DIRFRSFSKNCIDIRNFVSKDSNSQFQ
    # LNLVDTIFTYRMLVMGF*MPFILKDQRT*RCNKLLVCKT
    # >EMBOSS_001_4
    # GFYRREAYCIVKSFDL*V*KASRSP*QAYDM*RWYLPDSAETENWNLLKRSYGCRCSSLK
    # SS*I*CLIHRRSSP*YCTLKTRSTSSEL*SEREEAGVAND
    # >EMBOSS_001_5
    # VLQTRSLLHRQVL*SLSMKGI*KPMTSIRYVKMVSTRFS*N*ELESFETKLRMSMQFFEK
    # LLNLMSHPPQIEPLILYTEDEIHE**TLKRERGSWCC**X
    # >EMBOSS_001_6
    # GFTDEKLIASSSPLIFKYERHLEAHDKHTICKDGIYQIQLKLRIGIF*NEVTDVDAVL*K
    # APESDVSSTADRALDTVH*RRDPRVVNFEARERKLVLLMX
  end
  it "stopstop(0) should render ORFs" do
    @orflist[0].aa.seq[0..3].should == "IISN"
    @orflist[13].aa.seq[0..3].should == "GFYR"
    @orflist[22].aa.seq[0..3].should == "VLQT"
  end
  it "stopstop(0) should render 32 ORFs" do
    @orflist.size.should == 32
  end
  it "startstop(5) should render ORFs starting with a start codon" do
    orflist = @predictorf.startstop(5)
    # orflist.each do | orf | p [orf.descr,orf] end
    orflist[0].aa.seq.should == "MPFILKDQRT"
    orflist.size.should == 1
  end
  it "should never return an empty sequence" do
    orflist = @predictorf.stopstop(0)
    orflist.each do | orf |
      # strictly positive: a size of 0 is exactly the empty sequence that
      # must not be returned
      orf.nt.seq.size.should > 0
    end
  end

  it "should return 4 sequences when the minsize is 44 AA (132 nt)" do
    orflist = @predictorf.stopstop(44)
    orflist.size.should == 4
  end

  it "should return 3 sequences when the minsize is 45 AA (135 nt)" do
    orflist = @predictorf.stopstop(45)
    orflist.size.should == 3
  end

  # NOTE(review): this example checks the same ORF (index 2) as the STOP
  # codon example below and never looks at a frame; the -1 frame from the
  # EMBOSS output quoted here is not asserted - confirm the intended ORF.
  it "should have -1 frame" do
    sequence = "ATTAGCAACACCAGCTTCCTCTCTCTCGCTTCAAAGTTCACTACTCGTGGATCTCGT"
    # >EMBOSS_001_4
    # TRSTSSEL*SEREEAGVAN
    predictorf = PredictORF.new('test','TEST',sequence,@trn_table)
    orflist = predictorf.stopstop(0)
    # orflist.each_with_index do | orf,i | p [i,orf.descr,orf.aa.seq,orf.nt.seq] end
    orflist[2].aa.seq[0..18].should == "QHQLPLSRFKVHYSWIS"
  end

  it "should correctly handle a sequence starting with a STOP codon" do
    sequence = "ATTAGCAACACCAGCTTCCTCTCTCTCGCTTCAAAGTTCACTACTCGTGGATCTCGT"
    # >EMBOSS_001_3
    # *QHQLPLSRFKVHYSWIS
    predictorf = PredictORF.new('test','TEST',sequence,@trn_table)
    orflist = predictorf.stopstop(0)
    # orflist.each_with_index do | orf,i | p [i,orf.descr,orf.aa.seq,orf.nt.seq] end
    orflist[2].aa.seq[0..18].should == "QHQLPLSRFKVHYSWIS"
  end

end