bio-bigbio 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +15 -0
- data/Gemfile.lock +34 -0
- data/LICENSE +34 -0
- data/README.rdoc +28 -0
- data/Rakefile +50 -0
- data/VERSION +1 -0
- data/bin/getorf +118 -0
- data/bin/nt2aa.rb +56 -0
- data/bio-bigbio.gemspec +102 -0
- data/doc/bigbio_getorf.wtex +14 -0
- data/lib/bigbio/adapters/translate.rb +64 -0
- data/lib/bigbio/db/blast/blastclust.rb +16 -0
- data/lib/bigbio/db/blast.rb +2 -0
- data/lib/bigbio/db/emitters/fasta_emitter.rb +48 -0
- data/lib/bigbio/db/emitters/orf_emitter.rb +289 -0
- data/lib/bigbio/db/fasta/fastaindex.rb +3 -0
- data/lib/bigbio/db/fasta/fastapairedreader.rb +19 -0
- data/lib/bigbio/db/fasta/fastapairedwriter.rb +21 -0
- data/lib/bigbio/db/fasta/fastareader.rb +132 -0
- data/lib/bigbio/db/fasta/fastarecord.rb +39 -0
- data/lib/bigbio/db/fasta/fastawriter.rb +20 -0
- data/lib/bigbio/db/fasta/indexer.rb +33 -0
- data/lib/bigbio/db/fasta.rb +13 -0
- data/lib/bigbio/environment.rb +12 -0
- data/lib/bigbio/sequence/predictorf.rb +140 -0
- data/lib/bigbio/sequence/translate.rb +52 -0
- data/lib/bigbio.rb +38 -0
- data/spec/emitter_spec.rb +265 -0
- data/spec/predictorf_spec.rb +199 -0
- data/test/data/EMBOSS/EGC.1 +32 -0
- data/test/data/fasta/nt.fa +1000 -0
- data/test/doctest/test_fasta.rb +112 -0
- data/test/doctest/test_frames.rb +76 -0
- data/test/doctest/test_getorf.rb +154 -0
- data/test/doctest/test_paired.rb +55 -0
- data/test/performance/translate_with_biolib.rb +67 -0
- data/test/performance/translate_with_bioruby.rb +64 -0
- metadata +163 -0
@@ -0,0 +1,140 @@
|
|
1
|
+
# ORF predictor class
|
2
|
+
#
|
3
|
+
|
4
|
+
require 'bigbio/sequence/translate'
|
5
|
+
|
6
|
+
# Base class for ORF sequence containers: wraps one sequence value and
# exposes it through the +seq+ accessor.
class ORFsequence
  attr_accessor :seq

  # sequence:: the sequence to store
  def initialize(sequence)
    @seq = sequence
  end
end

# Helper class for storing ORF information: keeps the full nucleotide
# sequence together with the +start+ (inclusive) and +stop+ (exclusive)
# offsets of the ORF inside it.
class ORFnucleotides < ORFsequence
  attr_reader :start, :stop

  # sequence:: the full nucleotide sequence the ORF lives in
  # start::    offset of the first ORF nucleotide
  # stop::     offset one past the last ORF nucleotide
  def initialize(sequence, start, stop)
    super(sequence)
    @start = start
    @stop = stop
  end

  # Return only the ORF part of the stored sequence.
  #
  # An exclusive range is used on purpose: the inclusive form
  # <tt>start..stop-1</tt> turns into <tt>0..-1</tt> (the whole sequence)
  # when +stop+ is 0.
  def seq
    @seq[@start...@stop]
  end

  # Return the complete sequence that was passed to the constructor.
  def fullseq
    @seq
  end

  # String form is the ORF part of the sequence (see +seq+).
  def to_s
    seq
  end
end

# Helper class for storing ORF information: the amino acid sequence of an
# ORF. Adds nothing to ORFsequence.
class ORFaminoacids < ORFsequence
end
|
39
|
+
|
40
|
+
# One predicted open reading frame: an id, a description line, the
# reading frame and both the nucleotide (+nt+) and amino acid (+aa+)
# representation.
class ORF
  attr_reader :id, :descr, :nt, :aa, :frame

  # num::   zero-based counter; the id suffix is num + 1
  # type::  label placed at the front of the description
  # id::    identifier of the source sequence
  # descr:: description of the source sequence; nil is treated as empty
  # nt::    full nucleotide sequence the ORF was found in
  # frame:: reading frame; a negative frame stores the reverse complement
  # start:: nucleotide offset of the ORF before the frame adjustment
  # aa::    amino acid sequence of the ORF
  def initialize(num, type, id, descr, nt, frame, start, aa)
    @id = "#{id}_#{num + 1}"
    # ---- adjust start to match frame
    start += frame.abs - 1
    # ---- stop should not go beyond sequence
    stop = [start + aa.size * 3, nt.size].min
    # ---- if frame < 0 it should reverse complement
    nt = Bio::Sequence::NA.new(nt).reverse_complement.to_s.upcase if frame < 0
    fr = frame > 0 ? "+#{frame}" : frame.to_s
    # Interpolation (rather than String#+) keeps a nil descr from raising
    @descr = "[#{type} #{fr} #{start} - #{stop}; #{stop - start}/#{nt.size}] #{descr}"
    @nt = ORFnucleotides.new(nt, start, stop)
    @frame = frame
    @aa = ORFaminoacids.new(aa)
  end

  # Order by frame first; ORFs in the same frame are ordered by their
  # nucleotide sequence.
  def <=>(other)
    return frame <=> other.frame unless frame == other.frame

    nt.seq <=> other.nt.seq
  end

  # Build a FastaPairedRecord holding the nucleotide and amino acid
  # FastaRecord of this ORF (both carry the same id and description).
  def to_fastarec
    aa_rec = FastaRecord.new(@id, @descr, @aa.seq)
    nt_rec = FastaRecord.new(@id, @descr, @nt.seq)
    FastaPairedRecord.new(nt_rec, aa_rec)
  end
end
|
79
|
+
|
80
|
+
# Predicts ORFs in a nucleotide sequence by translating it in all six
# reading frames and splitting the translations on STOP ('*').
class PredictORF

  include Bio::Big::FrameCodonHelpers

  # id::        identifier used as prefix for the ORF ids
  # descr::     description copied into every ORF description
  # seq::       nucleotide sequence; all white space is removed
  # trn_table:: translation table (id or table object) handed on to
  #             Nucleotide::Translate; nil or 0 selects START_CODONS
  def initialize(id, descr, seq, trn_table)
    @id = id
    @descr = descr
    @seq = seq.gsub(/\s/, '')
    @trn_table = trn_table
    @startcodons = # FIXME: this should be linked properly
      if trn_table.nil? || trn_table == 0
        START_CODONS # presumably supplied by FrameCodonHelpers - confirm
      else # prokaryote
        ['ATG', 'TTG', 'CTG', 'AUG', 'UUG', 'CUG']
      end
  end

  # Return a list of predicted ORFs with :minsize AA's. The ORF's
  # are between STOP codons (so sequences without a proper START codon
  # are included)
  def stopstop(minsize = 30)
    type = "XX"
    orfs = []
    translate = Nucleotide::Translate.new(@trn_table)
    translate.aa_6_frames(@seq).each do |aa_frame|
      frame = aa_frame[:frame]
      aa_start = 0 # AA offset of the current candidate within this frame
      aa_frame[:sequence].split(/\*/).each do |candidate|
        if candidate.size >= minsize && candidate.size > 0
          # orfs.size doubles as the running ORF number across all frames
          orfs.push ORF.new(orfs.size, type, @id, @descr, @seq, frame, aa_start * 3, candidate)
        end
        aa_start += candidate.size + 1 # +1 skips the STOP that ended it
      end
    end
    orfs
  end

  # Return a list of predicted ORFs with :minsize AA's. The ORF's
  # are between START and STOP codons (ATG, TTG, CTG and AUG, UUG and CUG for
  # now, a later version should use the EMBOSS translation table).
  def startstop(minsize = 30)
    stopstop(minsize).select do |orf|
      @startcodons.include?(orf.nt.seq[0..2].upcase)
    end
  end

  # Return the longest ORF (measured in amino acids) that has a START
  # codon (see +startstop+). Returns nil if none is found
  def longest_startstop(minsize = 0)
    startstop(minsize).max_by { |orf| orf.aa.seq.size }
  end

end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
|
2
|
+
module Nucleotide

  module TranslationTable
  end

  # Translates nucleotide sequences to amino acids through
  # Bio::Big::TranslationAdapter, one reading frame at a time.
  class Translate

    include Bio::Big::TranslationAdapter

    # Table can be either an id (integer) or a Biolib::Emboss TrnTable.
    # nil is treated as table id 0; a numeric id is resolved through
    # TranslationAdapter.translation_table, anything else is used as is.
    def initialize(table)
      table = 0 if table.nil?
      @trn_table =
        if table.kind_of? Numeric
          Bio::Big::TranslationAdapter.translation_table(table)
        else
          table
        end
    end

    # Return all six reading frames as an Array - ordered as
    # frames [1,2,3,-1,-2,-3] - of hashes {:frame => frame, :sequence => AAsequence}.
    #
    # Note that the nucleotide sequence does not get modified.
    def aa_6_frames(seq)
      translate_in_frames(seq, [1, 2, 3, -1, -2, -3])
    end

    # Return all forward reading frames as an Array - ordered as
    # frames [1,2,3] - of hashes {:frame => frame, :sequence => AAsequence}
    def aa_forward_frames(seq)
      translate_in_frames(seq, [1, 2, 3])
    end

    private

    # Translate +seq+ (white space removed on a copy) in each of +frames+
    # and return one {:frame, :sequence} hash per frame, in the given order.
    def translate_in_frames(seq, frames)
      # remove white space
      seq = seq.gsub(/\s/, '')
      frames.map do |frame|
        aa = Bio::Big::TranslationAdapter.translate(@trn_table, frame, seq)
        { :frame => frame, :sequence => aa }
      end
    end
  end
end
|
52
|
+
|
data/lib/bigbio.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# BigBio libraries: sets up logging, detects BioLib and registers the
# lazily loaded BigBio classes.

require 'bigbio/environment'

# Use a bio-logger checkout located next to this project when one exists,
# otherwise activate the installed gem.
rootpath = File.dirname(File.dirname(__FILE__))
bio_logger_path = File.join(rootpath, '..', 'bioruby-logger', 'lib')
if File.directory?(bio_logger_path)
  $LOAD_PATH << bio_logger_path
  $stderr.print "bio-logger loaded directly\n"
else
  require "rubygems"
  gem "bio-logger"
end
require 'bio-logger'

log = Bio::Log::LoggerPlus.new('bigbio')
Bio::Big::Environment.instance.log = log

# BioLib is optional: record that it is available, or warn on stderr when
# it cannot be loaded.
begin
  require 'biolib/emboss'
  Bio::Big::Environment.instance.biolib = true
rescue LoadError
  log.outputters = Bio::Log::Outputter.stderr
  log.warn "BioLib functionality not loaded"
end

require 'bio'
require 'bigbio/adapters/translate'
require 'bigbio/db/emitters/fasta_emitter'
require 'bigbio/db/emitters/orf_emitter'

# The remaining classes are only loaded on first use.
%w[FastaReader FastaWriter FastaPairedReader FastaPairedWriter].each do |fasta_class|
  autoload fasta_class.to_sym, 'bigbio/db/fasta'
end
autoload :BlastClust, 'bigbio/db/blast'
autoload :PredictORF, 'bigbio/sequence/predictorf'
|
@@ -0,0 +1,265 @@
|
|
1
|
+
|
2
|
+
require 'rspec'
|
3
|
+
|
4
|
+
$: << "../lib"
|
5
|
+
|
6
|
+
require 'bigbio'
|
7
|
+
|
8
|
+
describe Bio::Big::FastaEmitter, "when using the emitter" do
|
9
|
+
include Bio::Big
|
10
|
+
|
11
|
+
it "should emit small parts" do
|
12
|
+
s = ""
|
13
|
+
FastaEmitter.new("test/data/fasta/nt.fa",10).emit_seq do | part, index, tag, seq |
|
14
|
+
# p [index, part, tag, seq]
|
15
|
+
s += seq
|
16
|
+
if index == 95 and part == :tail
|
17
|
+
s.should == "ATCATTAGCAACACCAGCTTCCTCTCTCTCGCTTCAAAGTTCACTACTCGTGGATCTCGTCTTCAGTGTACAGTATCAAGGGCTCGATCTGCGGTGGATGAGACATCAGATTCAGGAGCTTTTCAAAGAACTGCATCGACATCCGTAACTTCGTTTCAAAAGATTCCAATTCTCAGTTTCAGCTGAATCTGGTAGATACCATCTTTACATATCGTATGCTTGTCATGGGCTTCTAGATGCCTTTCATACTTAAAGATCAAAGGACTTGACGATGCAATAAGCTTCTCGTCTGTAAAACCC"
|
18
|
+
end
|
19
|
+
s = "" if part == :tail
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
it "should emit large parts" do
|
24
|
+
FastaEmitter.new("test/data/fasta/nt.fa").emit_seq do | part, index, tag, seq |
|
25
|
+
# p [index, part, tag, seq]
|
26
|
+
if index == 95
|
27
|
+
seq.should == "ATCATTAGCAACACCAGCTTCCTCTCTCTCGCTTCAAAGTTCACTACTCGTGGATCTCGTCTTCAGTGTACAGTATCAAGGGCTCGATCTGCGGTGGATGAGACATCAGATTCAGGAGCTTTTCAAAGAACTGCATCGACATCCGTAACTTCGTTTCAAAAGATTCCAATTCTCAGTTTCAGCTGAATCTGGTAGATACCATCTTTACATATCGTATGCTTGTCATGGGCTTCTAGATGCCTTTCATACTTAAAGATCAAAGGACTTGACGATGCAATAAGCTTCTCGTCTGTAAAACCC"
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
describe Bio::Big::ShortFrameState, "when using the ShortFrameState" do
|
34
|
+
|
35
|
+
include Bio::Big
|
36
|
+
|
37
|
+
it "should find an ORF" do
|
38
|
+
fr = ShortFrameState.new "atggattaaatgtaatggatttaatgtaaa",0,0
|
39
|
+
orfs = fr.get_stopstop_orfs
|
40
|
+
orfs.map{ | orf | orf.pos }.should == [ 3, 5 ]
|
41
|
+
orfs.map{ | orf | orf.to_seq }.should == ["ATGTAA", "TGGATTTAA"]
|
42
|
+
orfs = fr.get_startstop_orfs
|
43
|
+
orfs.map{ | orf | orf.pos }.should == [ 0, 3 ]
|
44
|
+
orfs.map{ | orf | orf.to_seq }.should == ["ATGGATTAA","ATGTAA"]
|
45
|
+
end
|
46
|
+
it "should handle min_size" do
|
47
|
+
fr = ShortFrameState.new "atggattaaatgtaatggatttaatgtaaa",0,9
|
48
|
+
orfs = fr.get_stopstop_orfs
|
49
|
+
orfs.map{ | orf | orf.to_seq }.should == [ "TGGATTTAA"]
|
50
|
+
orfs.map{ | orf | orf.pos }.should == [ 5 ]
|
51
|
+
fr.get_startstop_orfs.should == []
|
52
|
+
end
|
53
|
+
it "should find ORFs in" do
|
54
|
+
fr = ShortFrameState.new "atgttttaaatgtaatgttgttaaatgttttaaatgtaatgttgttaa",0,0
|
55
|
+
orfs = fr.get_stopstop_orfs
|
56
|
+
orfs.map{ | orf | orf.to_seq }.should == ["ATGTAA", "TGTTGTTAA", "ATGTTTTAA", "ATGTAA", "TGTTGTTAA"]
|
57
|
+
orfs.map{ | orf | orf.pos }.should == [3, 5, 8, 11, 13]
|
58
|
+
orfs = fr.get_startstop_orfs
|
59
|
+
orfs.map{ | orf | orf.to_seq }.should == ["ATGTTTTAA", "ATGTAA", "ATGTTTTAA", "ATGTAA"]
|
60
|
+
orfs.map{ | orf | orf.pos }.should == [ 0, 3, 8, 11]
|
61
|
+
end
|
62
|
+
|
63
|
+
it "should match results of EMBOSS getorf" do
|
64
|
+
s = "AG GTTCGNACGGTCATCGNATNAAGTCTTGNATATCG TAA TTNCGCGTGCCGCCTTCTTTCTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGTCAGTGTGTGTATTTTG GGG AAAACTTTG TGA GCAAAGAGCGAGAAAATGAGCGGANCGG TAA GAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATTGGGATGGTATGGC TAA GGTCCTTGTCACAGATGAGGCTCGTAGAG".gsub(/ /,'')
|
65
|
+
# >_3 [3 - 167] #0
|
66
|
+
# 1st GTTCGNACGGTCATCGNATNAAGTCTTGNATATCGTAATTNCGCGTGCCGCCTTCTTTCT
|
67
|
+
# CCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGTCA
|
68
|
+
# GTGTGTGTATTTTGGGGAAAACTTTGTGAGCAAAGAGCGAGAAAA
|
69
|
+
# >_4 [171 - 179] #0
|
70
|
+
# OK GCGGANCGG
|
71
|
+
# >_5 [183 - 239] #0
|
72
|
+
# OK GAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATTGGGATGGTATGGC
|
73
|
+
# >_6 [243 - 257] #0
|
74
|
+
# OK- GGTCCTTGTCACAGA
|
75
|
+
# >_7 [261 - 266] #0
|
76
|
+
# OK- GGCTCGTAG
|
77
|
+
# >_8 [1 - 270] # 1
|
78
|
+
# whole! AGGTTCGNACGGTCATCGNATNAAGTCTTGNATATCGTAATTNCGCGTGCCGCCTTCTTT
|
79
|
+
# CTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGT
|
80
|
+
# CAGTGTGTGTATTTTGGGGAAAACTTTGTGAGCAAAGAGCGAGAAAATGAGCGGANCGGT
|
81
|
+
# AAGAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATTGGGATGGTATGGCT
|
82
|
+
# AAGGTCCTTGTCACAGATGAGGCTCGTAGA
|
83
|
+
# >_1 [2 - 37] #2
|
84
|
+
# 1st- GGTTCGNACGGTCATCGNATNAAGTCTTGNATATCG
|
85
|
+
# >_2 [41 - 148] #2
|
86
|
+
# OK- TTNCGCGTGCCGCCTTCTTTCTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCT
|
87
|
+
# TCTTCCTCTTCGATATTCGTCAGTGTGTGTATTTTGGGGAAAACTTTG
|
88
|
+
# >_9 [152 - 271] #2
|
89
|
+
# last- CAAAGAGCGAGAAAATGAGCGGANCGGTAAGAAAATCGCGGATGTGGCTTTCAAAGCTT
|
90
|
+
# CAAGGACTATCGATTGGGATGGTATGGCTAAGGTCCTTGTCACAGATGAGGCTCGTAGAG
|
91
|
+
|
92
|
+
# Frame 0
|
93
|
+
minsize = 0
|
94
|
+
fr = ShortFrameState.new s,0,minsize
|
95
|
+
orfs = fr.get_stopstop_orfs
|
96
|
+
orfs.map{ | orf | orf.to_seq }.should == []
|
97
|
+
|
98
|
+
# Frame 1
|
99
|
+
fr = ShortFrameState.new s[1..-1],0,minsize
|
100
|
+
os = fr.get_stopstop_orfs
|
101
|
+
os.map{ | orf | orf.to_seq }.should == ["TTNCGCGTGCCGCCTTCTTTCTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGTCAGTGTGTGTATTTTGGGGAAAACTTTGTGA"]
|
102
|
+
orfs += os
|
103
|
+
# Frame 2
|
104
|
+
fr = ShortFrameState.new s[2..-1],0,minsize
|
105
|
+
os = fr.get_stopstop_orfs
|
106
|
+
os.map{ | orf | orf.to_seq }.should == ["GCGGANCGGTAA", "GAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATTGGGATGGTATGGCTAA", "GGTCCTTGTCACAGATGA", "GGCTCGTAG"]
|
107
|
+
orfs += os
|
108
|
+
orfs.size.should == 5
|
109
|
+
|
110
|
+
# >_1 [235 - 270]
|
111
|
+
# Last: ATGGCTAAGGTCCTTGTCACAGATGAGGCTCGTAGA
|
112
|
+
# >_2 [167 - 271]
|
113
|
+
# Last: ATGAGCGGANCGGTAAGAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATT
|
114
|
+
# GGGATGGTATGGCTAAGGTCCTTGTCACAGATGAGGCTCGTAGAG
|
115
|
+
|
116
|
+
# Frame 0
|
117
|
+
minsize = 0
|
118
|
+
fr = ShortFrameState.new s,0,minsize
|
119
|
+
orfs = fr.get_startstop_orfs
|
120
|
+
|
121
|
+
# Frame 1
|
122
|
+
fr = ShortFrameState.new s[1..-1],0,minsize
|
123
|
+
orfs += fr.get_startstop_orfs
|
124
|
+
# Frame 2
|
125
|
+
fr = ShortFrameState.new s[2..-1],0,minsize
|
126
|
+
orfs += fr.get_startstop_orfs
|
127
|
+
orfs.map{ | orf | orf.to_seq }.should == []
|
128
|
+
orfs.size.should == 0
|
129
|
+
end
|
130
|
+
end
|
131
|
+
|
132
|
+
describe Bio::Big::ShortFrameState, "when combining frames" do
|
133
|
+
include Bio::Big
|
134
|
+
it "should combine a forward frame" do
  s1 = "atggattaaatgtaata"
  s2 = "atggatttaatgtaaa"
  fr = ShortFrameState.new s1,0,0
  fr.ntseq_pos.should == 0
  orfs = fr.get_stopstop_orfs
  # was a bare `orfs.size == 1`, which compares but never asserts
  orfs.size.should == 1 # in codons
  # feed the next chunk on the right and continue from the carried-over state
  fr3 = FrameCodonHelpers::CreateShortFrame.create_right(fr,orfs,s2)
  fr3.ntseq_pos.should == 15
  fr3.codons.to_seq.should == "TAATGGATTTAATGTAAA"
  norfs = fr3.get_stopstop_orfs
  orfs += norfs
  orfs.map{ | orf | orf.to_seq }.should == ["ATGTAA", "TGGATTTAA"]
  orfs.map{ | orf | orf.track_ntseq_pos }.should == [9,18]
end
|
149
|
+
|
150
|
+
it "should combine a forward frame without ORFs in first seq" do
  s1 = "atggattaaatgta"
  #     ......---===xx
  s2 = "atggatttaattattataaa"
  #     x======xxx======xxx.
  fr = ShortFrameState.new s1,0,0
  fr.ntseq_pos.should == 0
  orfs = fr.get_stopstop_orfs
  # was a bare `orfs.size == 0`, which compares but never asserts
  orfs.size.should == 0 # in codons
  fr3 = FrameCodonHelpers::CreateShortFrame.create_right(fr,orfs,s2)
  fr3.ntseq_pos.should == 0
  fr3.codons.to_seq.should == "ATGGATTAAATGTAATGGATTTAATTATTATAA"
  norfs = fr3.get_stopstop_orfs
  orfs = norfs
  orfs.map{ | orf | orf.to_seq }.should == ["ATGTAA","TGGATTTAA","TTATTATAA"]
  orfs.map{ | orf | orf.track_ntseq_pos }.should == [9,9+6,9+6+9]
end
|
167
|
+
|
168
|
+
it "should combine a forward frame without ORFs in first seq" do
  s1 = "atggattaaatgta"
  #     ......---===xx
  s2 = "atggatttaatgtaaa"
  #     x======xxx
  fr = ShortFrameState.new s1,0,0
  fr.ntseq_pos.should == 0
  orfs = fr.get_stopstop_orfs
  # was a bare `orfs.size == 0`, which compares but never asserts
  orfs.size.should == 0 # in codons
  fr3 = FrameCodonHelpers::CreateShortFrame.create_right(fr,orfs,s2)
  # p fr3
  fr3.ntseq_pos.should == 0 # on the combined sequences
  fr3.codons.to_seq.should == "ATGGATTAAATGTAATGGATTTAATGTAAA"
  norfs = fr3.get_stopstop_orfs
  orfs += norfs
  orfs.map{ | orf | orf.to_seq }.should == ["ATGTAA", "TGGATTTAA"]
  orfs.map{ | orf | orf.track_ntseq_pos }.should == [9,9+6]
end
|
186
|
+
|
187
|
+
it "should combine a reverse frame" do
|
188
|
+
# Reversed (real locations on contig):
|
189
|
+
#
|
190
|
+
# | 3 21 B |
|
191
|
+
# ttaaatgtaatttaggtaaatttat atgtaaattaggta (reversed)
|
192
|
+
# ...^--============xxx^=======xxx
|
193
|
+
# ^ ^
|
194
|
+
# Actual feed:
|
195
|
+
#
|
196
|
+
# s2= s1=
|
197
|
+
# 18 0 (ntseq_pos)
|
198
|
+
# "atggattaaatgta" "tatttaaatggatttaatgtaaatt"
|
199
|
+
# ......xxx===== ~===xx^============--^...
|
200
|
+
# 0 1 2 3 0 1 2 3
|
201
|
+
|
202
|
+
s2 = "tatttaaatggatttaatgtaaatt"
|
203
|
+
# ~===xx^============--^...
|
204
|
+
s1 = "atggattaaatgta"
|
205
|
+
# ......xxx=====
|
206
|
+
# now move the other way, as sequences get emitted on the left
|
207
|
+
fr = ShortReversedFrameState.new s2,0,0
|
208
|
+
# p fr
|
209
|
+
fr.codons.to_seq.should == "ATTTAAATGGATTTAATGTAAATT"
|
210
|
+
fr.ntseq_pos.should == 0
|
211
|
+
orfs = fr.get_stopstop_orfs
|
212
|
+
orfs.map{ | orf | orf.to_seq }.should == ["ATGGATTTAATGTAA"]
|
213
|
+
orfs.first.pos.should == 2 # in codons
|
214
|
+
fr3 = FrameCodonHelpers::CreateShortFrame.create_left(fr,orfs,s1)
|
215
|
+
fr3.ntseq_pos.should == 18 # 6 codons
|
216
|
+
fr3.codons.to_seq.should == "ATGGATTAAATGTATATTTAA"
|
217
|
+
norfs = fr3.get_stopstop_orfs
|
218
|
+
orfs += norfs
|
219
|
+
orfs.map{ | orf | orf.to_seq }.should == ["ATGGATTTAATGTAA", "ATGTATATTTAA"]
|
220
|
+
orfs.map{ | orf | orf.pos }.should == [2,3]
|
221
|
+
orfs.map{ | orf | orf.track_ntseq_pos }.should == [6,18+9]
|
222
|
+
end
|
223
|
+
|
224
|
+
end
|
225
|
+
|
226
|
+
describe Bio::Big::OrfEmitter, "when using the ORF emitter" do
|
227
|
+
include Bio::Big
|
228
|
+
|
229
|
+
it "should emit STOP-STOP ORFs in all frames" do
|
230
|
+
f = FastaEmitter.new("test/data/fasta/nt.fa")
|
231
|
+
seqs = []
|
232
|
+
OrfEmitter.new(f,:stopstop)::emit_seq do | frame, index, tag, pos, seq |
|
233
|
+
break if index != 0
|
234
|
+
if frame == 0 and index == 0 and pos == 39
|
235
|
+
seq.should == "TTNCGCGTGCCGCCTTCTTTCTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGTCAGTGTGTGTATTTTGGGGAAAACTTTGTGA"
|
236
|
+
# p [frame,index, tag, pos, seq]
|
237
|
+
end
|
238
|
+
if index == 0
|
239
|
+
seqs.push seq
|
240
|
+
end
|
241
|
+
end
|
242
|
+
seqs.join(';')[50..350].should == "TNCGCGTGCCGCCTTCTTTCTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGTCAGTGTGTGTATTTTGGGGAAAACTTTGTGA;GCAAAGAGCGAGAAAATGAGCGGANCGGTAAGAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATTGGGATGGTATGGCTAAGGTCCTTGTCACAGATGAGGCTCGTAGAGAGTTCTCTAACCTTCGTCGTGCTTTCGATGAGGTTAACACACAGCTCCAGACCAAATTTAGTCAGGACCT"
|
243
|
+
end
|
244
|
+
it "should emit STOP-STOP ORFs in all frames using a shorter emitter" do
|
245
|
+
f = FastaEmitter.new("test/data/fasta/nt.fa",150)
|
246
|
+
seqs = []
|
247
|
+
OrfEmitter.new(f,:stopstop)::emit_seq do | frame, index, tag, pos, seq |
|
248
|
+
break if index != 0
|
249
|
+
if frame == 0 and index == 0 and pos == 39
|
250
|
+
seq.should == "TTNCGCGTGCCGCCTTCTTTCTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGTCAGTGTGTGTATTTTGGGGAAAACTTTGTGA"
|
251
|
+
# p [frame,index, tag, pos, seq]
|
252
|
+
end
|
253
|
+
if index == 0
|
254
|
+
seqs.push seq
|
255
|
+
end
|
256
|
+
end
|
257
|
+
# seqs.join(';')[50..350].should == "TNCGCGTGCCGCCTTCTTTCTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGTCAGTGTGTGTATTTTGGGGAAAACTTTGTGA;GCAAAGAGCGAGAAAATGAGCGGANCGGTAAGAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATTGGGATGGTATGGCTAAGGTCCTTGTCACAGATGAGGCTCGTAGAGAGTTCTCTAACCTTCGTCGTGCTTTCGATGAGGTTAACACACAGCTCCAGACCAAATTTAGTCAGGACCT"
|
258
|
+
end
|
259
|
+
if false
|
260
|
+
it "should emit START-STOP ORFs in all frames"
|
261
|
+
it "should emit ORFs on any filter"
|
262
|
+
it "should emit ORFs using a minimum size"
|
263
|
+
it "should emit ORFs with adjoining sequences"
|
264
|
+
end
|
265
|
+
end
|
@@ -0,0 +1,199 @@
|
|
1
|
+
|
2
|
+
$: << "../lib"
|
3
|
+
ENV['DATA'] = '../test/data/EMBOSS'
|
4
|
+
|
5
|
+
require 'bigbio'
|
6
|
+
|
7
|
+
# Note that PredictORF, at this point, leaves trailing X's for the AA sequence
|
8
|
+
|
9
|
+
describe PredictORF, " when using a short simple nucleotide sequence" do
|
10
|
+
before :all do
|
11
|
+
# initialize
|
12
|
+
id = 'test'
|
13
|
+
descr = 'Test'
|
14
|
+
# sequence = 'AGCTGAATCTGGTAGATACCATCTTTAA'
|
15
|
+
sequence = 'AGCTGAATCTGG'
|
16
|
+
# trn_table = Biolib::Emboss.ajTrnNewI(1)
|
17
|
+
trn_table = Bio::Big::TranslationAdapter.translation_table(1)
|
18
|
+
|
19
|
+
@predictorf = PredictORF.new(id,descr,sequence,trn_table)
|
20
|
+
@orflist = @predictorf.stopstop(0)
|
21
|
+
# @orflist.each do | orf | p [orf.descr,orf] end
|
22
|
+
end
|
23
|
+
|
24
|
+
it "stopstop(0) should render six reading frames and seven ORF" do
|
25
|
+
# >EMBOSS_001_1
|
26
|
+
# S*IW
|
27
|
+
# >EMBOSS_001_2
|
28
|
+
# AESG
|
29
|
+
# >EMBOSS_001_3
|
30
|
+
# LNLX
|
31
|
+
# >EMBOSS_001_4
|
32
|
+
# PDSA
|
33
|
+
# >EMBOSS_001_5
|
34
|
+
# RFSX
|
35
|
+
# >EMBOSS_001_6
|
36
|
+
# QIQL
|
37
|
+
@orflist[0].aa.seq.should == "S"
|
38
|
+
@orflist[3].aa.seq[0..2].should == "LNL"
|
39
|
+
@orflist[4].aa.seq[0..2].should == "PDS"
|
40
|
+
@orflist.size.should == 7
|
41
|
+
end
|
42
|
+
|
43
|
+
# frame +1 - 4 codons S*IW
|
44
|
+
it "should give a first valid +1 frame" do
|
45
|
+
orf = @orflist[1]
|
46
|
+
orf.frame.should == 1
|
47
|
+
orf.nt.start.should == 6
|
48
|
+
orf.aa.seq.should == "IW"
|
49
|
+
orf.nt.seq.should == "ATCTGG"
|
50
|
+
end
|
51
|
+
|
52
|
+
# frame +1 - 4 codons S*IW
|
53
|
+
it "should give a second valid +1 frame" do
|
54
|
+
orf = @orflist[0]
|
55
|
+
orf.frame.should equal 1
|
56
|
+
orf.nt.start.should equal 0
|
57
|
+
orf.aa.seq.should == "S"
|
58
|
+
orf.nt.seq.should == "AGC"
|
59
|
+
end
|
60
|
+
|
61
|
+
# frame +2 - 3 codons AES
|
62
|
+
it "should give a valid +2 frame" do
|
63
|
+
orf = @orflist[2]
|
64
|
+
# pp @orflist
|
65
|
+
# pp orf
|
66
|
+
orf.nt.seq[0..8].should == "GCTGAATCT"
|
67
|
+
orf.frame.should == 2
|
68
|
+
orf.nt.start.should == 1
|
69
|
+
# orf.nt.stop.should == 12 - EMBOS differs
|
70
|
+
orf.aa.seq[0..2].should == "AES"
|
71
|
+
end
|
72
|
+
|
73
|
+
# frame +3 - 3 codons LNL
|
74
|
+
it "should give a valid +3 frame" do
  orf = @orflist[3]
  orf.frame.should == 3
  orf.nt.start.should == 2
  # orf.nt.stop.should == 12
  # Compare exactly the first 3 AA / 9 nt, like the other frame specs do;
  # the previous [0..3] / [0..9] slices were one element too wide for the
  # 3- and 9-character expectations.
  orf.aa.seq[0..2].should == "LNL"
  orf.nt.seq[0..8].should == "CTGAATCTG"
end
|
82
|
+
|
83
|
+
# frame -1 - 4 codons PDSA
|
84
|
+
it "should give a valid -1 frame" do
|
85
|
+
orf = @orflist[4]
|
86
|
+
orf.frame.should == -1
|
87
|
+
orf.nt.start.should == 0
|
88
|
+
orf.nt.stop.should == 12
|
89
|
+
orf.aa.seq.should == "PDSA"
|
90
|
+
orf.nt.seq.should == "CCAGATTCAGCT"
|
91
|
+
end
|
92
|
+
|
93
|
+
# frame -2 - 3 codons RFSX
|
94
|
+
# The description used to read "-3 frame" (copy of the next example) while
# the body checks frame -2.
it "should give a valid -2 frame" do
  orf = @orflist[5]
  orf.frame.should == -2
  orf.nt.start.should == 1
  # orf.nt.stop.should == 12
  orf.aa.seq[0..2].should == "RFS"
  orf.nt.seq[0..8].should == "CAGATTCAG"
end
|
102
|
+
|
103
|
+
# frame -3 - 3 codons QIQL
|
104
|
+
it "should give a valid -3 frame" do
|
105
|
+
orf = @orflist[6]
|
106
|
+
orf.frame.should == -3
|
107
|
+
orf.nt.start.should == 2
|
108
|
+
# orf.nt.stop.should == 12
|
109
|
+
orf.aa.seq[0..2].should == "QIQ"
|
110
|
+
orf.nt.seq[0..8].should == "AGATTCAGC"
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
114
|
+
describe PredictORF, " when using a more complicated nucleotide sequence" do
|
115
|
+
before :all do
|
116
|
+
# initialize
|
117
|
+
id = "PUT-157a-Arabidopsis_thaliana-126"
|
118
|
+
descr = "PlantGDB Arabidopsis_thaliana Jan_15_2007"
|
119
|
+
sequence = "ATCATTAGCAACACCAGCTTCCTCTCTCTCGCTTCAAAGTTCACTACTCGTGGATCTCGT
|
120
|
+
CTTCAGTGTACAGTATCAAGGGCTCGATCTGCGGTGGATGAGACATCAGATTCAGGAGCT
|
121
|
+
TTTCAAAGAACTGCATCGACATCCGTAACTTCGTTTCAAAAGATTCCAATTCTCAGTTTC
|
122
|
+
AGCTGAATCTGGTAGATACCATCTTTACATATCGTATGCTTGTCATGGGCTTCTAGATGC
|
123
|
+
CTTTCATACTTAAAGATCAAAGGACTTGACGATGCAATAAGCTTCTCGTCTGTAAAACCC"
|
124
|
+
# @trn_table = Biolib::Emboss.ajTrnNewI(1)
|
125
|
+
@trn_table = Bio::Big::TranslationAdapter.translation_table(1)
|
126
|
+
@predictorf = PredictORF.new(id,descr,sequence,@trn_table)
|
127
|
+
@orflist = @predictorf.stopstop(0)
|
128
|
+
# @orflist.each_with_index do | orf,i | p [i,orf.descr,orf.aa.seq,orf.nt.seq] end
|
129
|
+
# >EMBOSS_001_1
|
130
|
+
# IISNTSFLSLASKFTTRGSRLQCTVSRARSAVDETSDSGAFQRTASTSVTSFQKIPILSF
|
131
|
+
# S*IW*IPSLHIVCLSWASRCLSYLKIKGLDDAISFSSVKP
|
132
|
+
# >EMBOSS_001_2
|
133
|
+
# SLATPASSLSLQSSLLVDLVFSVQYQGLDLRWMRHQIQELFKELHRHP*LRFKRFQFSVS
|
134
|
+
# AESGRYHLYISYACHGLLDAFHT*RSKDLTMQ*ASRL*N
|
135
|
+
# >EMBOSS_001_3
|
136
|
+
# H*QHQLPLSRFKVHYSWISSSVYSIKGSICGG*DIRFRSFSKNCIDIRNFVSKDSNSQFQ
|
137
|
+
# LNLVDTIFTYRMLVMGF*MPFILKDQRT*RCNKLLVCKT
|
138
|
+
# >EMBOSS_001_4
|
139
|
+
# GFYRREAYCIVKSFDL*V*KASRSP*QAYDM*RWYLPDSAETENWNLLKRSYGCRCSSLK
|
140
|
+
# SS*I*CLIHRRSSP*YCTLKTRSTSSEL*SEREEAGVAND
|
141
|
+
# >EMBOSS_001_5
|
142
|
+
# VLQTRSLLHRQVL*SLSMKGI*KPMTSIRYVKMVSTRFS*N*ELESFETKLRMSMQFFEK
|
143
|
+
# LLNLMSHPPQIEPLILYTEDEIHE**TLKRERGSWCC**X
|
144
|
+
# >EMBOSS_001_6
|
145
|
+
# GFTDEKLIASSSPLIFKYERHLEAHDKHTICKDGIYQIQLKLRIGIF*NEVTDVDAVL*K
|
146
|
+
# APESDVSSTADRALDTVH*RRDPRVVNFEARERKLVLLMX
|
147
|
+
end
|
148
|
+
it "stopstop(0) should render ORFs" do
|
149
|
+
@orflist[0].aa.seq[0..3].should == "IISN"
|
150
|
+
@orflist[13].aa.seq[0..3].should == "GFYR"
|
151
|
+
@orflist[22].aa.seq[0..3].should == "VLQT"
|
152
|
+
end
|
153
|
+
# Description aligned with the asserted count (it used to mention
# "33 reading frames and seven ORF").
it "stopstop(0) should render 32 ORFs" do
  @orflist.size.should == 32
end
|
156
|
+
# Description aligned with the call below, which uses a minimum size of 5 AA.
it "startstop(5) should render ORFs starting with a start codon" do
  orflist = @predictorf.startstop(5)
  # orflist.each do | orf | p [orf.descr,orf] end
  orflist[0].aa.seq.should == "MPFILKDQRT"
  orflist.size.should == 1
end
|
162
|
+
it "should never return an empty sequence" do
  orflist = @predictorf.stopstop(0)
  orflist.each do | orf |
    # `>= 0` holds for every string, so it could never catch an empty ORF
    orf.nt.seq.size.should > 0
  end
end
|
168
|
+
|
169
|
+
# stopstop takes its minimum size in amino acids: 44 AA == 132 nt.
# The description used to announce 3 sequences while asserting 4.
it "should return 4 sequences when the minsize is 44 AA (132 nt)" do
  orflist = @predictorf.stopstop(44)
  orflist.size.should == 4
end
|
173
|
+
|
174
|
+
# stopstop takes its minimum size in amino acids: 45 AA == 135 nt.
# The description used to announce 2 sequences at "133" while asserting 3.
it "should return 3 sequences when the minsize is 45 AA (135 nt)" do
  orflist = @predictorf.stopstop(45)
  orflist.size.should == 3
end
|
178
|
+
|
179
|
+
it "should have -1 frame" do
|
180
|
+
sequence = "ATTAGCAACACCAGCTTCCTCTCTCTCGCTTCAAAGTTCACTACTCGTGGATCTCGT"
|
181
|
+
# >EMBOSS_001_4
|
182
|
+
# TRSTSSEL*SEREEAGVAN
|
183
|
+
predictorf = PredictORF.new('test','TEST',sequence,@trn_table)
|
184
|
+
orflist = predictorf.stopstop(0)
|
185
|
+
# orflist.each_with_index do | orf,i | p [i,orf.descr,orf.aa.seq,orf.nt.seq] end
|
186
|
+
orflist[2].aa.seq[0..18].should == "QHQLPLSRFKVHYSWIS"
|
187
|
+
end
|
188
|
+
|
189
|
+
it "should correctly handle a sequence starting with a STOP codon" do
|
190
|
+
sequence = "ATTAGCAACACCAGCTTCCTCTCTCTCGCTTCAAAGTTCACTACTCGTGGATCTCGT"
|
191
|
+
# >EMBOSS_001_3
|
192
|
+
# *QHQLPLSRFKVHYSWIS
|
193
|
+
predictorf = PredictORF.new('test','TEST',sequence,@trn_table)
|
194
|
+
orflist = predictorf.stopstop(0)
|
195
|
+
# orflist.each_with_index do | orf,i | p [i,orf.descr,orf.aa.seq,orf.nt.seq] end
|
196
|
+
orflist[2].aa.seq[0..18].should == "QHQLPLSRFKVHYSWIS"
|
197
|
+
end
|
198
|
+
|
199
|
+
end
|