bio-bigbio 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +15 -0
- data/Gemfile.lock +34 -0
- data/LICENSE +34 -0
- data/README.rdoc +28 -0
- data/Rakefile +50 -0
- data/VERSION +1 -0
- data/bin/getorf +118 -0
- data/bin/nt2aa.rb +56 -0
- data/bio-bigbio.gemspec +102 -0
- data/doc/bigbio_getorf.wtex +14 -0
- data/lib/bigbio/adapters/translate.rb +64 -0
- data/lib/bigbio/db/blast/blastclust.rb +16 -0
- data/lib/bigbio/db/blast.rb +2 -0
- data/lib/bigbio/db/emitters/fasta_emitter.rb +48 -0
- data/lib/bigbio/db/emitters/orf_emitter.rb +289 -0
- data/lib/bigbio/db/fasta/fastaindex.rb +3 -0
- data/lib/bigbio/db/fasta/fastapairedreader.rb +19 -0
- data/lib/bigbio/db/fasta/fastapairedwriter.rb +21 -0
- data/lib/bigbio/db/fasta/fastareader.rb +132 -0
- data/lib/bigbio/db/fasta/fastarecord.rb +39 -0
- data/lib/bigbio/db/fasta/fastawriter.rb +20 -0
- data/lib/bigbio/db/fasta/indexer.rb +33 -0
- data/lib/bigbio/db/fasta.rb +13 -0
- data/lib/bigbio/environment.rb +12 -0
- data/lib/bigbio/sequence/predictorf.rb +140 -0
- data/lib/bigbio/sequence/translate.rb +52 -0
- data/lib/bigbio.rb +38 -0
- data/spec/emitter_spec.rb +265 -0
- data/spec/predictorf_spec.rb +199 -0
- data/test/data/EMBOSS/EGC.1 +32 -0
- data/test/data/fasta/nt.fa +1000 -0
- data/test/doctest/test_fasta.rb +112 -0
- data/test/doctest/test_frames.rb +76 -0
- data/test/doctest/test_getorf.rb +154 -0
- data/test/doctest/test_paired.rb +55 -0
- data/test/performance/translate_with_biolib.rb +67 -0
- data/test/performance/translate_with_bioruby.rb +64 -0
- metadata +163 -0
@@ -0,0 +1,140 @@
|
|
1
|
+
# ORF predictor class
|
2
|
+
#
|
3
|
+
|
4
|
+
require 'bigbio/sequence/translate'
|
5
|
+
|
6
|
+
# Base container that simply holds a sequence and exposes it through +seq+.
class ORFsequence
  attr_accessor :seq

  # sequence :: the sequence to store (kept as passed in)
  def initialize sequence
    @seq = sequence
  end
end

# Helper class for storing ORF information: keeps the full nucleotide
# sequence together with the half-open window [start, stop) of the ORF.
class ORFnucleotides < ORFsequence
  attr_reader :start, :stop

  # sequence :: full nucleotide sequence
  # start    :: index of the first nucleotide of the ORF
  # stop     :: index one past the last nucleotide of the ORF
  def initialize sequence, start, stop
    super(sequence)
    @start = start
    @stop = stop
  end

  # Return only the ORF window of the stored sequence.
  #
  # An exclusive range is used on purpose: with the inclusive form
  # start..stop-1 a stop of 0 turns into the index -1, which selects
  # everything up to the end of the sequence instead of nothing.
  def seq
    @seq[@start...@stop]
  end

  # Return the complete, unsliced sequence.
  def fullseq
    @seq
  end

  # String form is the ORF window (same as +seq+).
  def to_s
    seq
  end
end

# Helper class for storing ORF information (amino acid sequence)
class ORFaminoacids < ORFsequence
end
|
39
|
+
|
40
|
+
# A single predicted ORF: the nucleotide window, the amino acid sequence,
# the reading frame and a generated id/description.
class ORF
  attr_reader :id, :descr, :nt, :aa, :frame

  # num   :: zero-based ORF counter; the id gets the suffix num + 1
  # type  :: short tag that is copied into the description
  # id    :: identifier of the source sequence
  # descr :: description of the source sequence (nil is rendered as empty)
  # nt    :: full nucleotide sequence
  # frame :: reading frame; a negative frame stores the reverse complement
  # start :: nucleotide offset of the ORF before the frame adjustment
  # aa    :: amino acid sequence of the ORF
  def initialize num, type, id, descr, nt, frame, start, aa
    @id = "#{id}_#{num + 1}"
    # ---- adjust start to match frame
    start += frame.abs - 1
    # ---- stop should not go beyond sequence
    stop = [start + aa.size * 3, nt.size].min
    # ---- if frame < 0 it should reverse complement
    nt = Bio::Sequence::NA.new(nt).reverse_complement.to_s.upcase if frame < 0
    fr = frame > 0 ? "+#{frame}" : frame.to_s
    # Interpolation (rather than String#+) keeps a nil description from
    # raising a TypeError.
    @descr = "[#{type} #{fr} #{start} - #{stop}; #{stop - start}/#{nt.size}] #{descr}"
    @nt = ORFnucleotides.new(nt, start, stop)
    @frame = frame
    @aa = ORFaminoacids.new(aa)
  end

  # Order by frame first; ORFs in the same frame are ordered by their
  # nucleotide sequence.
  def <=> other
    if frame == other.frame
      nt.seq <=> other.nt.seq
    else
      frame <=> other.frame
    end
  end

  # Build a FastaPairedRecord from the nucleotide and amino acid sequences;
  # both records share this ORF's id and description.
  def to_fastarec
    aa_rec = FastaRecord.new(@id, @descr, @aa.seq)
    nt_rec = FastaRecord.new(@id, @descr, @nt.seq)
    FastaPairedRecord.new(nt_rec, aa_rec)
  end
end
|
79
|
+
|
80
|
+
# Predicts ORFs in a nucleotide sequence by translating all six reading
# frames and splitting the translations on STOP codons.
class PredictORF

  include Bio::Big::FrameCodonHelpers

  # id        :: identifier of the sequence
  # descr     :: description of the sequence
  # seq       :: nucleotide sequence; all white space is removed
  # trn_table :: translation table (nil and 0 select START_CODONS)
  def initialize id, descr, seq, trn_table
    @id = id
    @descr = descr
    @seq = seq.gsub(/\s/, '')
    @trn_table = trn_table
    @startcodons = # FIXME: this should be linked properly
      if trn_table.nil? || trn_table == 0
        START_CODONS
      else # prokaryote
        ['ATG', 'TTG', 'CTG', 'AUG', 'UUG', 'CUG']
      end
  end

  # Return a list of predicted ORFs with :minsize AA's. The ORF's
  # are between STOP codons (so sequences without a proper START codon
  # are included)
  def stopstop minsize=30
    type = "XX"
    orfs = []
    translate = Nucleotide::Translate.new(@trn_table)
    translate.aa_6_frames(@seq).each do |aa_frame|
      frame = aa_frame[:frame]
      aa_start = 0 # offset, in amino acids, of the current candidate
      aa_frame[:sequence].split(/\*/).each do |candidate|
        if candidate.size >= minsize && !candidate.empty?
          # orfs.size is the running ORF number across all frames
          orfs.push ORF.new(orfs.size, type, @id, @descr, @seq, frame, aa_start * 3, candidate)
        end
        # skip the candidate plus the STOP that terminated it
        aa_start += candidate.size + 1
      end
    end
    orfs
  end

  # Return a list of predicted ORFs with :minsize AA's. The ORF's
  # are between START and STOP codons (ATG, TTG, CTG and AUG, UUG and CUG for
  # now, a later version should use the EMBOSS translation table).
  def startstop minsize=30
    stopstop(minsize).select do |orf|
      @startcodons.include?(orf.nt.seq[0..2].upcase)
    end
  end

  # Return the longest ORF that has a START codon (see +startstop+),
  # measured in amino acids. On a tie the earliest ORF wins.
  # Returns nil if none is found
  def longest_startstop minsize=0
    startstop(minsize).max_by { |orf| orf.aa.seq.size }
  end

end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
|
2
|
+
module Nucleotide

  module TranslationTable
  end

  # Translates nucleotide sequences into amino acid sequences for a set of
  # reading frames, delegating to Bio::Big::TranslationAdapter.
  class Translate

    include Bio::Big::TranslationAdapter

    # Table can be either an id (integer) or a Biolib::Emboss TrnTable.
    # nil is treated as table id 0.
    def initialize table
      table = 0 if table.nil?
      @trn_table =
        if table.kind_of? Numeric
          Bio::Big::TranslationAdapter.translation_table(table)
        else
          table
        end
    end

    # Return all six reading frames as an Array - ordered as
    # frames [1,2,3,-1,-2,-3] - of hashes
    # {:frame => frame, :sequence => AAsequence}.
    #
    # Note that the nucleotide sequence does not get modified.
    def aa_6_frames seq
      translate_in_frames(seq, [1, 2, 3, -1, -2, -3])
    end

    # Return all forward reading frames as an Array - ordered as
    # frames [1,2,3] - of hashes {:frame => frame, :sequence => AAsequence}
    def aa_forward_frames seq
      translate_in_frames(seq, [1, 2, 3])
    end

    private

    # Strip white space from +seq+ (on a copy) and translate it once per
    # entry of +frames+, preserving the order of +frames+.
    def translate_in_frames seq, frames
      stripped = seq.gsub(/\s/, '')
      frames.map do |frame|
        aa = Bio::Big::TranslationAdapter.translate(@trn_table, frame, stripped)
        { :frame => frame, :sequence => aa }
      end
    end
  end
end
|
52
|
+
|
data/lib/bigbio.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# BigBio libraries

require 'bigbio/environment'

# Use a bioruby-logger checkout located next to this source tree when the
# directory exists; otherwise activate the bio-logger gem.
root_dir = File.dirname(File.dirname(__FILE__))
local_logger_lib = File.join(root_dir, '..', 'bioruby-logger', 'lib')
if File.directory?(local_logger_lib)
  $LOAD_PATH << local_logger_lib
  $stderr.print "bio-logger loaded directly\n"
else
  require "rubygems"
  gem "bio-logger"
end
require 'bio-logger'

# One logger named 'bigbio', registered on the shared environment object.
logger = Bio::Log::LoggerPlus.new('bigbio')
Bio::Big::Environment.instance.log = logger

# biolib/emboss is optional: flag it on the environment when it loads,
# otherwise send a warning to stderr and carry on.
begin
  require 'biolib/emboss'
  Bio::Big::Environment.instance.biolib = true
rescue LoadError
  logger.outputters = Bio::Log::Outputter.stderr
  logger.warn "BioLib functionality not loaded"
end

require 'bio'
require 'bigbio/adapters/translate'
require 'bigbio/db/emitters/fasta_emitter'
require 'bigbio/db/emitters/orf_emitter'

# The remaining classes are loaded on first use.
autoload :FastaReader,       'bigbio/db/fasta'
autoload :FastaWriter,       'bigbio/db/fasta'
autoload :FastaPairedReader, 'bigbio/db/fasta'
autoload :FastaPairedWriter, 'bigbio/db/fasta'
autoload :BlastClust,        'bigbio/db/blast'
autoload :PredictORF,        'bigbio/sequence/predictorf'
|
@@ -0,0 +1,265 @@
|
|
1
|
+
|
2
|
+
require 'rspec'
|
3
|
+
|
4
|
+
$: << "../lib"
|
5
|
+
|
6
|
+
require 'bigbio'
|
7
|
+
|
8
|
+
describe Bio::Big::FastaEmitter, "when using the emitter" do
  include Bio::Big

  # Emits in chunks of 10 and glues the chunks of each record back together;
  # record 95 must come out as the full sequence.
  it "should emit small parts" do
    collected = ""
    FastaEmitter.new("test/data/fasta/nt.fa",10).emit_seq do | part, index, _tag, seq |
      # p [index, part, tag, seq]
      collected += seq
      at_tail = part == :tail
      if at_tail && index == 95
        collected.should == "ATCATTAGCAACACCAGCTTCCTCTCTCTCGCTTCAAAGTTCACTACTCGTGGATCTCGTCTTCAGTGTACAGTATCAAGGGCTCGATCTGCGGTGGATGAGACATCAGATTCAGGAGCTTTTCAAAGAACTGCATCGACATCCGTAACTTCGTTTCAAAAGATTCCAATTCTCAGTTTCAGCTGAATCTGGTAGATACCATCTTTACATATCGTATGCTTGTCATGGGCTTCTAGATGCCTTTCATACTTAAAGATCAAAGGACTTGACGATGCAATAAGCTTCTCGTCTGTAAAACCC"
      end
      # a :tail part closes the record, so start collecting afresh
      collected = "" if at_tail
    end
  end

  # With the default chunk size record 95 is compared as emitted.
  it "should emit large parts" do
    FastaEmitter.new("test/data/fasta/nt.fa").emit_seq do | _part, index, _tag, seq |
      # p [index, part, tag, seq]
      seq.should == "ATCATTAGCAACACCAGCTTCCTCTCTCTCGCTTCAAAGTTCACTACTCGTGGATCTCGTCTTCAGTGTACAGTATCAAGGGCTCGATCTGCGGTGGATGAGACATCAGATTCAGGAGCTTTTCAAAGAACTGCATCGACATCCGTAACTTCGTTTCAAAAGATTCCAATTCTCAGTTTCAGCTGAATCTGGTAGATACCATCTTTACATATCGTATGCTTGTCATGGGCTTCTAGATGCCTTTCATACTTAAAGATCAAAGGACTTGACGATGCAATAAGCTTCTCGTCTGTAAAACCC" if index == 95
    end
  end
end
|
32
|
+
|
33
|
+
describe Bio::Big::ShortFrameState, "when using the ShortFrameState" do

  include Bio::Big

  it "should find an ORF" do
    state = ShortFrameState.new "atggattaaatgtaatggatttaatgtaaa",0,0
    found = state.get_stopstop_orfs
    found.map(&:pos).should == [ 3, 5 ]
    found.map(&:to_seq).should == ["ATGTAA", "TGGATTTAA"]
    found = state.get_startstop_orfs
    found.map(&:pos).should == [ 0, 3 ]
    found.map(&:to_seq).should == ["ATGGATTAA","ATGTAA"]
  end

  it "should handle min_size" do
    state = ShortFrameState.new "atggattaaatgtaatggatttaatgtaaa",0,9
    found = state.get_stopstop_orfs
    found.map(&:to_seq).should == [ "TGGATTTAA"]
    found.map(&:pos).should == [ 5 ]
    state.get_startstop_orfs.should == []
  end

  it "should find ORFs in" do
    state = ShortFrameState.new "atgttttaaatgtaatgttgttaaatgttttaaatgtaatgttgttaa",0,0
    found = state.get_stopstop_orfs
    found.map(&:to_seq).should == ["ATGTAA", "TGTTGTTAA", "ATGTTTTAA", "ATGTAA", "TGTTGTTAA"]
    found.map(&:pos).should == [3, 5, 8, 11, 13]
    found = state.get_startstop_orfs
    found.map(&:to_seq).should == ["ATGTTTTAA", "ATGTAA", "ATGTTTTAA", "ATGTAA"]
    found.map(&:pos).should == [ 0, 3, 8, 11]
  end

  it "should match results of EMBOSS getorf" do
    s = "AG GTTCGNACGGTCATCGNATNAAGTCTTGNATATCG TAA TTNCGCGTGCCGCCTTCTTTCTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGTCAGTGTGTGTATTTTG GGG AAAACTTTG TGA GCAAAGAGCGAGAAAATGAGCGGANCGG TAA GAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATTGGGATGGTATGGC TAA GGTCCTTGTCACAGATGAGGCTCGTAGAG".gsub(/ /,'')
    # >_3 [3 - 167] #0
    # 1st GTTCGNACGGTCATCGNATNAAGTCTTGNATATCGTAATTNCGCGTGCCGCCTTCTTTCT
    # CCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGTCA
    # GTGTGTGTATTTTGGGGAAAACTTTGTGAGCAAAGAGCGAGAAAA
    # >_4 [171 - 179] #0
    # OK GCGGANCGG
    # >_5 [183 - 239] #0
    # OK GAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATTGGGATGGTATGGC
    # >_6 [243 - 257] #0
    # OK- GGTCCTTGTCACAGA
    # >_7 [261 - 266] #0
    # OK- GGCTCGTAG
    # >_8 [1 - 270] # 1
    # whole! AGGTTCGNACGGTCATCGNATNAAGTCTTGNATATCGTAATTNCGCGTGCCGCCTTCTTT
    # CTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGT
    # CAGTGTGTGTATTTTGGGGAAAACTTTGTGAGCAAAGAGCGAGAAAATGAGCGGANCGGT
    # AAGAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATTGGGATGGTATGGCT
    # AAGGTCCTTGTCACAGATGAGGCTCGTAGA
    # >_1 [2 - 37] #2
    # 1st- GGTTCGNACGGTCATCGNATNAAGTCTTGNATATCG
    # >_2 [41 - 148] #2
    # OK- TTNCGCGTGCCGCCTTCTTTCTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCT
    # TCTTCCTCTTCGATATTCGTCAGTGTGTGTATTTTGGGGAAAACTTTG
    # >_9 [152 - 271] #2
    # last- CAAAGAGCGAGAAAATGAGCGGANCGGTAAGAAAATCGCGGATGTGGCTTTCAAAGCTT
    # CAAGGACTATCGATTGGGATGGTATGGCTAAGGTCCTTGTCACAGATGAGGCTCGTAGAG

    # ---- STOP-STOP ORFs; each frame is fed as the sequence shifted by 0, 1, 2
    # Frame 0
    minsize = 0
    state = ShortFrameState.new s,0,minsize
    all_orfs = state.get_stopstop_orfs
    all_orfs.map(&:to_seq).should == []

    # Frame 1
    state = ShortFrameState.new s[1..-1],0,minsize
    frame_orfs = state.get_stopstop_orfs
    frame_orfs.map(&:to_seq).should == ["TTNCGCGTGCCGCCTTCTTTCTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGTCAGTGTGTGTATTTTGGGGAAAACTTTGTGA"]
    all_orfs += frame_orfs
    # Frame 2
    state = ShortFrameState.new s[2..-1],0,minsize
    frame_orfs = state.get_stopstop_orfs
    frame_orfs.map(&:to_seq).should == ["GCGGANCGGTAA", "GAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATTGGGATGGTATGGCTAA", "GGTCCTTGTCACAGATGA", "GGCTCGTAG"]
    all_orfs += frame_orfs
    all_orfs.size.should == 5

    # >_1 [235 - 270]
    # Last: ATGGCTAAGGTCCTTGTCACAGATGAGGCTCGTAGA
    # >_2 [167 - 271]
    # Last: ATGAGCGGANCGGTAAGAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATT
    # GGGATGGTATGGCTAAGGTCCTTGTCACAGATGAGGCTCGTAGAG

    # ---- START-STOP ORFs over the same three frames
    # Frame 0
    minsize = 0
    state = ShortFrameState.new s,0,minsize
    all_orfs = state.get_startstop_orfs

    # Frame 1
    state = ShortFrameState.new s[1..-1],0,minsize
    all_orfs += state.get_startstop_orfs
    # Frame 2
    state = ShortFrameState.new s[2..-1],0,minsize
    all_orfs += state.get_startstop_orfs
    all_orfs.map(&:to_seq).should == []
    all_orfs.size.should == 0
  end
end
|
131
|
+
|
132
|
+
describe Bio::Big::ShortFrameState, "when combining frames" do
  include Bio::Big

  # s1 holds one STOP-STOP ORF; the remainder is carried over and
  # joined with s2.
  it "should combine a forward frame" do
    s1 = "atggattaaatgtaata"
    s2 = "atggatttaatgtaaa"
    fr = ShortFrameState.new s1,0,0
    fr.ntseq_pos.should == 0
    orfs = fr.get_stopstop_orfs
    # a bare `orfs.size == 1` is evaluated and thrown away; it needs
    # `.should` to be an expectation
    orfs.size.should == 1 # in codons
    fr3 = FrameCodonHelpers::CreateShortFrame.create_right(fr,orfs,s2)
    fr3.ntseq_pos.should == 15
    fr3.codons.to_seq.should == "TAATGGATTTAATGTAAA"
    norfs = fr3.get_stopstop_orfs
    orfs += norfs
    orfs.map{ | orf | orf.to_seq }.should == ["ATGTAA", "TGGATTTAA"]
    orfs.map{ | orf | orf.track_ntseq_pos }.should == [9,18]
  end

  it "should combine a forward frame without ORFs in first seq" do
    s1 = "atggattaaatgta"
    # ......---===xx
    s2 = "atggatttaattattataaa"
    # x======xxx======xxx.
    fr = ShortFrameState.new s1,0,0
    fr.ntseq_pos.should == 0
    orfs = fr.get_stopstop_orfs
    orfs.size.should == 0 # in codons
    fr3 = FrameCodonHelpers::CreateShortFrame.create_right(fr,orfs,s2)
    fr3.ntseq_pos.should == 0
    fr3.codons.to_seq.should == "ATGGATTAAATGTAATGGATTTAATTATTATAA"
    norfs = fr3.get_stopstop_orfs
    orfs = norfs
    orfs.map{ | orf | orf.to_seq }.should == ["ATGTAA","TGGATTTAA","TTATTATAA"]
    orfs.map{ | orf | orf.track_ntseq_pos }.should == [9,9+6,9+6+9]
  end

  it "should combine a forward frame without ORFs in first seq" do
    s1 = "atggattaaatgta"
    # ......---===xx
    s2 = "atggatttaatgtaaa"
    # x======xxx
    fr = ShortFrameState.new s1,0,0
    fr.ntseq_pos.should == 0
    orfs = fr.get_stopstop_orfs
    orfs.size.should == 0 # in codons
    fr3 = FrameCodonHelpers::CreateShortFrame.create_right(fr,orfs,s2)
    # p fr3
    fr3.ntseq_pos.should == 0 # on the combined sequences
    fr3.codons.to_seq.should == "ATGGATTAAATGTAATGGATTTAATGTAAA"
    norfs = fr3.get_stopstop_orfs
    orfs += norfs
    orfs.map{ | orf | orf.to_seq }.should == ["ATGTAA", "TGGATTTAA"]
    orfs.map{ | orf | orf.track_ntseq_pos }.should == [9,9+6]
  end

  it "should combine a reverse frame" do
    # Reversed (real locations on contig):
    #
    # | 3 21 B |
    # ttaaatgtaatttaggtaaatttat atgtaaattaggta (reversed)
    # ...^--============xxx^=======xxx
    # ^ ^
    # Actual feed:
    #
    # s2= s1=
    # 18 0 (ntseq_pos)
    # "atggattaaatgta" "tatttaaatggatttaatgtaaatt"
    # ......xxx===== ~===xx^============--^...
    # 0 1 2 3 0 1 2 3

    s2 = "tatttaaatggatttaatgtaaatt"
    # ~===xx^============--^...
    s1 = "atggattaaatgta"
    # ......xxx=====
    # now move the other way, as sequences get emitted on the left
    fr = ShortReversedFrameState.new s2,0,0
    # p fr
    fr.codons.to_seq.should == "ATTTAAATGGATTTAATGTAAATT"
    fr.ntseq_pos.should == 0
    orfs = fr.get_stopstop_orfs
    orfs.map{ | orf | orf.to_seq }.should == ["ATGGATTTAATGTAA"]
    orfs.first.pos.should == 2 # in codons
    fr3 = FrameCodonHelpers::CreateShortFrame.create_left(fr,orfs,s1)
    fr3.ntseq_pos.should == 18 # 6 codons
    fr3.codons.to_seq.should == "ATGGATTAAATGTATATTTAA"
    norfs = fr3.get_stopstop_orfs
    orfs += norfs
    orfs.map{ | orf | orf.to_seq }.should == ["ATGGATTTAATGTAA", "ATGTATATTTAA"]
    orfs.map{ | orf | orf.pos }.should == [2,3]
    orfs.map{ | orf | orf.track_ntseq_pos }.should == [6,18+9]
  end

end
|
225
|
+
|
226
|
+
describe Bio::Big::OrfEmitter, "when using the ORF emitter" do
  include Bio::Big

  # Only the first FASTA record (index 0) is inspected; the emitter loop is
  # left as soon as another record shows up.
  it "should emit STOP-STOP ORFs in all frames" do
    fasta = FastaEmitter.new("test/data/fasta/nt.fa")
    collected = []
    OrfEmitter.new(fasta,:stopstop).emit_seq do | frame, index, _tag, pos, seq |
      break unless index == 0
      if frame == 0 && pos == 39
        seq.should == "TTNCGCGTGCCGCCTTCTTTCTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGTCAGTGTGTGTATTTTGGGGAAAACTTTGTGA"
        # p [frame,index, tag, pos, seq]
      end
      collected.push seq
    end
    collected.join(';')[50..350].should == "TNCGCGTGCCGCCTTCTTTCTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGTCAGTGTGTGTATTTTGGGGAAAACTTTGTGA;GCAAAGAGCGAGAAAATGAGCGGANCGGTAAGAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATTGGGATGGTATGGCTAAGGTCCTTGTCACAGATGAGGCTCGTAGAGAGTTCTCTAACCTTCGTCGTGCTTTCGATGAGGTTAACACACAGCTCCAGACCAAATTTAGTCAGGACCT"
  end

  # Same walk with a FastaEmitter constructed with 150 as second argument.
  it "should emit STOP-STOP ORFs in all frames using a shorter emitter" do
    fasta = FastaEmitter.new("test/data/fasta/nt.fa",150)
    collected = []
    OrfEmitter.new(fasta,:stopstop).emit_seq do | frame, index, _tag, pos, seq |
      break unless index == 0
      if frame == 0 && pos == 39
        seq.should == "TTNCGCGTGCCGCCTTCTTTCTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGTCAGTGTGTGTATTTTGGGGAAAACTTTGTGA"
        # p [frame,index, tag, pos, seq]
      end
      collected.push seq
    end
    # seqs.join(';')[50..350].should == "TNCGCGTGCCGCCTTCTTTCTCCTTTTTCTCTTTTACTTCTTCATCATCATCTTCTTCTTCTTCCTCTTCGATATTCGTCAGTGTGTGTATTTTGGGGAAAACTTTGTGA;GCAAAGAGCGAGAAAATGAGCGGANCGGTAAGAAAATCGCGGATGTGGCTTTCAAAGCTTCAAGGACTATCGATTGGGATGGTATGGCTAAGGTCCTTGTCACAGATGAGGCTCGTAGAGAGTTCTCTAACCTTCGTCGTGCTTTCGATGAGGTTAACACACAGCTCCAGACCAAATTTAGTCAGGACCT"
  end

  # Placeholders kept out of the run on purpose.
  if false
    it "should emit START-STOP ORFs in all frames"
    it "should emit ORFs on any filter"
    it "should emit ORFs using a minimum size"
    it "should emit ORFs with adjoining sequences"
  end
end
|
@@ -0,0 +1,199 @@
|
|
1
|
+
|
2
|
+
$: << "../lib"
|
3
|
+
ENV['DATA'] = '../test/data/EMBOSS'
|
4
|
+
|
5
|
+
require 'bigbio'
|
6
|
+
|
7
|
+
# Note that PredictORF, at this point, leaves trailing X's for the AA sequence
|
8
|
+
|
9
|
+
describe PredictORF, " when using a short simple nucleotide sequence" do
  # The ORF list is computed once and shared by all examples.
  before :all do
    # initialize
    id = 'test'
    descr = 'Test'
    # sequence = 'AGCTGAATCTGGTAGATACCATCTTTAA'
    sequence = 'AGCTGAATCTGG'
    # trn_table = Biolib::Emboss.ajTrnNewI(1)
    trn_table = Bio::Big::TranslationAdapter.translation_table(1)

    @predictorf = PredictORF.new(id,descr,sequence,trn_table)
    @orflist = @predictorf.stopstop(0)
    # @orflist.each do | orf | p [orf.descr,orf] end
  end

  it "stopstop(0) should render six reading frames and seven ORF" do
    # >EMBOSS_001_1
    # S*IW
    # >EMBOSS_001_2
    # AESG
    # >EMBOSS_001_3
    # LNLX
    # >EMBOSS_001_4
    # PDSA
    # >EMBOSS_001_5
    # RFSX
    # >EMBOSS_001_6
    # QIQL
    @orflist[0].aa.seq.should == "S"
    @orflist[3].aa.seq[0..2].should == "LNL"
    @orflist[4].aa.seq[0..2].should == "PDS"
    @orflist.size.should == 7
  end

  # frame +1 - 4 codons S*IW
  it "should give a first valid +1 frame" do
    orf = @orflist[1]
    orf.frame.should == 1
    orf.nt.start.should == 6
    orf.aa.seq.should == "IW"
    orf.nt.seq.should == "ATCTGG"
  end

  # frame +1 - 4 codons S*IW
  it "should give a second valid +1 frame" do
    orf = @orflist[0]
    orf.frame.should equal 1
    orf.nt.start.should equal 0
    orf.aa.seq.should == "S"
    orf.nt.seq.should == "AGC"
  end

  # frame +2 - 3 codons AES
  it "should give a valid +2 frame" do
    orf = @orflist[2]
    # pp @orflist
    # pp orf
    orf.nt.seq[0..8].should == "GCTGAATCT"
    orf.frame.should == 2
    orf.nt.start.should == 1
    # orf.nt.stop.should == 12 - EMBOS differs
    orf.aa.seq[0..2].should == "AES"
  end

  # frame +3 - 3 codons LNL
  it "should give a valid +3 frame" do
    orf = @orflist[3]
    orf.frame.should == 3
    orf.nt.start.should == 2
    # orf.nt.stop.should == 12
    # slice to the expected length (3 AA / 9 nt), as the other frame
    # examples do, so a trailing residue cannot break the comparison
    orf.aa.seq[0..2].should == "LNL"
    orf.nt.seq[0..8].should == "CTGAATCTG"
  end

  # frame -1 - 4 codons PDSA
  it "should give a valid -1 frame" do
    orf = @orflist[4]
    orf.frame.should == -1
    orf.nt.start.should == 0
    orf.nt.stop.should == 12
    orf.aa.seq.should == "PDSA"
    orf.nt.seq.should == "CCAGATTCAGCT"
  end

  # frame -2 - 3 codons RFSX
  it "should give a valid -2 frame" do
    orf = @orflist[5]
    orf.frame.should == -2
    orf.nt.start.should == 1
    # orf.nt.stop.should == 12
    orf.aa.seq[0..2].should == "RFS"
    orf.nt.seq[0..8].should == "CAGATTCAG"
  end

  # frame -3 - 3 codons QIQL
  it "should give a valid -3 frame" do
    orf = @orflist[6]
    orf.frame.should == -3
    orf.nt.start.should == 2
    # orf.nt.stop.should == 12
    orf.aa.seq[0..2].should == "QIQ"
    orf.nt.seq[0..8].should == "AGATTCAGC"
  end
end
|
113
|
+
|
114
|
+
describe PredictORF, " when using a more complicated nucleotide sequence" do
  # The predictor and its stopstop(0) list are built once for all examples.
  before :all do
    # initialize
    id = "PUT-157a-Arabidopsis_thaliana-126"
    descr = "PlantGDB Arabidopsis_thaliana Jan_15_2007"
    # line breaks and indentation inside the literal are harmless:
    # PredictORF.new strips all white space from the sequence
    sequence = "ATCATTAGCAACACCAGCTTCCTCTCTCTCGCTTCAAAGTTCACTACTCGTGGATCTCGT
      CTTCAGTGTACAGTATCAAGGGCTCGATCTGCGGTGGATGAGACATCAGATTCAGGAGCT
      TTTCAAAGAACTGCATCGACATCCGTAACTTCGTTTCAAAAGATTCCAATTCTCAGTTTC
      AGCTGAATCTGGTAGATACCATCTTTACATATCGTATGCTTGTCATGGGCTTCTAGATGC
      CTTTCATACTTAAAGATCAAAGGACTTGACGATGCAATAAGCTTCTCGTCTGTAAAACCC"
    # @trn_table = Biolib::Emboss.ajTrnNewI(1)
    @trn_table = Bio::Big::TranslationAdapter.translation_table(1)
    @predictorf = PredictORF.new(id,descr,sequence,@trn_table)
    @orflist = @predictorf.stopstop(0)
    # @orflist.each_with_index do | orf,i | p [i,orf.descr,orf.aa.seq,orf.nt.seq] end
    # >EMBOSS_001_1
    # IISNTSFLSLASKFTTRGSRLQCTVSRARSAVDETSDSGAFQRTASTSVTSFQKIPILSF
    # S*IW*IPSLHIVCLSWASRCLSYLKIKGLDDAISFSSVKP
    # >EMBOSS_001_2
    # SLATPASSLSLQSSLLVDLVFSVQYQGLDLRWMRHQIQELFKELHRHP*LRFKRFQFSVS
    # AESGRYHLYISYACHGLLDAFHT*RSKDLTMQ*ASRL*N
    # >EMBOSS_001_3
    # H*QHQLPLSRFKVHYSWISSSVYSIKGSICGG*DIRFRSFSKNCIDIRNFVSKDSNSQFQ
    # LNLVDTIFTYRMLVMGF*MPFILKDQRT*RCNKLLVCKT
    # >EMBOSS_001_4
    # GFYRREAYCIVKSFDL*V*KASRSP*QAYDM*RWYLPDSAETENWNLLKRSYGCRCSSLK
    # SS*I*CLIHRRSSP*YCTLKTRSTSSEL*SEREEAGVAND
    # >EMBOSS_001_5
    # VLQTRSLLHRQVL*SLSMKGI*KPMTSIRYVKMVSTRFS*N*ELESFETKLRMSMQFFEK
    # LLNLMSHPPQIEPLILYTEDEIHE**TLKRERGSWCC**X
    # >EMBOSS_001_6
    # GFTDEKLIASSSPLIFKYERHLEAHDKHTICKDGIYQIQLKLRIGIF*NEVTDVDAVL*K
    # APESDVSSTADRALDTVH*RRDPRVVNFEARERKLVLLMX
  end

  it "stopstop(0) should render ORFs" do
    @orflist[0].aa.seq[0..3].should == "IISN"
    @orflist[13].aa.seq[0..3].should == "GFYR"
    @orflist[22].aa.seq[0..3].should == "VLQT"
  end

  it "stopstop(0) should render 32 ORFs" do
    @orflist.size.should == 32
  end

  it "startstop(5) should render ORFs starting with a start codon" do
    orflist = @predictorf.startstop(5)
    # orflist.each do | orf | p [orf.descr,orf] end
    orflist[0].aa.seq.should == "MPFILKDQRT"
    orflist.size.should == 1
  end

  it "should never return an empty sequence" do
    orflist = @predictorf.stopstop(0)
    orflist.each do | orf |
      # `>= 0` holds for every string, so it could never catch an
      # empty sequence
      orf.nt.seq.size.should > 0
    end
  end

  # minsize is given in amino acids: 44 AA = 132 nt
  it "should return 4 sequences when the minsize is 44 AA (132 nt)" do
    orflist = @predictorf.stopstop(44)
    orflist.size.should == 4
  end

  # minsize is given in amino acids: 45 AA = 135 nt
  it "should return 3 sequences when the minsize is 45 AA (135 nt)" do
    orflist = @predictorf.stopstop(45)
    orflist.size.should == 3
  end

  # NOTE(review): this example is identical to the STOP codon example that
  # follows and checks entry 2 of the list, not a frame -1 ORF - confirm
  # which entry was meant before relying on it.
  it "should have -1 frame" do
    sequence = "ATTAGCAACACCAGCTTCCTCTCTCTCGCTTCAAAGTTCACTACTCGTGGATCTCGT"
    # >EMBOSS_001_4
    # TRSTSSEL*SEREEAGVAN
    predictorf = PredictORF.new('test','TEST',sequence,@trn_table)
    orflist = predictorf.stopstop(0)
    # orflist.each_with_index do | orf,i | p [i,orf.descr,orf.aa.seq,orf.nt.seq] end
    orflist[2].aa.seq[0..18].should == "QHQLPLSRFKVHYSWIS"
  end

  it "should correctly handle a sequence starting with a STOP codon" do
    sequence = "ATTAGCAACACCAGCTTCCTCTCTCTCGCTTCAAAGTTCACTACTCGTGGATCTCGT"
    # >EMBOSS_001_3
    # *QHQLPLSRFKVHYSWIS
    predictorf = PredictORF.new('test','TEST',sequence,@trn_table)
    orflist = predictorf.stopstop(0)
    # orflist.each_with_index do | orf,i | p [i,orf.descr,orf.aa.seq,orf.nt.seq] end
    orflist[2].aa.seq[0..18].should == "QHQLPLSRFKVHYSWIS"
  end

end
|