bio-bigbio 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +15 -0
- data/Gemfile.lock +34 -0
- data/LICENSE +34 -0
- data/README.rdoc +28 -0
- data/Rakefile +50 -0
- data/VERSION +1 -0
- data/bin/getorf +118 -0
- data/bin/nt2aa.rb +56 -0
- data/bio-bigbio.gemspec +102 -0
- data/doc/bigbio_getorf.wtex +14 -0
- data/lib/bigbio/adapters/translate.rb +64 -0
- data/lib/bigbio/db/blast/blastclust.rb +16 -0
- data/lib/bigbio/db/blast.rb +2 -0
- data/lib/bigbio/db/emitters/fasta_emitter.rb +48 -0
- data/lib/bigbio/db/emitters/orf_emitter.rb +289 -0
- data/lib/bigbio/db/fasta/fastaindex.rb +3 -0
- data/lib/bigbio/db/fasta/fastapairedreader.rb +19 -0
- data/lib/bigbio/db/fasta/fastapairedwriter.rb +21 -0
- data/lib/bigbio/db/fasta/fastareader.rb +132 -0
- data/lib/bigbio/db/fasta/fastarecord.rb +39 -0
- data/lib/bigbio/db/fasta/fastawriter.rb +20 -0
- data/lib/bigbio/db/fasta/indexer.rb +33 -0
- data/lib/bigbio/db/fasta.rb +13 -0
- data/lib/bigbio/environment.rb +12 -0
- data/lib/bigbio/sequence/predictorf.rb +140 -0
- data/lib/bigbio/sequence/translate.rb +52 -0
- data/lib/bigbio.rb +38 -0
- data/spec/emitter_spec.rb +265 -0
- data/spec/predictorf_spec.rb +199 -0
- data/test/data/EMBOSS/EGC.1 +32 -0
- data/test/data/fasta/nt.fa +1000 -0
- data/test/doctest/test_fasta.rb +112 -0
- data/test/doctest/test_frames.rb +76 -0
- data/test/doctest/test_getorf.rb +154 -0
- data/test/doctest/test_paired.rb +55 -0
- data/test/performance/translate_with_biolib.rb +67 -0
- data/test/performance/translate_with_bioruby.rb +64 -0
- metadata +163 -0
@@ -0,0 +1,112 @@
|
|
1
|
+
# Ruby DocTest
|
2
|
+
#
|
3
|
+
# BioBig FastaReader and FastaWriter implementation
|
4
|
+
#
|
5
|
+
# Run with ./runner.rb or
|
6
|
+
#
|
7
|
+
# ../../../biolib/tools/rubydoctest/bin/rubydoctest test_fasta.rb
|
8
|
+
#
|
9
|
+
# Documentation with rd2 -r rd/rd2html-lib *.rb
|
10
|
+
|
11
|
+
cwd = File.dirname(__FILE__)
|
12
|
+
Dir.chdir(cwd)
|
13
|
+
|
14
|
+
# $: << '../../../mappings/swig/ruby/biolib_core/'
|
15
|
+
# $: << '../../../mappings/swig/ruby/biolib_R/'
|
16
|
+
|
17
|
+
# require 'biolib/biolib_core'
|
18
|
+
|
19
|
+
# Biolib::Biolib_core.biolib_setloglevel(7)
|
20
|
+
|
21
|
+
if $UNITTEST
|
22
|
+
|
23
|
+
=begin
|
24
|
+
|
25
|
+
>> $: << '../../lib'
|
26
|
+
>> require 'bigbio'
|
27
|
+
>> TESTDIR = '../data/fasta'
|
28
|
+
>> nt_FILE = TESTDIR + "/nt.fa"
|
29
|
+
>> AA_FILE = TESTDIR + "/aa.fa"
|
30
|
+
|
31
|
+
BioLib's reference FASTA reader and writer contain a number of useful properties.
|
32
|
+
First, RAM usage is limited. Second, the interface is simple and to the point.
|
33
|
+
|
34
|
+
Open a reader and walk the file to find a FASTA record
|
35
|
+
|
36
|
+
!> include ::BioBig
|
37
|
+
>> nt_in = FastaReader.new(nt_FILE)
|
38
|
+
>> fastarec = nil
|
39
|
+
>> nt_in.each { | rec | fastarec = rec if rec.id =~ /-Arabidopsis_thaliana-126/ }
|
40
|
+
>> fastarec.id
|
41
|
+
=> "PUT-157a-Arabidopsis_thaliana-126"
|
42
|
+
|
43
|
+
Open a writer and write the record
|
44
|
+
|
45
|
+
>> nt_out = FastaWriter.new("nt.fa")
|
46
|
+
>> nt_out.write fastarec
|
47
|
+
>> nt_out.close
|
48
|
+
|
49
|
+
Cleanup
|
50
|
+
|
51
|
+
>> File.unlink("nt.fa")
|
52
|
+
|
53
|
+
The ID in the FASTA file can vary between implementations. Therefore a regex is
|
54
|
+
allowed to find the ID - the default is /^(\S+)/ - or the first non-white space
|
55
|
+
making up the tag. Here we grab the first relevant number
|
56
|
+
|
57
|
+
>> nt_in = FastaReader.new(nt_FILE, :regex => '(\d+)\s')
|
58
|
+
>> nt_in.each { | rec | fastarec = rec if rec.id == "126" }
|
59
|
+
>> fastarec.id
|
60
|
+
=> "126"
|
61
|
+
|
62
|
+
In the first examples the FastaReader parses the whole file on demand. When
|
63
|
+
we use the :index option an indexer is built up at the same time (or the
|
64
|
+
first time an indexing function is used). So
|
65
|
+
|
66
|
+
>> nt_in = FastaReader.new(nt_FILE, :regex => '(\d+)\s', :index => true)
|
67
|
+
>> rec = nt_in.get("122")
|
68
|
+
>> rec.id
|
69
|
+
=> "122"
|
70
|
+
|
71
|
+
Fetching more information from the FASTA file does not parse the whole file
|
72
|
+
again
|
73
|
+
|
74
|
+
>> nt_in.get("121").id
|
75
|
+
=> "121"
|
76
|
+
>> nt_in.get("121").seq
|
77
|
+
=> "CAATTTTTTAAACATTTACTGGTTACTAAATTTGGAGATAGTATCACATTTCTAAAGGGTAAGTTGGAAAATAAATTTACAGAAAAATTATAAGTATAAAAAGTATACAGATGGATTACTTAGACAGCAGCGGGTGTGGGGGCTGATGCGGAGTGGTCGTGGTCGAAGAAGGAACCCGGTTTCCGGGGGGATCTCAGTAAACTCGGAAAGAATGGCCCGAATTCGTCACCGCCAACTAGTTCTTCCTCGAGAAGCACTTCAACGAGCTTATCATGGGCTTCACGATTGTTCTTTATGTGGGTTAGAGCTATCTCATATGCACTGGCTGATAGTTTCTTCACCGGCAGAATCAATGTCTTCTGGAAGCTTCTCAGAAATGGAGTTTCTTGGCACCATCCT"
|
78
|
+
|
79
|
+
It is also possible to fetch a numbered record
|
80
|
+
|
81
|
+
>> rec = nt_in.get_by_index(0)
|
82
|
+
>> rec.id
|
83
|
+
=> "1"
|
84
|
+
>> rec.descr
|
85
|
+
=> "PUT-157a-Arabidopsis_thaliana-1\tPlantGDB-assembled Unique Transcript-fragment derived from Arabidopsis_thaliana mRNAs Jan_15_2007 (based on GenBank release 157)."
|
86
|
+
|
87
|
+
=end
|
88
|
+
|
89
|
+
|
90
|
+
$: << '../../lib'
|
91
|
+
require 'biobig'
|
92
|
+
require 'test/unit'
|
93
|
+
|
94
|
+
TESTDIR = '../../../test/data/fasta'
|
95
|
+
nt_FILE = TESTDIR + "/nt.fa"
|
96
|
+
AA_FILE = TESTDIR + "/aa.fa"
|
97
|
+
|
98
|
+
class TestBiolibFasta < Test::Unit::TestCase
|
99
|
+
|
100
|
+
def setup
|
101
|
+
end
|
102
|
+
|
103
|
+
def test_indexer
|
104
|
+
nt_in = FastaReader.new(nt_FILE, :regex => '(\d+)\s', :index => true)
|
105
|
+
rec = nt_in.get("122")
|
106
|
+
assert_equal("122",rec.id)
|
107
|
+
assert_equal("121",nt_in.get("121").id)
|
108
|
+
end
|
109
|
+
|
110
|
+
end
|
111
|
+
|
112
|
+
end # $UNITTEST
|
@@ -0,0 +1,76 @@
|
|
1
|
+
# Ruby DocTest
|
2
|
+
#
|
3
|
+
# Translate a nucleotide sequence into six reading frames using the fast
|
4
|
+
# EMBOSS transeq function
|
5
|
+
#
|
6
|
+
# Run with ./runner.rb or
|
7
|
+
#
|
8
|
+
#   env DATA=../../../biolib/src/test/data/emboss/ ruby ../../../biolib/tools/rubydoctest/bin/rubydoctest test_frames.rb
|
9
|
+
#
|
10
|
+
# or
|
11
|
+
#
|
12
|
+
#   ruby ../../../biolib/tools/rubydoctest/bin/rubydoctest test_frames.rb
|
13
|
+
#
|
14
|
+
# Documentation with rd2 -r rd/rd2html-lib *.rb
|
15
|
+
|
16
|
+
cwd = File.dirname(__FILE__)
|
17
|
+
Dir.chdir(cwd)
|
18
|
+
|
19
|
+
$: << "../../lib"
|
20
|
+
|
21
|
+
if $UNITTEST
|
22
|
+
|
23
|
+
=begin
|
24
|
+
|
25
|
+
BigBio's frame translation uses the rapid Biolib::Emboss::transeq function to
|
26
|
+
translate Nucleotide sequences to Amino Acid sequences.
|
27
|
+
|
28
|
+
>> require 'bigbio/sequence/translate'
|
29
|
+
|
30
|
+
>> id = "PUT-157a-Arabidopsis_thaliana-126"
|
31
|
+
>> descr = "PlantGDB Arabidopsis_thaliana Jan_15_2007"
|
32
|
+
>> sequence = "ATCATTAGCAACACCAGCTTCCTCTCTCTCGCTTCAAAGTTCACTACTCGTGGATCTCGT
|
33
|
+
CTTCAGTGTACAGTATCAAGGGCTCGATCTGCGGTGGATGAGACATCAGATTCAGGAGCT
|
34
|
+
TTTCAAAGAACTGCATCGACATCCGTAACTTCGTTTCAAAAGATTCCAATTCTCAGTTTC
|
35
|
+
AGCTGAATCTGGTAGATACCATCTTTACATATCGTATGCTTGTCATGGGCTTCTAGATGC
|
36
|
+
CTTTCATACTTAAAGATCAAAGGACTTGACGATGCAATAAGCTTCTCGTCTGTAAAACCC"
|
37
|
+
|
38
|
+
|
39
|
+
>> translate = Nucleotide::Translate.new(1)
|
40
|
+
>> list = translate.aa_frames(sequence)
|
41
|
+
|
42
|
+
We should have six frames
|
43
|
+
|
44
|
+
>> list.size
|
45
|
+
=> 6
|
46
|
+
|
47
|
+
>> aa = list.first
|
48
|
+
>> aa[:frame]
|
49
|
+
=> 1
|
50
|
+
|
51
|
+
This result matches the one from the EMBOSS web interface:
|
52
|
+
|
53
|
+
>> aa[:sequence]
|
54
|
+
=> "IISNTSFLSLASKFTTRGSRLQCTVSRARSAVDETSDSGAFQRTASTSVTSFQKIPILSFS*IW*IPSLHIVCLSWASRCLSYLKIKGLDDAISFSSVKP"
|
55
|
+
|
56
|
+
=end
|
57
|
+
|
58
|
+
|
59
|
+
$: << '..'
|
60
|
+
require 'db/fasta'
|
61
|
+
require 'test/unit'
|
62
|
+
|
63
|
+
TESTDIR = '../../../test/data/fasta'
|
64
|
+
AA_FILE = TESTDIR + "/aa.fa"
|
65
|
+
|
66
|
+
# class TestBiolibFasta < Test::Unit::TestCase
|
67
|
+
#
|
68
|
+
# def setup
|
69
|
+
# end
|
70
|
+
#
|
71
|
+
# def test_indexer
|
72
|
+
# end
|
73
|
+
#
|
74
|
+
# end
|
75
|
+
|
76
|
+
end # $UNITTEST
|
@@ -0,0 +1,154 @@
|
|
1
|
+
# Ruby DocTest
|
2
|
+
#
|
3
|
+
# Test getorf functionality.
|
4
|
+
#
|
5
|
+
# Run with ./runner.rb or
|
6
|
+
#
|
7
|
+
# env DATA=../data/EMBOSS/ ruby ../../../biolib/tools/rubydoctest/bin/rubydoctest test_getorf.rb
|
8
|
+
#
|
9
|
+
# Documentation with rd2 -r rd/rd2html-lib *.rb
|
10
|
+
|
11
|
+
cwd = File.dirname(__FILE__)
|
12
|
+
Dir.chdir(cwd)
|
13
|
+
|
14
|
+
$: << "../../lib"
|
15
|
+
|
16
|
+
if $UNITTEST
|
17
|
+
|
18
|
+
=begin
|
19
|
+
|
20
|
+
BigBio's ORF predictor uses the rapid Biolib::Emboss::transeq function to
|
21
|
+
translate Nucleotide sequences to Amino Acid sequences. Next it allows
|
22
|
+
several heuristics to select potential ORF's and returns them with their
|
23
|
+
reading frame and position in the nucleotide sequence. For example all
|
24
|
+
ORF's can be returned in the six reading frames, or simply the longest one.
|
25
|
+
|
26
|
+
One of the advantages of PredictORF is that it can return both the amino acid
|
27
|
+
and exactly matching nucleotide sequences which can be useful when calculating
|
28
|
+
dN/dS (or Ka/Ks) ratios, for example.
|
29
|
+
|
30
|
+
>> require 'bigbio/sequence/predictorf'
|
31
|
+
|
32
|
+
>> id = "PUT-157a-Arabidopsis_thaliana-126"
|
33
|
+
>> descr = "PlantGDB Arabidopsis_thaliana Jan_15_2007"
|
34
|
+
>> sequence = "ATCATTAGCAACACCAGCTTCCTCTCTCTCGCTTCAAAGTTCACTACTCGTGGATCTCGT
|
35
|
+
CTTCAGTGTACAGTATCAAGGGCTCGATCTGCGGTGGATGAGACATCAGATTCAGGAGCT
|
36
|
+
TTTCAAAGAACTGCATCGACATCCGTAACTTCGTTTCAAAAGATTCCAATTCTCAGTTTC
|
37
|
+
AGCTGAATCTGGTAGATACCATCTTTACATATCGTATGCTTGTCATGGGCTTCTAGATGC
|
38
|
+
CTTTCATACTTAAAGATCAAAGGACTTGACGATGCAATAAGCTTCTCGTCTGTAAAACCC"
|
39
|
+
|
40
|
+
Pick up the EMBOSS translation table:
|
41
|
+
|
42
|
+
>> trn_table = Biolib::Emboss.ajTrnNewI(1)
|
43
|
+
|
44
|
+
Initiate the ORF prediction class
|
45
|
+
|
46
|
+
>> predict = PredictORF.new(id,descr,sequence,trn_table)
|
47
|
+
|
48
|
+
The methods return a list of ORFCandidate ordered by sequence length. Here
|
49
|
+
we look for all ORF's between STOP codons (with a minimal size of 30 AA).
|
50
|
+
|
51
|
+
>> orflist = predict.stopstop
|
52
|
+
>> orflist.size
|
53
|
+
=> 9
|
54
|
+
|
55
|
+
Get the first (and largest) ORF
|
56
|
+
|
57
|
+
>> orf = orflist.first
|
58
|
+
|
59
|
+
The id contains the number of the ORF at the last position (like EMBOSS'
|
60
|
+
getorf does)
|
61
|
+
|
62
|
+
>> orf.id
|
63
|
+
=> "PUT-157a-Arabidopsis_thaliana-126_1"
|
64
|
+
|
65
|
+
The description contains 'XX' for the STOPSTOP search. Unlike getorf it shows
|
66
|
+
the reading frame.
|
67
|
+
|
68
|
+
>> orf.descr
|
69
|
+
=> "[XX +1 0 - 183; 183/300] PlantGDB Arabidopsis_thaliana Jan_15_2007"
|
70
|
+
>> orf.aa.seq.size
|
71
|
+
=> 61
|
72
|
+
>> orf.aa.seq
|
73
|
+
=> "IISNTSFLSLASKFTTRGSRLQCTVSRARSAVDETSDSGAFQRTASTSVTSFQKIPILSFS"
|
74
|
+
|
75
|
+
The ORF object contains more information:
|
76
|
+
|
77
|
+
>> orf.nt.start
|
78
|
+
=> 0
|
79
|
+
>> orf.frame
|
80
|
+
=> 1
|
81
|
+
|
82
|
+
The matching sequence with the AA
|
83
|
+
|
84
|
+
>> orf.nt.seq
|
85
|
+
=> "ATCATTAGCAACACCAGCTTCCTCTCTCTCGCTTCAAAGTTCACTACTCGTGGATCTCGTCTTCAGTGTACAGTATCAAGGGCTCGATCTGCGGTGGATGAGACATCAGATTCAGGAGCTTTTCAAAGAACTGCATCGACATCCGTAACTTCGTTTCAAAAGATTCCAATTCTCAGTTTCAGC"
|
86
|
+
|
87
|
+
And it keeps track of the full nucleotide sequence
|
88
|
+
|
89
|
+
>> orf.nt.fullseq
|
90
|
+
=> "ATCATTAGCAACACCAGCTTCCTCTCTCTCGCTTCAAAGTTCACTACTCGTGGATCTCGTCTTCAGTGTACAGTATCAAGGGCTCGATCTGCGGTGGATGAGACATCAGATTCAGGAGCTTTTCAAAGAACTGCATCGACATCCGTAACTTCGTTTCAAAAGATTCCAATTCTCAGTTTCAGCTGAATCTGGTAGATACCATCTTTACATATCGTATGCTTGTCATGGGCTTCTAGATGCCTTTCATACTTAAAGATCAAAGGACTTGACGATGCAATAAGCTTCTCGTCTGTAAAACCC"
|
91
|
+
|
92
|
+
Let's check one of the others
|
93
|
+
|
94
|
+
>> orf = orflist[3]
|
95
|
+
>> orf.frame
|
96
|
+
=> 3
|
97
|
+
>> orf.nt.start
|
98
|
+
=> 101
|
99
|
+
>> orf.nt.stop
|
100
|
+
=> 233
|
101
|
+
>> orf.aa.seq
|
102
|
+
=> "DIRFRSFSKNCIDIRNFVSKDSNSQFQLNLVDTIFTYRMLVMGF"
|
103
|
+
>> orf.nt.seq
|
104
|
+
=> "GACATCAGATTCAGGAGCTTTTCAAAGAACTGCATCGACATCCGTAACTTCGTTTCAAAAGATTCCAATTCTCAGTTTCAGCTGAATCTGGTAGATACCATCTTTACATATCGTATGCTTGTCATGGGCTTC"
|
105
|
+
>> orf.nt.fullseq[orf.nt.start..orf.nt.stop]
|
106
|
+
=> "GACATCAGATTCAGGAGCTTTTCAAAGAACTGCATCGACATCCGTAACTTCGTTTCAAAAGATTCCAATTCTCAGTTTCAGCTGAATCTGGTAGATACCATCTTTACATATCGTATGCTTGTCATGGGCTTCT"
|
107
|
+
|
108
|
+
Naming for each ORF
|
109
|
+
|
110
|
+
>> orf.id
|
111
|
+
=> "PUT-157a-Arabidopsis_thaliana-126_6"
|
112
|
+
>> orf.descr
|
113
|
+
=> "[XX +3 101 - 233; 132/300] PlantGDB Arabidopsis_thaliana Jan_15_2007"
|
114
|
+
|
115
|
+
The ORF are sorted by size, so if you want to know the size of the smallest ORF
|
116
|
+
|
117
|
+
>> orflist.last.aa.seq.size
|
118
|
+
=> 30
|
119
|
+
|
120
|
+
STOPSTOP (the stopstop method above) is just one heuristic. You can use
|
121
|
+
startstop to get a list of ORF's with START codon:
|
122
|
+
|
123
|
+
>> orflist = predict.startstop
|
124
|
+
>> orflist.size
|
125
|
+
=> 0
|
126
|
+
|
127
|
+
Another one is to get the longest likely ORF with
|
128
|
+
|
129
|
+
>> longest = predict.longest_startstop
|
130
|
+
>> longest.aa.seq.size
|
131
|
+
=> 21
|
132
|
+
|
133
|
+
|
134
|
+
=end
|
135
|
+
|
136
|
+
|
137
|
+
$: << '..'
|
138
|
+
require 'db/fasta'
|
139
|
+
require 'test/unit'
|
140
|
+
|
141
|
+
TESTDIR = '../../../test/data/fasta'
|
142
|
+
AA_FILE = TESTDIR + "/aa.fa"
|
143
|
+
|
144
|
+
# class TestBiolibFasta < Test::Unit::TestCase
|
145
|
+
#
|
146
|
+
# def setup
|
147
|
+
# end
|
148
|
+
#
|
149
|
+
# def test_indexer
|
150
|
+
# end
|
151
|
+
#
|
152
|
+
# end
|
153
|
+
|
154
|
+
end # $UNITTEST
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# Ruby DocTest
|
2
|
+
#
|
3
|
+
# Test paired nt+AA sequence functionality.
|
4
|
+
#
|
5
|
+
# Run with ./runner.rb or
|
6
|
+
#
|
7
|
+
# ../../../../tools/rubydoctest/bin/rubydoctest test_paired.rb
|
8
|
+
#
|
9
|
+
# Documentation with rd2 -r rd/rd2html-lib *.rb
|
10
|
+
|
11
|
+
cwd = File.dirname(__FILE__)
|
12
|
+
Dir.chdir(cwd)
|
13
|
+
|
14
|
+
# $: << '../../../mappings/swig/ruby/rqtl/'
|
15
|
+
|
16
|
+
# require 'biolib/biolib_core'
|
17
|
+
# Biolib::Biolib_core.biolib_setloglevel(7)
|
18
|
+
|
19
|
+
if $UNITTEST
|
20
|
+
|
21
|
+
=begin
|
22
|
+
|
23
|
+
>> $: << '..'
|
24
|
+
!> require 'bio/sequence2'
|
25
|
+
|
26
|
+
Sequence pairs are paired NT and AA sequences where one can be tested against
|
27
|
+
the other (through translation) and an nt sequence can be aligned against an AA
|
28
|
+
alignment (protein alignment to nucleotide alignment, also known as pal2nal).
|
29
|
+
|
30
|
+
=end
|
31
|
+
|
32
|
+
|
33
|
+
$: << '..'
|
34
|
+
require 'db/fasta'
|
35
|
+
require 'test/unit'
|
36
|
+
|
37
|
+
TESTDIR = '../../../test/data/fasta'
|
38
|
+
nt_FILE = TESTDIR + "/nt.fa"
|
39
|
+
AA_FILE = TESTDIR + "/aa.fa"
|
40
|
+
|
41
|
+
class TestBiolibFasta < Test::Unit::TestCase
|
42
|
+
|
43
|
+
def setup
|
44
|
+
end
|
45
|
+
|
46
|
+
def test_indexer
|
47
|
+
nt_in = FastaReader.new(nt_FILE, :regex => '(\d+)\s', :index => true)
|
48
|
+
rec = nt_in.get("122")
|
49
|
+
assert_equal("122",rec.id)
|
50
|
+
assert_equal("121",nt_in.get("121").id)
|
51
|
+
end
|
52
|
+
|
53
|
+
end
|
54
|
+
|
55
|
+
end # $UNITTEST
|
@@ -0,0 +1,67 @@
|
|
1
|
+
#! /usr/bin/ruby
|
2
|
+
#
|
3
|
+
# Performance testing routine for translating a FASTA file into six
|
4
|
+
# reading frames using the Biolib (EMBOSS) routines.
|
5
|
+
#
|
6
|
+
# by Pjotr Prins (c) 2009
|
7
|
+
#
|
8
|
+
# To reduce the impact of file IO you can run multiple iterations using
|
9
|
+
# a command line switch.
|
10
|
+
#
|
11
|
+
# Usage:
|
12
|
+
#
|
13
|
+
# ruby -Ipath_to_biolib translate_with_biolib.rb --iter 10 nucleotides.fasta
|
14
|
+
#
|
15
|
+
# Example:
|
16
|
+
#
|
17
|
+
#   time ruby -I~/izip/git/opensource/biolib/lib/ translate_with_biolib.rb --iter 10 ../data/fasta/nt.fa > test.out
|
18
|
+
#
|
19
|
+
# Renders on my machine:
|
20
|
+
#
|
21
|
+
# 96 records 5760 times translated!
|
22
|
+
# real 0m0.290s
|
23
|
+
# user 0m0.252s
|
24
|
+
# sys 0m0.024s
|
25
|
+
#
|
26
|
+
# with a large file
|
27
|
+
#
|
28
|
+
# 22929 records 137574 times translated!
|
29
|
+
# real 0m20.306s
|
30
|
+
# user 0m15.997s
|
31
|
+
# sys 0m1.344s
|
32
|
+
|
33
|
+
|
34
|
+
$: << '../../lib'
|
35
|
+
|
36
|
+
require 'biolib/emboss'
|
37
|
+
require 'bigbio'
|
38
|
+
|
39
|
+
iter=1
|
40
|
+
fn = ARGV.shift
|
41
|
+
|
42
|
+
if fn == '--iter'
|
43
|
+
iter = ARGV.shift.to_i
|
44
|
+
fn = ARGV.shift
|
45
|
+
end
|
46
|
+
|
47
|
+
nt = FastaReader.new(fn)
|
48
|
+
trnTable = Biolib::Emboss.ajTrnNewI(1);
|
49
|
+
|
50
|
+
nt.each { | rec |
|
51
|
+
(0..iter).each do | repeat |
|
52
|
+
ajpseq = Biolib::Emboss.ajSeqNewNameC(rec.seq,"Test sequence")
|
53
|
+
|
54
|
+
[-3,-2,-1,1,2,3].each do | frame |
|
55
|
+
ajpseqt = Biolib::Emboss.ajTrnSeqOrig(trnTable,ajpseq,frame)
|
56
|
+
aa = Biolib::Emboss.ajSeqGetSeqCopyC(ajpseqt)
|
57
|
+
print "> ",rec.id," ",frame.to_s,"\n"
|
58
|
+
print aa,"\n"
|
59
|
+
end
|
60
|
+
end
|
61
|
+
}
|
62
|
+
|
63
|
+
$stderr.print nt.size," records ",nt.size*6*iter," times translated!"
|
64
|
+
|
65
|
+
|
66
|
+
|
67
|
+
|
@@ -0,0 +1,64 @@
|
|
1
|
+
#! /usr/bin/ruby
|
2
|
+
#
|
3
|
+
# Performance testing routine for translating a FASTA file into six
|
4
|
+
# reading frames using the Bioruby routines.
|
5
|
+
#
|
6
|
+
# by Pjotr Prins (c) 2009
|
7
|
+
#
|
8
|
+
# To reduce the impact of file IO you can run multiple iterations using
|
9
|
+
# a command line switch.
|
10
|
+
#
|
11
|
+
# Usage:
|
12
|
+
#
|
13
|
+
# ruby -Ipath_to_bioruby translate_with_bioruby.rb --iter 10 nucleotides.fasta
|
14
|
+
#
|
15
|
+
# Example:
|
16
|
+
#
|
17
|
+
# time ruby -I~/izip/git/opensource/bioruby/lib/ translate_with_bioruby.rb --iter 10 ../data/fasta/nt.fa > test.out
|
18
|
+
#
|
19
|
+
# Renders on my machine:
|
20
|
+
#
|
21
|
+
# 96 records 5760 times translated!
|
22
|
+
# real 0m6.414s
|
23
|
+
# user 0m5.928s
|
24
|
+
# sys 0m0.384s
|
25
|
+
#
|
26
|
+
# with a large file
|
27
|
+
#
|
28
|
+
# 22929 records 137574 times translated!
|
29
|
+
# real 9m30.952s
|
30
|
+
# user 8m42.877s
|
31
|
+
# sys 0m32.878s
|
32
|
+
|
33
|
+
|
34
|
+
|
35
|
+
$: << '../../lib'
|
36
|
+
|
37
|
+
require 'bio'
|
38
|
+
require 'bigbio'
|
39
|
+
|
40
|
+
iter=1
|
41
|
+
fn = ARGV.shift
|
42
|
+
|
43
|
+
if fn == '--iter'
|
44
|
+
iter = ARGV.shift.to_i
|
45
|
+
fn = ARGV.shift
|
46
|
+
end
|
47
|
+
|
48
|
+
nt = FastaReader.new(fn)
|
49
|
+
|
50
|
+
nt.each { | rec |
|
51
|
+
(0..iter).each do | repeat |
|
52
|
+
seq = Bio::Sequence::NA.new(rec.seq)
|
53
|
+
[-3,-2,-1,1,2,3].each do | frame |
|
54
|
+
print "> ",rec.id," ",frame.to_s,"\n"
|
55
|
+
print seq.translate(frame),"\n"
|
56
|
+
end
|
57
|
+
end
|
58
|
+
}
|
59
|
+
|
60
|
+
$stderr.print nt.size," records ",nt.size*6*iter," times translated!"
|
61
|
+
|
62
|
+
|
63
|
+
|
64
|
+
|
metadata
ADDED
@@ -0,0 +1,163 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bio-bigbio
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Pjotr Prins
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-01-30 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: bio
|
16
|
+
requirement: &12808920 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 1.4.1
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *12808920
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: bio-logger
|
27
|
+
requirement: &12807920 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 0.9.0
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *12807920
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: rspec
|
38
|
+
requirement: &12806940 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ~>
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 2.3.0
|
44
|
+
type: :development
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *12806940
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: bundler
|
49
|
+
requirement: &12780320 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ~>
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 1.0.0
|
55
|
+
type: :development
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *12780320
|
58
|
+
- !ruby/object:Gem::Dependency
|
59
|
+
name: jeweler
|
60
|
+
requirement: &12779320 !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ~>
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: 1.5.2
|
66
|
+
type: :development
|
67
|
+
prerelease: false
|
68
|
+
version_requirements: *12779320
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rcov
|
71
|
+
requirement: &12778120 !ruby/object:Gem::Requirement
|
72
|
+
none: false
|
73
|
+
requirements:
|
74
|
+
- - ! '>='
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '0'
|
77
|
+
type: :development
|
78
|
+
prerelease: false
|
79
|
+
version_requirements: *12778120
|
80
|
+
description: Fasta reader, ORF emitter, sequence translation
|
81
|
+
email: pjotr.public01@thebird.nl
|
82
|
+
executables:
|
83
|
+
- getorf
|
84
|
+
- nt2aa.rb
|
85
|
+
extensions: []
|
86
|
+
extra_rdoc_files:
|
87
|
+
- LICENSE
|
88
|
+
- README.rdoc
|
89
|
+
files:
|
90
|
+
- Gemfile
|
91
|
+
- Gemfile.lock
|
92
|
+
- LICENSE
|
93
|
+
- README.rdoc
|
94
|
+
- Rakefile
|
95
|
+
- VERSION
|
96
|
+
- bin/getorf
|
97
|
+
- bin/nt2aa.rb
|
98
|
+
- bio-bigbio.gemspec
|
99
|
+
- doc/bigbio_getorf.wtex
|
100
|
+
- lib/bigbio.rb
|
101
|
+
- lib/bigbio/adapters/translate.rb
|
102
|
+
- lib/bigbio/db/blast.rb
|
103
|
+
- lib/bigbio/db/blast/blastclust.rb
|
104
|
+
- lib/bigbio/db/emitters/fasta_emitter.rb
|
105
|
+
- lib/bigbio/db/emitters/orf_emitter.rb
|
106
|
+
- lib/bigbio/db/fasta.rb
|
107
|
+
- lib/bigbio/db/fasta/fastaindex.rb
|
108
|
+
- lib/bigbio/db/fasta/fastapairedreader.rb
|
109
|
+
- lib/bigbio/db/fasta/fastapairedwriter.rb
|
110
|
+
- lib/bigbio/db/fasta/fastareader.rb
|
111
|
+
- lib/bigbio/db/fasta/fastarecord.rb
|
112
|
+
- lib/bigbio/db/fasta/fastawriter.rb
|
113
|
+
- lib/bigbio/db/fasta/indexer.rb
|
114
|
+
- lib/bigbio/environment.rb
|
115
|
+
- lib/bigbio/sequence/predictorf.rb
|
116
|
+
- lib/bigbio/sequence/translate.rb
|
117
|
+
- spec/emitter_spec.rb
|
118
|
+
- spec/predictorf_spec.rb
|
119
|
+
- test/data/EMBOSS/EGC.1
|
120
|
+
- test/data/fasta/nt.fa
|
121
|
+
- test/doctest/test_fasta.rb
|
122
|
+
- test/doctest/test_frames.rb
|
123
|
+
- test/doctest/test_getorf.rb
|
124
|
+
- test/doctest/test_paired.rb
|
125
|
+
- test/performance/translate_with_biolib.rb
|
126
|
+
- test/performance/translate_with_bioruby.rb
|
127
|
+
homepage: http://github.com/pjotrp/bioruby-bigbioruby
|
128
|
+
licenses:
|
129
|
+
- MIT
|
130
|
+
post_install_message:
|
131
|
+
rdoc_options: []
|
132
|
+
require_paths:
|
133
|
+
- lib
|
134
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
135
|
+
none: false
|
136
|
+
requirements:
|
137
|
+
- - ! '>='
|
138
|
+
- !ruby/object:Gem::Version
|
139
|
+
version: '0'
|
140
|
+
segments:
|
141
|
+
- 0
|
142
|
+
hash: -3977311402144334707
|
143
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
144
|
+
none: false
|
145
|
+
requirements:
|
146
|
+
- - ! '>='
|
147
|
+
- !ruby/object:Gem::Version
|
148
|
+
version: '0'
|
149
|
+
requirements: []
|
150
|
+
rubyforge_project:
|
151
|
+
rubygems_version: 1.8.10
|
152
|
+
signing_key:
|
153
|
+
specification_version: 3
|
154
|
+
summary: Low memory sequence emitters
|
155
|
+
test_files:
|
156
|
+
- spec/emitter_spec.rb
|
157
|
+
- spec/predictorf_spec.rb
|
158
|
+
- test/doctest/test_fasta.rb
|
159
|
+
- test/doctest/test_frames.rb
|
160
|
+
- test/doctest/test_getorf.rb
|
161
|
+
- test/doctest/test_paired.rb
|
162
|
+
- test/performance/translate_with_biolib.rb
|
163
|
+
- test/performance/translate_with_bioruby.rb
|