bio-bigbio 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +15 -0
- data/Gemfile.lock +34 -0
- data/LICENSE +34 -0
- data/README.rdoc +28 -0
- data/Rakefile +50 -0
- data/VERSION +1 -0
- data/bin/getorf +118 -0
- data/bin/nt2aa.rb +56 -0
- data/bio-bigbio.gemspec +102 -0
- data/doc/bigbio_getorf.wtex +14 -0
- data/lib/bigbio/adapters/translate.rb +64 -0
- data/lib/bigbio/db/blast/blastclust.rb +16 -0
- data/lib/bigbio/db/blast.rb +2 -0
- data/lib/bigbio/db/emitters/fasta_emitter.rb +48 -0
- data/lib/bigbio/db/emitters/orf_emitter.rb +289 -0
- data/lib/bigbio/db/fasta/fastaindex.rb +3 -0
- data/lib/bigbio/db/fasta/fastapairedreader.rb +19 -0
- data/lib/bigbio/db/fasta/fastapairedwriter.rb +21 -0
- data/lib/bigbio/db/fasta/fastareader.rb +132 -0
- data/lib/bigbio/db/fasta/fastarecord.rb +39 -0
- data/lib/bigbio/db/fasta/fastawriter.rb +20 -0
- data/lib/bigbio/db/fasta/indexer.rb +33 -0
- data/lib/bigbio/db/fasta.rb +13 -0
- data/lib/bigbio/environment.rb +12 -0
- data/lib/bigbio/sequence/predictorf.rb +140 -0
- data/lib/bigbio/sequence/translate.rb +52 -0
- data/lib/bigbio.rb +38 -0
- data/spec/emitter_spec.rb +265 -0
- data/spec/predictorf_spec.rb +199 -0
- data/test/data/EMBOSS/EGC.1 +32 -0
- data/test/data/fasta/nt.fa +1000 -0
- data/test/doctest/test_fasta.rb +112 -0
- data/test/doctest/test_frames.rb +76 -0
- data/test/doctest/test_getorf.rb +154 -0
- data/test/doctest/test_paired.rb +55 -0
- data/test/performance/translate_with_biolib.rb +67 -0
- data/test/performance/translate_with_bioruby.rb +64 -0
- metadata +163 -0
data/Gemfile
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# Fetch gems over TLS so downloads cannot be altered in transit.
source "https://rubygems.org"

# Add dependencies required to use your gem here.
# Example:
#   gem "activesupport", ">= 2.3.5"
gem "bio", ">= 1.4.1"
gem "bio-logger", ">= 0.9.0"

# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
group :development do
  gem "rspec", "~> 2.3.0"
  gem "bundler", "~> 1.0.0"
  gem "jeweler", "~> 1.5.2"
  gem "rcov", ">= 0"
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
bio (1.4.1)
|
5
|
+
bio-logger (0.9.0)
|
6
|
+
log4r (>= 1.1.9)
|
7
|
+
diff-lcs (1.1.2)
|
8
|
+
git (1.2.5)
|
9
|
+
jeweler (1.5.2)
|
10
|
+
bundler (~> 1.0.0)
|
11
|
+
git (>= 1.2.5)
|
12
|
+
rake
|
13
|
+
log4r (1.1.9)
|
14
|
+
rake (0.8.7)
|
15
|
+
rcov (0.9.9)
|
16
|
+
rspec (2.3.0)
|
17
|
+
rspec-core (~> 2.3.0)
|
18
|
+
rspec-expectations (~> 2.3.0)
|
19
|
+
rspec-mocks (~> 2.3.0)
|
20
|
+
rspec-core (2.3.1)
|
21
|
+
rspec-expectations (2.3.0)
|
22
|
+
diff-lcs (~> 1.1.2)
|
23
|
+
rspec-mocks (2.3.0)
|
24
|
+
|
25
|
+
PLATFORMS
|
26
|
+
ruby
|
27
|
+
|
28
|
+
DEPENDENCIES
|
29
|
+
bio (>= 1.4.1)
|
30
|
+
bio-logger (>= 0.9.0)
|
31
|
+
bundler (~> 1.0.0)
|
32
|
+
jeweler (~> 1.5.2)
|
33
|
+
rcov
|
34
|
+
rspec (~> 2.3.0)
|
data/LICENSE
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
If a license is not specified the code contributed to BigBio defaults to the
|
2
|
+
BSD license:
|
3
|
+
|
4
|
+
Copyright (c) 2008, 2009 The BioLib Project
|
5
|
+
All rights reserved.
|
6
|
+
|
7
|
+
Redistribution and use in source and binary forms, with or without
|
8
|
+
modification, are permitted provided that the following conditions are met:
|
9
|
+
|
10
|
+
* Redistributions of source code must retain the above copyright notice,
|
11
|
+
this list of conditions and the following disclaimer.
|
12
|
+
|
13
|
+
* Redistributions in binary form must reproduce the above copyright notice,
|
14
|
+
this list of conditions and the following disclaimer in the documentation
|
15
|
+
and/or other materials provided with the distribution.
|
16
|
+
|
17
|
+
* Neither the name of the BioLib Project nor the names of
|
18
|
+
its contributors may be used to endorse or promote products derived from
|
19
|
+
this software without specific prior written permission.
|
20
|
+
|
21
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
22
|
+
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
23
|
+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
24
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
25
|
+
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
26
|
+
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
27
|
+
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
28
|
+
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
29
|
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
30
|
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
31
|
+
|
32
|
+
For more information on opensource software licenses see
|
33
|
+
http://www.opensource.org/licenses/bsd-license.php,
|
34
|
+
http://www.gnu.org/licenses/gpl.html and http://www.fsf.org/.
|
data/README.rdoc
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
= BIGBIO
|
2
|
+
|
3
|
+
BigBio = BIG DATA for Ruby
|
4
|
+
|
5
|
+
BigBio is an initiative to create high-performance libraries for big data
|
6
|
+
computing in biology.
|
7
|
+
|
8
|
+
BigBio may use BioLib C/C++/D functions for increasing performance and
|
9
|
+
reducing memory consumption.
|
10
|
+
|
11
|
+
This is an experimental project. If you wish to contribute subscribe
|
12
|
+
to the BioRuby and/or BioLib mailing lists.
|
13
|
+
|
14
|
+
== Functionality
|
15
|
+
|
16
|
+
* BigBio can translate nucleotide sequences to amino acid
|
17
|
+
sequences using an EMBOSS C function, or BioRuby's translator.
|
18
|
+
|
19
|
+
* BigBio has an ORF emitter which parses DNA/RNA sequences and emits
|
20
|
+
ORFs between START_STOP or STOP_STOP codons.
|
21
|
+
|
22
|
+
* BigBio has a FASTA file emitter, which iterates FASTA files and
|
23
|
+
iterates sequences without loading everything in memory.
|
24
|
+
|
25
|
+
== Copyright
|
26
|
+
|
27
|
+
Copyright (c) 2011-2012 Pjotr Prins. See LICENSE for further details.
|
28
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
# Rake tasks for the bio-bigbio gem: packaging through jeweler, specs
# through RSpec (optionally with rcov coverage) and API docs through RDoc.
require 'rubygems'
require 'bundler'
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  # Report the missing gems and exit with Bundler's own status code.
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "bio-bigbio"
  gem.homepage = "http://github.com/pjotrp/bioruby-bigbioruby"
  # NOTE(review): the LICENSE file describes a BSD license and bin/getorf
  # states "Ruby License" - confirm which one is intended before release.
  gem.license = "MIT"
  gem.summary = %Q{Low memory sequence emitters}
  gem.description = %Q{Fasta reader, ORF emitter, sequence translation}
  gem.email = "pjotr.public01@thebird.nl"
  gem.authors = ["Pjotr Prins"]
  # Include your dependencies below. Runtime dependencies are required when using your gem,
  # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
  #  gem.add_runtime_dependency 'jabber4r', '> 0.1'
  #  gem.add_development_dependency 'rspec', '> 1.2.3'
end
Jeweler::RubygemsDotOrgTasks.new

require 'rspec/core'
require 'rspec/core/rake_task'

# rake spec - run every *_spec.rb file below spec/
RSpec::Core::RakeTask.new(:spec) do |spec|
  spec.pattern = FileList['spec/**/*_spec.rb']
end

# rake rcov - run the same specs with rcov enabled
RSpec::Core::RakeTask.new(:rcov) do |spec|
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

task :default => :spec

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # Strip surrounding whitespace so a trailing newline in VERSION does not
  # end up inside the generated title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "bio-bigbio #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.1
|
data/bin/getorf
ADDED
@@ -0,0 +1,118 @@
|
|
1
|
+
#! /usr/bin/ruby
#
# Predict ORFs from nucleotide sequences using the BigBio predictors.
# The input is one or more FASTA files. For every predicted ORF this
# script prints a FASTA record (description line plus nucleotide
# sequence) to stdout.
#
# You can choose the heuristic on the command line (default stopstop).
#
# Author::    Pjotr Prins
# Copyright:: 2009-2011
# License::   Ruby License
#
# Copyright (C) 2009-2011 Pjotr Prins <pjotr.prins@thebird.nl>

rootpath = File.dirname(File.dirname(__FILE__))
$: << File.join(rootpath,'lib')

# File.read closes the file again, unlike File.new(...).read
BIGBIO_VERSION = File.read(File.join(rootpath,'VERSION')).chomp

USAGE =<<EOM
ruby #{__FILE__} [-h stopstop] [--min-size 30] inputfile(s)

Use --help for more options

EOM

# require 'biolib/emboss'
require 'bigbio'
require 'optparse'

$stderr.print "getorf BioRuby BigBio Plugin "+BIGBIO_VERSION+" Copyright (C) 2009-2011 Pjotr Prins <pjotr.prins@thebird.nl>\n\n"

# Default logging: to stderr at INFO level; options below may override.
Bio::Log::CLI.logger('stderr')
Bio::Log::CLI.trace('info')

heuristic = 'stopstop'
minsize = 30
# NOTE(review): longest_match is set by --longest but never read in this
# script, so the option currently has no visible effect.
longest_match = false

opts = OptionParser.new() { |parser|
  parser.on_tail("-?", "--help", "Print this message") {
    print(USAGE)
    print(parser)
    print <<EXAMPLE

EXAMPLE
    exit()
  }

  parser.on("-h heuristic", String, "Heuristic (stopstop)") do | s |
    heuristic = s
  end
  parser.on("-s size", "--min-size", Integer, "Minimal sequence size") do | n |
    minsize = n
  end
  parser.on("--longest", "Only get longest ORF match") do
    longest_match = true
  end

  parser.on("--logger filename",String,"Log to file (default stderr)") do | name |
    Bio::Log::CLI.logger(name)
  end

  parser.on("--trace options",String,"Set log level (default INFO, see bio-logger)
") do | s |
    Bio::Log::CLI.trace(s)
  end

  parser.on("-q", "--quiet", "Run quietly") do
    Bio::Log::CLI.trace('error')
  end

  parser.on("-v", "--verbose", "Run verbosely") do
    Bio::Log::CLI.trace('info')
  end

  parser.on("--debug", "Show debug messages") do
    # Only the log level is raised here. The previous extra assignment to
    # an undefined `options` object raised NameError whenever --debug was
    # given.
    Bio::Log::CLI.trace('debug')
  end
}
opts.parse!(ARGV)
if ARGV.size == 0
  print USAGE
  exit 1
end

Bio::Log::CLI.configure('bigbio')

# print "Heuristic is #{heuristic}\n"
# print "Minsize #{minsize}\n"

# Mixing in once is enough; it does not depend on the input file.
include Bio::Big::TranslationAdapter

ARGV.each do | fn |
  raise "File #{fn} does not exist" if !File.exist?(fn)
  nt = Bio::Big::FastaEmitter.new(fn)

  trn_table = Bio::Big::TranslationAdapter.translation_table(1)

  id = 0
  nt.emit_seq do | where, location, tag, seq |
    id += 1
    # p [where, location, tag, seq]
    predict = PredictORF.new(id,tag,seq,trn_table)
    # NOTE(review): the heuristic name comes straight from the command
    # line and is dispatched with send; consider validating it against the
    # list of supported predictors.
    orflist = predict.send(heuristic,minsize)
    orflist.each do | orf |
      print '>',orf.descr,"\n"
      print orf.nt.to_s,"\n"
      # p orf
    end
  end
end
|
114
|
+
|
115
|
+
|
116
|
+
|
117
|
+
|
118
|
+
|
data/bin/nt2aa.rb
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
#! /usr/bin/ruby
#
# Translate nucleotide sequences into amino acid sequences, either in
# the first reading frame (default) or in all six reading frames
# (--six-frame). One FASTA record is printed per input record and frame.
#
#
# (: pjotrp 2009, 2012 rblicense :)
#
# Copyright (C) 2012 Pjotr Prins <pjotr.prins@thebird.nl>

USAGE =<<EOM
ruby #{__FILE__} [--six-frame] inputfile(s)
EOM

$: << File.dirname(__FILE__)+'/../lib'

require 'bigbio'
require 'bigbio/adapters/translate'

frames = [1]
if ARGV[0] == '--six-frame'
  # Array#shift consumes the flag (Array has no shift! method).
  ARGV.shift
  frames = [-3,-2,-1,1,2,3]
end

# Check for input files only after the flag has been consumed, so that
# "--six-frame" without any file still prints the usage text.
if ARGV.size < 1
  print USAGE
  exit 1
end

ARGV.each do | fn |
  raise "File #{fn} does not exist" if !File.exist?(fn)
  nt = FastaReader.new(fn)
  trn_table = Bio::Big::TranslationAdapter.translation_table(1)

  nt.each { | rec |
    # Prepared once per record and reused for every frame.
    ajpseq = Bio::Big::TranslationAdapter.pre_translate(rec.seq,"Test sequence")

    frames.each do | frame |
      aa = Bio::Big::TranslationAdapter.translate(trn_table,frame, rec.seq, ajpseq)

      # ajpseqt = Biolib::Emboss.ajTrnSeqOrig(trnTable,ajpseq,frame)
      # aa      = Biolib::Emboss.ajSeqGetSeqCopyC(ajpseqt)
      print "> ",rec.descr," [",frame.to_s,"]\n"
      print aa,"\n"
    end
  }
end
|
52
|
+
|
53
|
+
|
54
|
+
|
55
|
+
|
56
|
+
|
data/bio-bigbio.gemspec
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = "bio-bigbio"
|
8
|
+
s.version = "0.1.1"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Pjotr Prins"]
|
12
|
+
s.date = "2012-01-30"
|
13
|
+
s.description = "Fasta reader, ORF emitter, sequence translation"
|
14
|
+
s.email = "pjotr.public01@thebird.nl"
|
15
|
+
s.executables = ["getorf", "nt2aa.rb"]
|
16
|
+
s.extra_rdoc_files = [
|
17
|
+
"LICENSE",
|
18
|
+
"README.rdoc"
|
19
|
+
]
|
20
|
+
s.files = [
|
21
|
+
"Gemfile",
|
22
|
+
"Gemfile.lock",
|
23
|
+
"LICENSE",
|
24
|
+
"README.rdoc",
|
25
|
+
"Rakefile",
|
26
|
+
"VERSION",
|
27
|
+
"bin/getorf",
|
28
|
+
"bin/nt2aa.rb",
|
29
|
+
"bio-bigbio.gemspec",
|
30
|
+
"doc/bigbio_getorf.wtex",
|
31
|
+
"lib/bigbio.rb",
|
32
|
+
"lib/bigbio/adapters/translate.rb",
|
33
|
+
"lib/bigbio/db/blast.rb",
|
34
|
+
"lib/bigbio/db/blast/blastclust.rb",
|
35
|
+
"lib/bigbio/db/emitters/fasta_emitter.rb",
|
36
|
+
"lib/bigbio/db/emitters/orf_emitter.rb",
|
37
|
+
"lib/bigbio/db/fasta.rb",
|
38
|
+
"lib/bigbio/db/fasta/fastaindex.rb",
|
39
|
+
"lib/bigbio/db/fasta/fastapairedreader.rb",
|
40
|
+
"lib/bigbio/db/fasta/fastapairedwriter.rb",
|
41
|
+
"lib/bigbio/db/fasta/fastareader.rb",
|
42
|
+
"lib/bigbio/db/fasta/fastarecord.rb",
|
43
|
+
"lib/bigbio/db/fasta/fastawriter.rb",
|
44
|
+
"lib/bigbio/db/fasta/indexer.rb",
|
45
|
+
"lib/bigbio/environment.rb",
|
46
|
+
"lib/bigbio/sequence/predictorf.rb",
|
47
|
+
"lib/bigbio/sequence/translate.rb",
|
48
|
+
"spec/emitter_spec.rb",
|
49
|
+
"spec/predictorf_spec.rb",
|
50
|
+
"test/data/EMBOSS/EGC.1",
|
51
|
+
"test/data/fasta/nt.fa",
|
52
|
+
"test/doctest/test_fasta.rb",
|
53
|
+
"test/doctest/test_frames.rb",
|
54
|
+
"test/doctest/test_getorf.rb",
|
55
|
+
"test/doctest/test_paired.rb",
|
56
|
+
"test/performance/translate_with_biolib.rb",
|
57
|
+
"test/performance/translate_with_bioruby.rb"
|
58
|
+
]
|
59
|
+
s.homepage = "http://github.com/pjotrp/bioruby-bigbioruby"
|
60
|
+
s.licenses = ["MIT"]
|
61
|
+
s.require_paths = ["lib"]
|
62
|
+
s.rubygems_version = "1.8.10"
|
63
|
+
s.summary = "Low memory sequence emitters"
|
64
|
+
s.test_files = [
|
65
|
+
"spec/emitter_spec.rb",
|
66
|
+
"spec/predictorf_spec.rb",
|
67
|
+
"test/doctest/test_fasta.rb",
|
68
|
+
"test/doctest/test_frames.rb",
|
69
|
+
"test/doctest/test_getorf.rb",
|
70
|
+
"test/doctest/test_paired.rb",
|
71
|
+
"test/performance/translate_with_biolib.rb",
|
72
|
+
"test/performance/translate_with_bioruby.rb"
|
73
|
+
]
|
74
|
+
|
75
|
+
if s.respond_to? :specification_version then
|
76
|
+
s.specification_version = 3
|
77
|
+
|
78
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
79
|
+
s.add_runtime_dependency(%q<bio>, [">= 1.4.1"])
|
80
|
+
s.add_runtime_dependency(%q<bio-logger>, [">= 0.9.0"])
|
81
|
+
s.add_development_dependency(%q<rspec>, ["~> 2.3.0"])
|
82
|
+
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
83
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
|
84
|
+
s.add_development_dependency(%q<rcov>, [">= 0"])
|
85
|
+
else
|
86
|
+
s.add_dependency(%q<bio>, [">= 1.4.1"])
|
87
|
+
s.add_dependency(%q<bio-logger>, [">= 0.9.0"])
|
88
|
+
s.add_dependency(%q<rspec>, ["~> 2.3.0"])
|
89
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
90
|
+
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
91
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
92
|
+
end
|
93
|
+
else
|
94
|
+
s.add_dependency(%q<bio>, [">= 1.4.1"])
|
95
|
+
s.add_dependency(%q<bio-logger>, [">= 0.9.0"])
|
96
|
+
s.add_dependency(%q<rspec>, ["~> 2.3.0"])
|
97
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
98
|
+
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
99
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
@@ -0,0 +1,14 @@
|
|
1
|
+
|
2
|
+
== ORF Prediction ==
|
3
|
+
|
4
|
+
The ORF 'predictor' is of the simple kind. It detects ORFs by
|
5
|
+
identifying start and stop signals in the same frame.
|
6
|
+
|
7
|
+
The main feature is that it does not consume real memory through the
|
8
|
+
use of a Sequence 'emitter', which scans a large input sequence and
|
9
|
+
yields open reading frames from STOP to STOP codon, with adjoining
|
10
|
+
nucleotides. This allows scanning the sequence in a *single* pass.
|
11
|
+
|
12
|
+
The input file may be FASTA, as long as the reader yields chunks of
|
13
|
+
sequence data.
|
14
|
+
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# TranslationAdapter will translate using EMBOSS, or BioRuby
# when the first is not available

module Bio
  module Big
    module TranslationAdapter

      # Frame numbers accepted by +translate+.
      VALID_FRAME_VALUES = [ 0, -1, -2, -3, 1, 2, 3 ].freeze

      # Fetch translation table +num+ through Biolib::Emboss.ajTrnNewI.
      #
      # Returns nil when Environment reports that biolib is not available.
      def self.translation_table num
        if Environment.instance.biolib
          Biolib::Emboss.ajTrnNewI(num)
        end
      end

      # Precompile sequence for EMBOSS
      #
      # Builds the sequence object from +seq+ named +label+ through
      # Biolib::Emboss.ajSeqNewNameC. Returns nil when biolib is not
      # available.
      def self.pre_translate seq,label
        if Environment.instance.biolib
          # Pass the caller's label through instead of a fixed name.
          Biolib::Emboss.ajSeqNewNameC(seq,label)
        else
          nil
        end
      end

      # Translate using frame (pre_seq is only used for EMBOSS)
      #
      # Valid frame values are 0,1,2,3 and -1,-2,-3, where 0 and 1 are the
      # standard reading frame. The negative values translate the reverse
      # complement of the strand.
      #
      # Raises a RuntimeError ("Illegal frame ...") for any other frame.
      # Returns the translated sequence.
      def self.translate trn_table, frame, seq, pre_seq = nil
        raise "Illegal frame #{frame}" unless VALID_FRAME_VALUES.include?(frame)
        frame = 1 if frame == 0
        if Environment.instance.biolib
          # Using EMBOSS for translation
          ajpseq = pre_seq
          if not pre_seq
            ajpseq = Biolib::Emboss.ajSeqNewNameC(seq,"Test sequence")
          end
          ajpseqt = Biolib::Emboss.ajTrnSeqOrig(trn_table,ajpseq,frame)
          Biolib::Emboss.ajSeqGetSeqCopyC(ajpseqt)
        else
          # Using BioRuby for translation
          ntseq =
            if frame > 0
              # Skip the first frame-1 nucleotides.
              Bio::Sequence::NA.new(seq[frame-1..-1])
            else
              # This to match EMBOSS frames: -2 drops the last two
              # nucleotides, -3 drops the last one, -1 keeps them all,
              # before taking the reverse complement.
              rframe =
                case frame
                when -2
                  -3
                when -3
                  -2
                else
                  -1
                end
              Bio::Sequence::NA.new(seq[0..rframe]).reverse_complement
            end
          # pp ntseq
          ntseq.translate.to_s
        end
      end
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
|
2
|
+
module Bio
  module Big
    # Reads a FASTA file and hands its sequences to a block in pieces of
    # at most +max_size+ characters.
    class FastaEmitter

      # fn::       path of the FASTA file
      # max_size:: maximum number of sequence characters per yielded piece
      #
      # Raises ArgumentError when max_size is not positive (a value of
      # zero or less would make the chunking loop spin forever).
      def initialize fn, max_size = 100000
        raise ArgumentError, "max_size must be positive" unless max_size > 0
        @fn = fn
        @max_size = max_size
      end

      # Yield sequence information in sections of a maximum
      # size - usually iterators load the full sequence, but
      # without penalty it is possible to use a lot less
      # memory.
      #
      # Yields four values:
      #   where::  :mid for a full piece with more of the same record to
      #            follow, :tail for the last piece of a record
      #   index::  zero-based number of the record in the file
      #   tag::    header line without the leading '>'
      #   seq::    the piece of sequence (may be empty for :tail)
      #
      # An empty file yields nothing; a record with a header but no
      # sequence yields a single :tail with an empty string. Raises a
      # RuntimeError ("Tag error ...") when the first line is not a FASTA
      # header. Returns an Enumerator when called without a block.
      def emit_seq
        return to_enum(:emit_seq) unless block_given?
        # The block form closes the file, also when the consumer raises.
        File.open(@fn) do |f|
          header = f.gets
          return if header.nil?   # empty file
          tag = tag_digest(header.strip)
          seq = ""
          index = 0
          f.each_line do |raw|
            line = raw.strip
            if line.start_with?('>')
              # A new header ends the current record.
              yield :tail,index,tag,seq
              tag = tag_digest(line)
              seq = ""
              index += 1
            else
              # seq never grows far beyond @max_size, see below
              seq += line
            end
            # Hand out full pieces as soon as they are available.
            while seq.size > @max_size
              yield :mid,index,tag,seq[0..@max_size-1]
              seq = seq[@max_size..-1]
            end
          end
          yield :tail,index,tag,seq
        end
      end

      # Strip the leading '>' from a FASTA header line.
      #
      # Raises a RuntimeError when +tag+ does not start with '>'.
      def tag_digest tag
        if tag[0..0] == '>'
          tag[1..-1]
        else
          raise "Tag error in '#{tag}'"
        end
      end
    end

  end # Big
end # Bio
|