RubyGems - bio-bigbio - Versions diffs - 0.1.1 - Mend

bio-bigbio 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

data/Gemfile +15 -0
data/Gemfile.lock +34 -0
data/LICENSE +34 -0
data/README.rdoc +28 -0
data/Rakefile +50 -0
data/VERSION +1 -0
data/bin/getorf +118 -0
data/bin/nt2aa.rb +56 -0
data/bio-bigbio.gemspec +102 -0
data/doc/bigbio_getorf.wtex +14 -0
data/lib/bigbio/adapters/translate.rb +64 -0
data/lib/bigbio/db/blast/blastclust.rb +16 -0
data/lib/bigbio/db/blast.rb +2 -0
data/lib/bigbio/db/emitters/fasta_emitter.rb +48 -0
data/lib/bigbio/db/emitters/orf_emitter.rb +289 -0
data/lib/bigbio/db/fasta/fastaindex.rb +3 -0
data/lib/bigbio/db/fasta/fastapairedreader.rb +19 -0
data/lib/bigbio/db/fasta/fastapairedwriter.rb +21 -0
data/lib/bigbio/db/fasta/fastareader.rb +132 -0
data/lib/bigbio/db/fasta/fastarecord.rb +39 -0
data/lib/bigbio/db/fasta/fastawriter.rb +20 -0
data/lib/bigbio/db/fasta/indexer.rb +33 -0
data/lib/bigbio/db/fasta.rb +13 -0
data/lib/bigbio/environment.rb +12 -0
data/lib/bigbio/sequence/predictorf.rb +140 -0
data/lib/bigbio/sequence/translate.rb +52 -0
data/lib/bigbio.rb +38 -0
data/spec/emitter_spec.rb +265 -0
data/spec/predictorf_spec.rb +199 -0
data/test/data/EMBOSS/EGC.1 +32 -0
data/test/data/fasta/nt.fa +1000 -0
data/test/doctest/test_fasta.rb +112 -0
data/test/doctest/test_frames.rb +76 -0
data/test/doctest/test_getorf.rb +154 -0
data/test/doctest/test_paired.rb +55 -0
data/test/performance/translate_with_biolib.rb +67 -0
data/test/performance/translate_with_bioruby.rb +64 -0
metadata +163 -0

data/Gemfile ADDED Viewed

@@ -0,0 +1,15 @@
+# Fetch gems over HTTPS so downloads cannot be tampered with in transit.
+source "https://rubygems.org"
+# Add dependencies required to use your gem here.
+# Example:
+#   gem "activesupport", ">= 2.3.5"
+gem "bio", ">= 1.4.1"
+gem "bio-logger", ">= 0.9.0"
+# Add dependencies to develop your gem here.
+# Include everything needed to run rake, tests, features, etc.
+group :development do
+  gem "rspec", "~> 2.3.0"
+  gem "bundler", "~> 1.0.0"
+  gem "jeweler", "~> 1.5.2"
+  gem "rcov", ">= 0"
+end

data/Gemfile.lock ADDED Viewed

@@ -0,0 +1,34 @@
+GEM
+  remote: http://rubygems.org/
+  specs:
+    bio (1.4.1)
+    bio-logger (0.9.0)
+      log4r (>= 1.1.9)
+    diff-lcs (1.1.2)
+    git (1.2.5)
+    jeweler (1.5.2)
+      bundler (~> 1.0.0)
+      git (>= 1.2.5)
+      rake
+    log4r (1.1.9)
+    rake (0.8.7)
+    rcov (0.9.9)
+    rspec (2.3.0)
+      rspec-core (~> 2.3.0)
+      rspec-expectations (~> 2.3.0)
+      rspec-mocks (~> 2.3.0)
+    rspec-core (2.3.1)
+    rspec-expectations (2.3.0)
+      diff-lcs (~> 1.1.2)
+    rspec-mocks (2.3.0)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  bio (>= 1.4.1)
+  bio-logger (>= 0.9.0)
+  bundler (~> 1.0.0)
+  jeweler (~> 1.5.2)
+  rcov
+  rspec (~> 2.3.0)

data/LICENSE ADDED Viewed

@@ -0,0 +1,34 @@
+If a license is not specified the code contributed to BioBig defaults to the
+BSD license:
+Copyright (c) 2008, 2009 The BioLib Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimer in the documentation
+      and/or other materials provided with the distribution.
+    * Neither the name of The BioLib Project nor the names of
+      its contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+For more information on opensource software licenses see
+http://www.opensource.org/licenses/bsd-license.php,
+http://www.gnu.org/licenses/gpl.html and http://www.fsf.org/.

data/README.rdoc ADDED Viewed

@@ -0,0 +1,28 @@
+= BIGBIO
+BigBio = BIG DATA for Ruby
+BigBio is an initiative to create high-performance libraries for big data
+computing in biology.
+BigBio may use BioLib C/C++/D functions for increasing performance and
+reducing memory consumption.
+This is an experimental project. If you wish to contribute subscribe
+to the BioRuby and/or BioLib mailing lists.
+== Functionality
+* BigBio can translate nucleotide sequences to amino acid
+  sequences using an EMBOSS C function, or BioRuby's translator.
+* BigBio has an ORF emitter which parses DNA/RNA sequences and emits
+  ORFs between START_STOP or STOP_STOP codons.
+* BigBio has a FASTA file emitter, which iterates over FASTA files and
+  emits sequences without loading everything into memory.
+== Copyright
+Copyright (c) 2011-2012 Pjotr Prins. See LICENSE for further details.

data/Rakefile ADDED Viewed

@@ -0,0 +1,50 @@
+require 'rubygems'
+require 'bundler'
+# Activate the bundled gems first; jeweler and rspec are loaded from them below.
+begin
+  Bundler.setup(:default, :development)
+rescue Bundler::BundlerError => e
+  $stderr.puts e.message
+  $stderr.puts "Run `bundle install` to install missing gems"
+  exit e.status_code
+end
+require 'rake'
+require 'jeweler'
+# Gem packaging tasks. The gem specification is described here.
+Jeweler::Tasks.new do |gem|
+  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
+  gem.name = "bio-bigbio"
+  gem.homepage = "http://github.com/pjotrp/bioruby-bigbioruby"
+  # NOTE(review): the LICENSE file shipped with the gem contains BSD terms - confirm "MIT" is intended.
+  gem.license = "MIT"
+  gem.summary = %Q{Low memory sequence emitters}
+  gem.description = %Q{Fasta reader, ORF emitter, sequence translation}
+  gem.email = "pjotr.public01@thebird.nl"
+  gem.authors = ["Pjotr Prins"]
+  # Include your dependencies below. Runtime dependencies are required when using your gem,
+  # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
+  #  gem.add_runtime_dependency 'jabber4r', '> 0.1'
+  #  gem.add_development_dependency 'rspec', '> 1.2.3'
+end
+Jeweler::RubygemsDotOrgTasks.new
+require 'rspec/core'
+require 'rspec/core/rake_task'
+# `rake spec` runs every *_spec.rb file below spec/.
+RSpec::Core::RakeTask.new(:spec) do |spec|
+  spec.pattern = FileList['spec/**/*_spec.rb']
+end
+# `rake rcov` runs the same specs with rcov switched on.
+RSpec::Core::RakeTask.new(:rcov) do |spec|
+  spec.pattern = 'spec/**/*_spec.rb'
+  spec.rcov = true
+end
+task :default => :spec
+require 'rake/rdoctask'
+Rake::RDocTask.new do |rdoc|
+  # Strip surrounding whitespace so the file's trailing newline does not end
+  # up inside the generated title.
+  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = "bio-bigbio #{version}"
+  rdoc.rdoc_files.include('README*')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+end

data/VERSION ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0.1.1

data/bin/getorf ADDED Viewed

@@ -0,0 +1,118 @@
+#! /usr/bin/ruby
+#
+# Predict ORFs from nucleotide sequences using the BigBio predictors.
+# The input is one or more FASTA files. For every predicted ORF a FASTA
+# record (description line followed by orf.nt) is printed to standard output.
+#
+# You can choose the heuristic on the command line (default stopstop).
+#
+# Author:: Pjotr Prins
+# Copyright:: 2009-2011
+# License:: Ruby License
+#
+# Copyright (C) 2009-2011 Pjotr Prins <pjotr.prins@thebird.nl>
+rootpath = File.dirname(File.dirname(__FILE__))
+$: << File.join(rootpath,'lib')
+# File.read closes the file again, File.new(...).read leaves the handle open.
+BIGBIO_VERSION = File.read(File.join(rootpath,'VERSION')).chomp
+USAGE =<<EOM
+  ruby #{__FILE__} [-h stopstop] [--min-size 30] inputfile(s)
+  Use --help for more options
+EOM
+# require 'biolib/emboss'
+require 'bigbio'
+require 'optparse'
+$stderr.print "getorf BioRuby BigBio Plugin "+BIGBIO_VERSION+" Copyright (C) 2009-2011 Pjotr Prins <pjotr.prins@thebird.nl>\n\n"
+Bio::Log::CLI.logger('stderr')
+Bio::Log::CLI.trace('info')
+# Defaults, overridden by the command line switches below.
+heuristic = 'stopstop'
+minsize   = 30
+# NOTE(review): --longest is parsed, but longest_match is not used further down.
+longest_match = false
+opts = OptionParser.new() { |parser|
+  parser.on_tail("-?", "--help", "Print this message") {
+    print(USAGE)
+    print(parser)
+    exit()
+  }
+  parser.on("-h heuristic", String, "Heuristic (stopstop)") do | s |
+    heuristic = s
+  end
+  parser.on("-s size", "--min-size", Integer, "Minimal sequence size") do | n |
+    minsize = n
+  end
+  parser.on("--longest", "Only get longest ORF match") do
+    longest_match = true
+  end
+  parser.on("--logger filename",String,"Log to file (default stderr)") do | name |
+    Bio::Log::CLI.logger(name)
+  end
+  parser.on("--trace options",String,"Set log level (default INFO, see bio-logger)") do | s |
+    Bio::Log::CLI.trace(s)
+  end
+  parser.on("-q", "--quiet", "Run quietly") do
+    Bio::Log::CLI.trace('error')
+  end
+  parser.on("-v", "--verbose", "Run verbosely") do
+    Bio::Log::CLI.trace('info')
+  end
+  # Only the log level changes here; the former assignment to an undefined
+  # `options` object raised NameError as soon as --debug was given.
+  parser.on("--debug", "Show debug messages") do
+    Bio::Log::CLI.trace('debug')
+  end
+}
+opts.parse!(ARGV)
+# Whatever is left after option parsing is the list of input files.
+if ARGV.size == 0
+  print USAGE
+  exit 1
+end
+Bio::Log::CLI.configure('bigbio')
+# print "Heuristic is #{heuristic}\n"
+# print "Minsize #{minsize}\n"
+ARGV.each do | fn |
+  raise "File #{fn} does not exist" unless File.exist?(fn)
+  nt = Bio::Big::FastaEmitter.new(fn)
+  include Bio::Big::TranslationAdapter
+  trn_table = Bio::Big::TranslationAdapter.translation_table(1)
+  id = 0
+  nt.emit_seq do | where, location, tag, seq |
+    id += 1
+    # p [where, location, tag, seq]
+    predict = PredictORF.new(id,tag,seq,trn_table)
+    # The heuristic name comes straight from the command line: only dispatch
+    # to methods the predictor exposes publicly (respond_to? ignores private
+    # methods such as Kernel#system).
+    raise "Unknown heuristic #{heuristic}" unless predict.respond_to?(heuristic)
+    orflist = predict.send(heuristic,minsize)
+    orflist.each do | orf |
+      print '>',orf.descr,"\n"
+      print orf.nt.to_s,"\n"
+      # p orf
+    end
+  end
+end

data/bin/nt2aa.rb ADDED Viewed

@@ -0,0 +1,56 @@
+#! /usr/bin/ruby
+#
+# Translate nucleotide sequences into amino acid sequences, either in
+# frame 1 only (default) or in all six reading frames (--six-frame).
+# One FASTA-style record is printed per input record and frame.
+#
+#
+# (: pjotrp 2009, 2012 rblicense :)
+#
+# Copyright (C) 2012 Pjotr Prins <pjotr.prins@thebird.nl>
+USAGE =<<EOM
+  ruby #{__FILE__} [--six-frame] inputfile(s)
+EOM
+$: << File.dirname(__FILE__)+'/../lib'
+require 'bigbio'
+frames = [1]
+if ARGV[0] == '--six-frame'
+  # Array#shift is already destructive; Array has no shift! method, so the
+  # former call raised NoMethodError whenever the switch was used.
+  ARGV.shift
+  frames = [-3,-2,-1,1,2,3]
+end
+# Checked after the switch has been consumed, so that "--six-frame" without
+# any input file also shows the usage text.
+if ARGV.empty?
+  print USAGE
+  exit 1
+end
+require 'bigbio/adapters/translate'
+ARGV.each do | fn |
+  raise "File #{fn} does not exist" unless File.exist?(fn)
+  nt = FastaReader.new(fn)
+  trn_table = Bio::Big::TranslationAdapter.translation_table(1)
+  nt.each do | rec |
+    # Prepared once per record and handed to every translate call below.
+    ajpseq = Bio::Big::TranslationAdapter.pre_translate(rec.seq,"Test sequence")
+    frames.each do | frame |
+      aa = Bio::Big::TranslationAdapter.translate(trn_table,frame, rec.seq, ajpseq)
+      print "> ",rec.descr," [",frame.to_s,"]\n"
+      print aa,"\n"
+    end
+  end
+end

data/bio-bigbio.gemspec ADDED Viewed

@@ -0,0 +1,102 @@
+# Generated by jeweler
+# DO NOT EDIT THIS FILE DIRECTLY
+# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
+# -*- encoding: utf-8 -*-
+Gem::Specification.new do |s|
+  s.name = "bio-bigbio"
+  s.version = "0.1.1"
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.authors = ["Pjotr Prins"]
+  s.date = "2012-01-30"
+  s.description = "Fasta reader, ORF emitter, sequence translation"
+  s.email = "pjotr.public01@thebird.nl"
+  # Command line tools from bin/ that are installed with the gem.
+  s.executables = ["getorf", "nt2aa.rb"]
+  s.extra_rdoc_files = [
+    "LICENSE",
+    "README.rdoc"
+  ]
+  # Complete list of files packaged into the gem.
+  s.files = [
+    "Gemfile",
+    "Gemfile.lock",
+    "LICENSE",
+    "README.rdoc",
+    "Rakefile",
+    "VERSION",
+    "bin/getorf",
+    "bin/nt2aa.rb",
+    "bio-bigbio.gemspec",
+    "doc/bigbio_getorf.wtex",
+    "lib/bigbio.rb",
+    "lib/bigbio/adapters/translate.rb",
+    "lib/bigbio/db/blast.rb",
+    "lib/bigbio/db/blast/blastclust.rb",
+    "lib/bigbio/db/emitters/fasta_emitter.rb",
+    "lib/bigbio/db/emitters/orf_emitter.rb",
+    "lib/bigbio/db/fasta.rb",
+    "lib/bigbio/db/fasta/fastaindex.rb",
+    "lib/bigbio/db/fasta/fastapairedreader.rb",
+    "lib/bigbio/db/fasta/fastapairedwriter.rb",
+    "lib/bigbio/db/fasta/fastareader.rb",
+    "lib/bigbio/db/fasta/fastarecord.rb",
+    "lib/bigbio/db/fasta/fastawriter.rb",
+    "lib/bigbio/db/fasta/indexer.rb",
+    "lib/bigbio/environment.rb",
+    "lib/bigbio/sequence/predictorf.rb",
+    "lib/bigbio/sequence/translate.rb",
+    "spec/emitter_spec.rb",
+    "spec/predictorf_spec.rb",
+    "test/data/EMBOSS/EGC.1",
+    "test/data/fasta/nt.fa",
+    "test/doctest/test_fasta.rb",
+    "test/doctest/test_frames.rb",
+    "test/doctest/test_getorf.rb",
+    "test/doctest/test_paired.rb",
+    "test/performance/translate_with_biolib.rb",
+    "test/performance/translate_with_bioruby.rb"
+  ]
+  s.homepage = "http://github.com/pjotrp/bioruby-bigbioruby"
+  s.licenses = ["MIT"]
+  s.require_paths = ["lib"]
+  s.rubygems_version = "1.8.10"
+  s.summary = "Low memory sequence emitters"
+  s.test_files = [
+    "spec/emitter_spec.rb",
+    "spec/predictorf_spec.rb",
+    "test/doctest/test_fasta.rb",
+    "test/doctest/test_frames.rb",
+    "test/doctest/test_getorf.rb",
+    "test/doctest/test_paired.rb",
+    "test/performance/translate_with_biolib.rb",
+    "test/performance/translate_with_bioruby.rb"
+  ]
+  # Dependencies are declared three times with the same names and version
+  # constraints. Which branch runs depends on what the running RubyGems
+  # offers: with specification_version support and RubyGems >= 1.2.0 they are
+  # split into runtime and development dependencies, otherwise all of them
+  # are registered through the plain add_dependency call.
+  if s.respond_to? :specification_version then
+    s.specification_version = 3
+    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
+      s.add_runtime_dependency(%q<bio>, [">= 1.4.1"])
+      s.add_runtime_dependency(%q<bio-logger>, [">= 0.9.0"])
+      s.add_development_dependency(%q<rspec>, ["~> 2.3.0"])
+      s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
+      s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
+      s.add_development_dependency(%q<rcov>, [">= 0"])
+    else
+      s.add_dependency(%q<bio>, [">= 1.4.1"])
+      s.add_dependency(%q<bio-logger>, [">= 0.9.0"])
+      s.add_dependency(%q<rspec>, ["~> 2.3.0"])
+      s.add_dependency(%q<bundler>, ["~> 1.0.0"])
+      s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
+      s.add_dependency(%q<rcov>, [">= 0"])
+    end
+  else
+    s.add_dependency(%q<bio>, [">= 1.4.1"])
+    s.add_dependency(%q<bio-logger>, [">= 0.9.0"])
+    s.add_dependency(%q<rspec>, ["~> 2.3.0"])
+    s.add_dependency(%q<bundler>, ["~> 1.0.0"])
+    s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
+    s.add_dependency(%q<rcov>, [">= 0"])
+  end
+end

data/doc/bigbio_getorf.wtex ADDED Viewed

@@ -0,0 +1,14 @@
+== ORF Prediction ==
+The ORF 'predictor' is of the simple kind. It detects ORFs by
+identifying start and stop signals in the same frame.
+The main feature is that it needs very little memory: it uses a
+sequence 'emitter', which scans a large input sequence and
+yields open reading frames from STOP to STOP codon, with adjoining
+nucleotides. This allows scanning the sequence in a *single* pass.
+The input file may be FASTA, as long as the reader yields chunks of
+sequence data.

data/lib/bigbio/adapters/translate.rb ADDED Viewed

@@ -0,0 +1,64 @@
+# TranslationAdapter will translate using EMBOSS, or BioRuby
+# when the first is not available
+module Bio
+  module Big
+    module TranslationAdapter
+      VALID_FRAME_VALUES = [ 0, -1, -2, -3, 1, 2, 3 ]
+      # Returns the EMBOSS translation table number +num+ when biolib is
+      # available, nil otherwise.
+      def self.translation_table num
+        Biolib::Emboss.ajTrnNewI(num) if Environment.instance.biolib
+      end
+      # Precompile sequence for EMBOSS
+      #
+      # +label+ is used as the name of the EMBOSS sequence object (it used to
+      # be ignored in favour of a hard-coded "Test sequence"). Returns nil
+      # when biolib is not available.
+      def self.pre_translate seq,label
+        Biolib::Emboss.ajSeqNewNameC(seq,label) if Environment.instance.biolib
+      end
+      # Translate using frame (pre_seq is only used for EMBOSS)
+      #
+      # Valid frame values are 0,1,2,3 and -1,-2,-3, where 0 and 1 are the
+      # standard reading frame. The negative values translate the reverse
+      # complement of the strand.
+      #
+      # Returns the translated sequence; raises RuntimeError for any other
+      # frame value.
+      def self.translate trn_table, frame, seq, pre_seq = nil
+        raise "Illegal frame #{frame}" unless VALID_FRAME_VALUES.include?(frame)
+        frame = 1 if frame == 0
+        if Environment.instance.biolib
+          # Using EMBOSS for translation; build the EMBOSS sequence here when
+          # the caller did not precompile one with pre_translate.
+          ajpseq = pre_seq || Biolib::Emboss.ajSeqNewNameC(seq,"Test sequence")
+          ajpseqt  = Biolib::Emboss.ajTrnSeqOrig(trn_table,ajpseq,frame)
+          Biolib::Emboss.ajSeqGetSeqCopyC(ajpseqt)
+        else
+          # Using BioRuby for translation
+          ntseq = if frame > 0
+            # Forward frames: drop the first frame-1 nucleotides.
+            Bio::Sequence::NA.new(seq[frame-1..-1])
+          else
+            # This to match EMBOSS frames: rframe is the index of the last
+            # nucleotide kept before taking the reverse complement.
+            rframe =
+              case frame
+                when -2
+                  -3
+                when -3
+                  -2
+                else
+                  -1
+              end
+            Bio::Sequence::NA.new(seq[0..rframe]).reverse_complement
+          end
+          # pp ntseq
+          ntseq.translate.to_s
+        end
+      end
+    end
+  end
+end

data/lib/bigbio/db/blast/blastclust.rb ADDED Viewed

@@ -0,0 +1,16 @@
+# Parse BLAST cluster file
+#
+# Every line of the file is split on whitespace and handed out as an array
+# of fields. Enumerable is mixed in, so map, to_a etc. work on top of #each.
+class BlastClust
+  include Enumerable
+  def initialize fn
+    @fn = fn
+  end
+  # Yields one array of whitespace-separated fields per line. File.foreach
+  # closes the file when iteration ends, also when the block breaks or
+  # raises. Returns an Enumerator when called without a block.
+  def each
+    return enum_for(:each) unless block_given?
+    File.foreach(@fn) do | line |
+      yield line.split
+    end
+  end
+end

data/lib/bigbio/db/blast.rb ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ require 'bigbio/db/blast/blastclust'
2	+

data/lib/bigbio/db/emitters/fasta_emitter.rb ADDED Viewed

@@ -0,0 +1,48 @@
+module Bio
+  module Big
+    # Reads a FASTA file and hands out the sequences in chunks of at most
+    # max_size characters, so that a long sequence never has to be held in
+    # memory as a whole.
+    class FastaEmitter
+      # fn is the path of the FASTA file, max_size the largest chunk that is
+      # yielded. Raises ArgumentError when max_size is not positive (the
+      # chunking loop would never terminate otherwise).
+      def initialize fn, max_size = 100000
+        raise ArgumentError, "max_size must be positive" unless max_size > 0
+        @fn = fn
+        @max_size = max_size
+      end
+      # Yield sequence information in sections of a maximum
+      # size - usually iterators load the full sequence, but
+      # without penalty it is possible to use a lot less
+      # memory.
+      #
+      # Yields four values: :mid or :tail, the zero-based record index, the
+      # tag (header line without '>') and the sequence chunk. :mid chunks are
+      # exactly max_size long; every record ends with one :tail chunk holding
+      # the remainder. An empty file yields nothing, a record without
+      # sequence lines yields an empty :tail. Raises RuntimeError (through
+      # tag_digest) when the first line is not a '>' header.
+      def emit_seq
+        # The block form closes the file, also when the consumer breaks out
+        # or raises.
+        File.open(@fn) do | f |
+          header = f.gets
+          return if header.nil?
+          tag = tag_digest(header.strip)
+          seq = ""
+          index = 0
+          f.each_line do | raw |
+            line = raw.strip
+            if line =~ /^>/
+              # A new header closes the previous record
+              yield :tail,index,tag,seq
+              tag = tag_digest(line)
+              seq = ""
+              index += 1
+            else
+              seq << line
+            end
+            # Hand out full-size chunks as soon as enough data is buffered
+            while seq.size > @max_size
+              yield :mid,index,tag,seq[0..@max_size-1]
+              seq = seq[@max_size..-1]
+            end
+          end
+          yield :tail,index,tag,seq
+        end
+      end
+      # Strips the leading '>' from a header line; raises RuntimeError when
+      # the line does not start with '>'.
+      def tag_digest tag
+        if tag[0..0] == '>'
+          tag[1..-1]
+        else
+          raise "Tag error in '#{tag}'"
+        end
+      end
+    end
+  end # Big
+end # Bio