RubyGems - bio-bigbio - Versions diffs - 0.1.1 - Mend

bio-bigbio 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

data/Gemfile +15 -0
data/Gemfile.lock +34 -0
data/LICENSE +34 -0
data/README.rdoc +28 -0
data/Rakefile +50 -0
data/VERSION +1 -0
data/bin/getorf +118 -0
data/bin/nt2aa.rb +56 -0
data/bio-bigbio.gemspec +102 -0
data/doc/bigbio_getorf.wtex +14 -0
data/lib/bigbio/adapters/translate.rb +64 -0
data/lib/bigbio/db/blast/blastclust.rb +16 -0
data/lib/bigbio/db/blast.rb +2 -0
data/lib/bigbio/db/emitters/fasta_emitter.rb +48 -0
data/lib/bigbio/db/emitters/orf_emitter.rb +289 -0
data/lib/bigbio/db/fasta/fastaindex.rb +3 -0
data/lib/bigbio/db/fasta/fastapairedreader.rb +19 -0
data/lib/bigbio/db/fasta/fastapairedwriter.rb +21 -0
data/lib/bigbio/db/fasta/fastareader.rb +132 -0
data/lib/bigbio/db/fasta/fastarecord.rb +39 -0
data/lib/bigbio/db/fasta/fastawriter.rb +20 -0
data/lib/bigbio/db/fasta/indexer.rb +33 -0
data/lib/bigbio/db/fasta.rb +13 -0
data/lib/bigbio/environment.rb +12 -0
data/lib/bigbio/sequence/predictorf.rb +140 -0
data/lib/bigbio/sequence/translate.rb +52 -0
data/lib/bigbio.rb +38 -0
data/spec/emitter_spec.rb +265 -0
data/spec/predictorf_spec.rb +199 -0
data/test/data/EMBOSS/EGC.1 +32 -0
data/test/data/fasta/nt.fa +1000 -0
data/test/doctest/test_fasta.rb +112 -0
data/test/doctest/test_frames.rb +76 -0
data/test/doctest/test_getorf.rb +154 -0
data/test/doctest/test_paired.rb +55 -0
data/test/performance/translate_with_biolib.rb +67 -0
data/test/performance/translate_with_bioruby.rb +64 -0
metadata +163 -0

data/lib/bigbio/db/emitters/orf_emitter.rb ADDED Viewed

@@ -0,0 +1,289 @@
+require 'set'
+module Bio
+  module Big
+    module FrameCodonHelpers
+      STOP_CODONS = Set.new(%w{TAG TAA TGA UAG UAA UGA})
+      START_CODONS = Set.new(%w{ATG AUG})
+  # Mixin that records where an ORF sits in the parent sequence (in nucleotides).
+      module TrackSequenceTrait
+        attr_accessor :track_ntseq_pos  # nucleotide position, assigned by the update_* helpers below
+        def TrackSequenceTrait.update_sequence_pos orfs, ntseq_pos  # stamps every ORF in +orfs+ and returns +orfs+
+          orfs.each { | orf | orf.track_ntseq_pos = ntseq_pos + orf.pos*3 }  # orf.pos is in codons, so *3 gives nucleotides
+          orfs
+        end
+        def TrackSequenceTrait.update_reversed_sequence_pos orfs, ntseq_pos  # reversed-frame counterpart; also returns +orfs+
+          # currently the same arithmetic as update_sequence_pos
+          orfs.each { | orf | orf.track_ntseq_pos = ntseq_pos + orf.pos*3 }
+          orfs
+        end
+      end
+      # Module-level functions that build the next short frame from the current
+      # one plus newly read sequence, moving forward (right) or backward (left).
+      module CreateShortFrame
+        def CreateShortFrame.create_right fr,orfs,rseq  # fr: current frame state, orfs: ORFs found in it, rseq: sequence appended on the right
+          seq = fr.seq
+          ntseq_pos = fr.ntseq_pos
+          remove = if orfs.size > 0  # nucleotides to drop from the left of seq
+            orfs.last.rpos*3  # rpos is in codons: everything up to the end of the last ORF
+          else
+            0
+          end
+          ntseq_pos += remove  # the new frame starts +remove+ nucleotides further along
+          nseq = seq[remove..-1] + rseq
+          ShortFrameState.new nseq,ntseq_pos,fr.min_size_codons*3  # min size is handed back in nucleotides
+        end
+        def CreateShortFrame.create_left fr,orfs,nseq  # fr: current frame state, orfs: ORFs found in it, nseq: sequence prepended on the left
+          # Reversed (real locations on contig):
+          #
+          # |  3                21  B |
+          # ttaaatgtaatttaggtaaatttat atgtaaattaggta (reversed)
+          # ...^--============xxx^=======xxx
+          #       ^                     ^
+          # Actual feed:
+          #
+          # s2=              s1=
+          # "atggattaaatgta" "tatttaaatggatttaatgtaaatt"
+          #  ......xxx=====   ~===xx^============--^...
+          #  0  1  2  3        0  1  2  3
+          seq1 = fr.seq             # original sequence
+          len1 = seq1.size
+          ntseq_pos1 = fr.ntseq_pos # right side of seq (|)
+          bridge = len1 % 3    # chomp left side (B)
+          remove = if orfs.size > 0
+            len1 - bridge - (orfs.first.pos)*3 + 1  # orfs.first.pos is in codons
+          else
+            0  # NOTE(review): with no ORFs ntseq_pos2 below becomes ntseq_pos1-1 and all of seq1 is kept - confirm the -1 is intended
+          end
+          ntseq_pos2 = ntseq_pos1+remove-1  # pos against main contig
+          seq2 = nseq + seq1[0..(len1-remove)]  # new sequence on the left, followed by the retained head of seq1
+          ShortReversedFrameState.new seq2,ntseq_pos2,fr.min_size_codons*3  # min size is handed back in nucleotides
+        end
+      end
+      class FrameCodonSequence  # a list of codons plus its codon offset (+pos+) in the short parent sequence
+        include Enumerable
+        include TrackSequenceTrait
+        attr_reader :pos     # codon position in short parent sequence
+        attr_reader :codons  # the codon list itself
+        def initialize seq, pos=0  # seq: a String of nucleotides, or an already split codon list
+          if seq.kind_of?(String)
+            @codons = seq.upcase.scan(/(\w\w\w)/).flatten  # upcase and cut into triplets; leftover characters that do not fill a triplet are not captured
+          else
+            @codons = seq  # stored as passed (not copied)
+          end
+          @pos = pos
+        end
+        def size  # number of codons
+          @codons.size
+        end
+        def rpos  # codon position just past the last codon
+          pos + size
+        end
+        def [] index  # codon at +index+
+          @codons[index]
+        end
+        def shift  # returns a new FrameCodonSequence without the first codon, positioned at pos+1
+          list = @codons  # NOTE(review): aliases @codons, so the shift below also removes the codon from the receiver - confirm intended
+          list.shift
+          FrameCodonSequence.new(list,@pos+1)
+        end
+        def to_seq  # codons joined back into one String
+          @codons.join
+        end
+        def each  # yields each codon in order (backs Enumerable)
+          @codons.each { | c| yield c }
+        end
+      end
+    end # FrameCodonHelpers
+    # The short frame uses the simplest concept to find ORFs. The sequence is
+    # immutable, always forward and in frame 0. That makes it easy to reason about.
+    # It also returns all ORFs in one go, with the left/right locations.
+    class ShortFrameState
+      include FrameCodonHelpers
+      attr_reader :seq, :ntseq_pos, :min_size_codons, :codons
+      def initialize seq, ntseq_pos, ntmin_size  # seq: nucleotides, ntseq_pos: offset in the parent (nucleotides), ntmin_size: minimum ORF size (nucleotides)
+        @reversed = nil  # nil marks the forward variant (see get_codon_orfs1)
+        # @seq = seq.upcase
+        @seq = seq
+        @min_size_codons = if ntmin_size > 3
+                             (ntmin_size/3).to_i
+                           else
+                             2  # otherwise we get single STOP codons
+                           end
+        @codons = FrameCodonSequence.new(seq,ntseq_pos)  # NOTE(review): second argument is documented there as a codon position but receives the nucleotide offset - confirm
+        @ntseq_pos = ntseq_pos # nucleotides
+        # @codons.track_sequence_pos = seq_pos
+      end
+      # Return a list of ORFs delimited by STOP codons.
+      def get_stopstop_orfs
+        get_codon_orfs1(Proc.new { | codon | STOP_CODONS.include?(codon) },false,true)  # leftmost incomplete ORF excluded, leading STOP stripped
+      end
+      # Return a list of ORFs delimited by START-STOP codons
+      def get_startstop_orfs
+        get_codon_orfs2(
+                 Proc.new { | codon | STOP_CODONS.include?(codon) },
+                 Proc.new { | codon | START_CODONS.include?(codon) })
+      end
+      # Splitter for one delimiter function. +do_include_leftmost_orf+ decides
+      # whether the first sequence is returned when it does not start on a
+      # delimiter. +do_strip_leading_codon+ removes the leading delimiter codon,
+      # which is shared with the preceding sequence. Returns the list of ORFs.
+      def get_codon_orfs1 splitter_func,do_include_leftmost_orf,do_strip_leading_codon
+        orfs = split(@codons,splitter_func)
+        return [] if orfs.size == 0
+        # Drop the first sequence, if there is no match on the first position
+        orfs.shift if !do_include_leftmost_orf and !splitter_func.call(orfs.first[0])
+        orfs = orfs.map { |codons|
+          codons = codons.shift if do_strip_leading_codon and splitter_func.call(codons[0])  # FrameCodonSequence#shift, which also advances pos by one
+          codons
+        }
+        if @reversed == nil
+          TrackSequenceTrait.update_sequence_pos(orfs,@ntseq_pos) # nail against parent
+        else
+          TrackSequenceTrait.update_reversed_sequence_pos(orfs,@ntseq_pos) # nail against parent
+        end
+      end
+      # Splitter for two delimiter functions
+      def get_codon_orfs2 splitter_func, start_func
+        orfs = get_codon_orfs1(splitter_func,true,true)  # leftmost ORF included, leading delimiter stripped
+        orfs.find_all { | orf | start_func.call(orf[0]) }  # keep only ORFs whose first codon satisfies start_func
+      end
+      # Return list of codon sequences, split on the +is_splitter_func+
+      # function. Each emitted sequence ends on a splitter codon; codons after
+      # the last splitter are not emitted.
+      def split codons, is_splitter_func
+        list = []
+        node = []  # codons collected since the previous splitter
+        codons.each_with_index do | c, pos |
+          # p [c,pos]
+          if is_splitter_func.call(c)
+            node.push c  # close the current node on the splitter codon
+            size = node.size
+            # p node
+            list.push FrameCodonSequence.new(node,pos+1-size) if size > @min_size_codons  # pos+1-size is the codon index where this node started
+            node = []
+          end
+          node.push c  # every codon is appended; a splitter codon thereby also opens the next node
+        end
+        list
+      end
+    end
+    # This is the reversed version, which is rather the same as the forward,
+    # though the tracked ntseq_pos should be seen from the end of the sequence,
+    # as we are emitting sequences from the end(!) Also we need to make sure
+    # the sequence is always in frame (from the left).
+    class ShortReversedFrameState < ShortFrameState
+      attr_accessor :reversed
+      def initialize seq, ntseq_pos, ntmin_size  # same arguments as ShortFrameState#initialize
+        @reversed = true  # NOTE(review): ShortFrameState#initialize (called via super below) assigns @reversed = nil, so this value does not survive - confirm
+        chop = seq.size % 3 # align on codons
+        super seq[chop..-1],ntseq_pos,ntmin_size  # codons are built from the sequence with the leading remainder removed
+        @seq = seq # but record full seq
+      end
+    end
+    class OrfEmitter
+      # 6-frame ORF emitter for (growing) sequences from the +emit+
+      # object. Type can be a symbol or a function. Symbols are
+      #
+      #   :stopstop   All sequences from STOP to STOP codon
+      #   :startstop  All sequences from START to STOP codon
+      #
+      # size control is in nucleotides.
+      #
+      # The difference with most other getorf implementations, including
+      # EMBOSS, is that:
+      #
+      # 1) ORFs get emitted during the reading of large continuous sequences,
+      #    e.g. chromosomes.
+      # 2) This allows processing in parallel to IO, even on a single CPU
+      # 3) ORFs come with splitting CODONs
+      # 4) Bordering ORFs are not included (by default), which is somehow
+      #    not easy with EMBOSS getorf
+      #
+      # I have carefully designed this code, so it is easy to reason about
+      # the steps and prove correct. It is easy to understand, and
+      # therefore to parallelize correctly. Some features are:
+      #
+      # 5) Emit size does not matter for correctness
+      # 6) Reverse strands are positioned according to
+      #    GFF3 on the parent contig
+      #
+      def initialize emit, type, min_size=30, max_size=nil
+        @em = emit  # must respond to emit_seq, yielding part, index, tag, seq
+        @type = type  # NOTE(review): stored but not read by emit_forward/emit_reverse below, which always use get_stopstop_orfs
+        @min_size = min_size  # NOTE(review): stored but not used below; the frame states are created with a minimum size of 0
+        @max_size = max_size  # NOTE(review): stored but not used below
+      end
+      # Concats sequences from the emitter and yields the
+      # contained ORFs for every resulting frame (-3..-1, 1..3 ). Note
+      # that for the reverse frame, the resulting sequence is complemented!
+      # Translate these sequences in a forward frame only.
+      #
+      # First :head, then :mid parts get emitted, closed by the :tail part.
+      # Yields frame, index, tag, nucleotide position and the ORF sequence.
+      def emit_seq
+        @em.emit_seq do | part, index, tag, seq |
+          # p [part, seq]
+          # case part do
+          #   when :head
+          #   when :mid
+          #   when :tail
+          # end
+          emit_forward(part, index, tag, seq) { |*x| yield(*x) }  # frames 1..3
+          emit_reverse(part, index, tag, seq) { |*x| yield(*x) }  # frames -1..-3
+        end
+      end
+      private
+      def emit_forward(part, index, tag, seq)  # +part+ is accepted but not used here
+        # Yield frame 1..3
+        (1..3).each do | frame |
+          fr = ShortFrameState.new seq[frame-1..-1],0,0  # frame n starts at nucleotide n-1; positions are relative to that substring
+          orfs = fr.get_stopstop_orfs
+          orfs.each do | orf |
+            yield frame, index, tag, orf.track_ntseq_pos, orf.to_seq
+          end
+        end
+      end
+      def emit_reverse(part, index, tag, seq)  # +part+ is accepted but not used here
+        # Yield frame -1..-3
+        ntseq = Bio::Sequence::NA.new(seq)  # defined outside this file - presumably BioRuby; confirm
+        rev_seq = ntseq.complement  # NOTE(review): whether this also reverses the sequence depends on Bio::Sequence::NA - confirm
+        (1..3).each do | frame |
+          fr = ShortReversedFrameState.new rev_seq[0..rev_seq.size-frame],0,0  # frame n drops the last n-1 nucleotides
+          orfs = fr.get_stopstop_orfs
+          orfs.each do | orf |
+            yield(-frame,index,tag,orf.track_ntseq_pos,orf.to_seq)
+          end
+        end
+      end
+    end
+  end
+end

data/lib/bigbio/db/fasta/fastaindex.rb ADDED Viewed

@@ -0,0 +1,3 @@
+class FastaIndex  # placeholder: no methods or state are defined here
+end

data/lib/bigbio/db/fasta/fastapairedreader.rb ADDED Viewed

@@ -0,0 +1,19 @@
+# FASTA paired reader keeps track of two FASTA files containing
+# matching NT and AA sequences, each read through its own FastaReader.
+#
+class FastaPairedReader
+  def initialize ntfn, aafn, opts={:regex => '(\S+)'}  # ntfn/aafn: NT and AA file names; opts is passed to both FastaReaders
+    @nt = FastaReader.new(ntfn, opts)
+    @aa = FastaReader.new(aafn, opts)
+  end
+  # return a NT+AA pair (a FastaPairedRecord) for +id+
+  def get id  # NOTE(review): FastaReader#get raises unless opts[:index] was set, and the default opts carry no :index - confirm callers pass it
+    nt = @nt.get(id)  # nil when the id is not indexed
+    aa = @aa.get(id)
+    FastaPairedRecord.new(nt, aa)  # NOTE(review): a nil record is not handled here; FastaPairedRecord#initialize calls nt.id - confirm
+  end
+end

data/lib/bigbio/db/fasta/fastapairedwriter.rb ADDED Viewed

@@ -0,0 +1,21 @@
+# Paired FASTA writer (tracks matching NT and AA sequences in two
+# FASTA files, each written through its own FastaWriter)
+#
+class FastaPairedWriter
+  def initialize ntfn, aafn  # ntfn/aafn: output file names for the NT and AA records
+    @nt = FastaWriter.new(ntfn)
+    @aa = FastaWriter.new(aafn)
+  end
+  def write rec  # rec must respond to nt and aa (e.g. a FastaPairedRecord)
+    @nt.write rec.nt
+    @aa.write rec.aa
+  end
+  def close  # closes both underlying writers
+    @nt.close
+    @aa.close
+  end
+end

data/lib/bigbio/db/fasta/fastareader.rb ADDED Viewed

@@ -0,0 +1,132 @@
+# Indexed FastaReader
+#
+require 'bigbio/db/fasta/indexer'
+class FastaReader  # reads FASTA records from a file, optionally indexing them by id through Indexer
+  include Indexer
+  # Initialize the reader of FASTA file _fn_. Options can be :regex (pattern
+  # whose first capture group becomes the record id) and :index (true/false)
+  def initialize fn, opts = {}
+    @f = File.open(fn)  # stays open until close is called
+    @fread_once = false  # becomes true after one complete parse_each pass
+    @regex = opts[:regex]
+    @regex = '^(\S+)' if @regex == nil  # default: the first whitespace-delimited token of the header
+    indexer_use opts[:index]
+  end
+  # Parse the FASTA file and yield id, descr, sequence. When the indexer is on
+  # it will index the records the first time. Note that, with indexing, when
+  # you don't complete parsing there will be an error the second time. This is
+  # a trade-off, otherwise one would always have to index the file and read
+  # it twice.
+  def parse_each
+    @f.seek 0    # force file rewind
+    @rec_fpos = 0  # file offset of the current record's header line
+    @rec_line = @f.gets  # header line of the current record
+    fpos = 0
+    @count = 0
+    begin
+      # digest id from record description
+      id, descr = digest_tag(@rec_line)
+      id_fpos = @rec_fpos
+      # parse the sequence
+      seq = ""
+      begin
+        fpos = @f.tell  # offset of the line about to be read; ends up as the next header's offset
+        line = @f.gets  # NOTE(review): gets returns nil at end of file (e.g. a header as the very last line); line.strip below would then raise - confirm
+        break if line =~ /^>/
+        seq += line.strip
+      end while !@f.eof
+      # new record
+      @count += 1
+      @rec_fpos = fpos
+      @rec_line = line
+      # p [@rec_line, id, id_fpos]
+      indexer_set(id, id_fpos) if @indexer and not @fread_once  # index only during the first pass
+      yield id, descr, seq
+    end while !@f.eof  # NOTE(review): a header that is the last line of the file is read above but its record is never yielded - confirm
+    @fread_once = true
+  end
+  # returns a FastaRecord for every item (invokes parse_each)
+  def each
+    parse_each { | id, descr, seq | yield FastaRecord.new(id, descr, seq) }
+  end
+  def first  # returns the first record as a FastaRecord
+    parse_each { | id, descr, seq |
+      return FastaRecord.new(id, descr, seq)  # leaves parse_each early, so @fread_once is not set by this call
+    }
+  end
+  # Return a record by its +id+, nil when not found
+  def get id
+    indexed?  # builds the index with a full pass when needed
+    if fpos = indexer_get(id)  # raises when the reader was created without :index
+      get_rec(fpos)
+    else
+      nil
+    end
+  end
+  def get_rec fpos  # reads the record whose header line starts at file offset +fpos+
+    @f.seek fpos
+    tag = @f.gets
+    seq = ""
+    begin
+      line = @f.gets  # NOTE(review): nil at end of file, same concern as in parse_each - confirm
+      break if line =~ /^>/
+      seq += line.strip
+    end while !@f.eof
+    id, descr = digest_tag(tag)
+    FastaRecord.new(id,descr,seq)
+  end
+  def get_by_index idx  # returns the idx-th record in file-offset order, or nil
+    indexed?
+    if fpos = indexer_get_by_index(idx)[1]  # NOTE(review): an out-of-range idx gives nil from the indexer, and nil[1] raises - confirm
+      ret = get_rec(fpos)
+      return ret
+    end
+    nil
+  end
+  def digest_tag tag  # splits a header line into [id, descr]; raises when it cannot
+    if tag =~ /^>/
+      descr = $'.strip  # everything after the '>'
+      if descr =~ /#{@regex}/
+        id = $1  # first capture group of the configured regex
+        # p [descr,id]
+        return id, descr
+      end
+      p descr  # do not remove these
+      p @regex
+    end
+    raise "Can not digest '#{tag}' using '"+@regex+"'"
+  end
+  # Returns the size of the dataset - as read. After the final
+  # record the size represents the number of items in the FASTA file
+  def size
+    @count
+  end
+  def close  # closes the underlying file
+    @f.close
+  end
+  private
+  def indexed?  # makes sure the index is populated when indexing is on; always returns true
+    if @indexer and not @fread_once
+      # force indexer
+      # $stderr.print "Force indexer"
+      parse_each { | x, y, z | nil }
+    end
+    true
+  end
+end

data/lib/bigbio/db/fasta/fastarecord.rb ADDED Viewed

@@ -0,0 +1,39 @@
+class FastaRecord  # one FASTA entry: id, description and sequence
+  attr_accessor :id, :descr, :seq
+  def initialize id, descr, seq  # values are stored as given
+    @id = id
+    @descr = descr
+    @seq = seq
+  end
+end
+class FastaPairedRecord  # pairs an NT record with its AA record and checks that ids and lengths agree
+  attr_reader :nt, :aa
+  def initialize nt, aa  # nt, aa: records responding to id and seq; raises when ids or sizes do not match
+    @nt = nt
+    @aa = aa
+    raise "ID error NT #{nt.id} not matching AA #{aa.id}" if nt.id != aa.id
+    if nt.seq.size == aa.seq.size*3-1  # NT is one short of a full last codon
+      # account for EMBOSS cleverness
+      nt.seq.chop!  # chop! edits the passed-in record's string in place
+      nt.seq.chop!
+      aa.seq.chop!
+    end
+    if nt.seq.size == aa.seq.size*3-2  # NT is two short of a full last codon
+      # account for EMBOSS cleverness
+      nt.seq.chop!
+      aa.seq.chop!
+    end
+    if nt.seq.size == aa.seq.size*3-3  # AA has one residue more than the NT codons
+      aa.seq.chop!
+    end
+    raise "Sequence size mismatch for #{nt.id} <nt:#{nt.seq.size} != #{aa.seq.size*3} (aa:#{aa.seq.size}*3)>" if nt.seq.size != aa.seq.size*3
+  end
+  def id  # id of the pair, taken from the AA record
+    @aa.id
+  end
+end

data/lib/bigbio/db/fasta/fastawriter.rb ADDED Viewed

@@ -0,0 +1,20 @@
+# Fasta writer: writes records to a file, one header line and one sequence line each
+class FastaWriter
+  # Open a FASTA stream for writing
+  def initialize fn
+    @f = File.open(fn,"w")  # stays open until close is called
+  end
+  # write a FASTA item (must respond to id, descr and seq, all Strings)
+  def write item
+    @f.write ">"+item.id+' '+item.descr+"\n"  # NOTE(review): FastaReader#digest_tag keeps the id inside descr, so such records get the id twice - confirm intended
+    @f.write item.seq.strip+"\n"
+  end
+  def close  # closes the underlying file
+    @f.close
+  end
+end

data/lib/bigbio/db/fasta/indexer.rb ADDED Viewed

@@ -0,0 +1,33 @@
+# Indexer module for the FASTA class
+#
+# This is a simple memory based key storage (a Hash of key => file position)
+#
+module Indexer
+  # Start using the indexer
+  def indexer_use state  # creates the empty index only when +state+ is truthy
+    if state
+      @indexer = {}
+    end
+  end
+  def indexer_set key, fpos  # stores +fpos+ under +key+; raises when there is no index or the key is taken
+    raise "Trying to use 'set' when there is no index" if @indexer == nil
+    raise "Indexer key #{key} alread in use for <#{@indexer[key]}>!" if @indexer[key]
+    # p [key, fpos]
+    @indexer[key] = fpos
+  end
+  # Get the key, return nil when not found
+  def indexer_get key  # raises when there is no index
+    raise "Trying to use 'get' when there is no index" if @indexer == nil
+    # raise "Indexer key #{key} not found!" if !@indexer[key]
+    @indexer[key]
+  end
+  def indexer_get_by_index idx  # returns the [key, fpos] pair at position idx in fpos order, nil when out of range
+    @indexer.sort {|a,b| a[1]<=>b[1]} [idx]  # NOTE(review): no 'no index' guard as in set/get, and the pairs are re-sorted on every call - confirm acceptable
+  end
+end

data/lib/bigbio/db/fasta.rb ADDED Viewed

@@ -0,0 +1,13 @@
+# fasta.rb
+#
+# This is a BioLib reference implementation of a FASTA reader and writer in
+# Ruby.
+#
+# by Pjotr Prins (c) 2009
+#
+require 'bigbio/db/fasta/fastarecord'
+require 'bigbio/db/fasta/fastareader'
+require 'bigbio/db/fasta/fastawriter'
+require 'bigbio/db/fasta/fastapairedreader'
+require 'bigbio/db/fasta/fastapairedwriter'

data/lib/bigbio/environment.rb ADDED Viewed

@@ -0,0 +1,12 @@
+require 'singleton'
+module Bio
+  module Big
+    class Environment  # process-wide settings holder; obtain it with Environment.instance
+      include Singleton
+      attr_accessor :log, :biolib  # values are set by the caller; nothing here assigns defaults
+    end
+  end
+end