RubyGems - bio-maf - Versions diffs - 0.1.0 → 0.2.0 - Mend

bio-maf 0.1.0 → 0.2.0

Files changed (26) hide show

data/.gitignore +53 -0
data/DEVELOPMENT.md +29 -0
data/Gemfile +1 -0
data/README.md +69 -1
data/Rakefile +4 -3
data/bin/find_overlaps +21 -0
data/bin/maf_tile +103 -0
data/bio-maf.gemspec +43 -0
data/features/gap-filling.feature +158 -0
data/features/gap-removal.feature +50 -0
data/features/step_definitions/gap-filling_steps.rb +32 -0
data/features/step_definitions/gap_removal_steps.rb +19 -0
data/features/step_definitions/parse_steps.rb +2 -1
data/lib/bio/maf/index.rb +15 -8
data/lib/bio/maf/maf.rb +267 -0
data/lib/bio/maf/parser.rb +115 -175
data/lib/bio/maf/tiler.rb +167 -0
data/lib/bio/maf.rb +2 -0
data/man/maf_tile.1 +108 -0
data/man/maf_tile.1.ronn +104 -0
data/spec/bio/maf/index_spec.rb +1 -0
data/spec/bio/maf/parser_spec.rb +103 -0
data/spec/bio/maf/tiler_spec.rb +69 -0
data/test/data/gap-sp1.fa +6 -0
data/test/data/mm8_chr7_tiny.kct +0 -0
metadata +58 -3

data/lib/bio/maf/parser.rb CHANGED Viewed

@@ -9,142 +9,6 @@ module Bio
     # @api public
     class ParseError < Exception; end
-    # A MAF header, containing the variable-value pairs from the first
-    # line of the file as well as the alignment parameters.
-    # @api public
-    class Header
-      # Variable-value pairs from the ##maf line
-      # @return [Hash]
-      attr_accessor :vars
-      # Alignment parameters from the MAF header.
-      # @return [Hash]
-      attr_accessor :alignment_params
-      def initialize(vars, params)
-        @vars = vars
-        @alignment_params = params
-      end
-      # The required version parameter.
-      # @return [String]
-      def version
-        vars[:version]
-      end
-      # The optional scoring parameter, if present.
-      # @return [String]
-      def scoring
-        vars[:scoring]
-      end
-    end
-    # A MAF alignment block.
-    # @api public
-    class Block
-      # Parameters from the 'a' line starting the alignment block.
-      attr_reader :vars
-      # Sequences, one per 's' or 'e' line.
-      # @return [Array<Sequence>]
-      attr_reader :sequences
-      # Offset of the alignment block within the MAF file, in bytes.
-      # @return [Integer]
-      attr_reader :offset
-      # Size of the alignment block within the MAF file, in bytes.
-      # @return [Integer]
-      attr_reader :size
-      def initialize(*args)
-        @vars, @sequences, @offset, @size = args
-      end
-      def raw_seq(i)
-        sequences.fetch(i)
-      end
-      def each_raw_seq
-        sequences.each { |s| yield s }
-      end
-      # Text size of the alignment block. This is the number of text
-      # characters in each line of sequence data, including dashes and
-      # other gaps in the sequence.
-      def text_size
-        sequences.first.text.size
-      end
-    end
-    # A sequence within an alignment block.
-    # @api public
-    class Sequence
-      # @return [String] Source sequence name.
-      attr_reader :source
-      # @return [Integer] Zero-based start position.
-      attr_reader :start
-      # @return [Integer] Size of aligning region in source sequence.
-      attr_reader :size
-      # :+ or :-, indicating which strand the alignment is to.
-      # @return [Symbol]
-      attr_reader :strand
-      # Size of the entire source sequence, not just the aligning
-      # region.
-      # @return [Integer]
-      attr_reader :src_size
-      # Sequence data for the alignment, including insertions.
-      # @return [String]
-      attr_reader :text
-      # Array of raw synteny information from 'i' line.
-      # @return [Array<String>]
-      attr_accessor :i_data
-      # Quality string from 'q' line.
-      # @return [String]
-      attr_accessor :quality
-      alias_method :source_size, :src_size
-      def initialize(*args)
-        @source, @start, @size, @strand, @src_size, @text = args
-      end
-      # Whether this sequence is empty. Only true for {EmptySequence}
-      # instances from 'e' lines.
-      def empty?
-        false
-      end
-      def write_fasta(writer)
-        writer.write("#{source}:#{start}-#{start + size}",
-                     text)
-      end
-    end
-    # An empty sequence record from an 'e' line.
-    #
-    # This indicates that "there isn't aligning DNA for a species but
-    # that the current block is bridged by a chain that connects
-    # blocks before and after this block" (MAF spec).
-    # @api public
-    class EmptySequence < Sequence
-      attr_reader :status
-      def initialize(*args)
-        super(*args[0..4])
-        @status = args[5]
-      end
-      def text
-        ''
-      end
-      def empty?
-        true
-      end
-      def write_fasta(writer)
-        raise "empty sequence output not implemented!"
-      end
-    end
     # Reads MAF files in chunks.
     # @api private
     class ChunkReader
@@ -399,16 +263,25 @@ module Bio
           payload = s.rest
           s.pos = s.string.size # jump to EOS
         end
+        filtered = false
         lines = payload.split("\n")
         until lines.empty?
           line = lines.shift
           first = line.getbyte(0)
           if first == S
             seq = parse_seq_line(line, sequence_filter)
-            seqs << seq if seq
+            if seq
+              seqs << seq
+            else
+              filtered = true
+            end
           elsif first == E && parse_empty
             e_seq = parse_empty_line(line, sequence_filter)
-            seqs << e_seq if e_seq
+            if e_seq
+              seqs << e_seq
+            else
+              filtered = true
+            end
           elsif first == I && parse_extended
             parts = line.split
             parse_error("wrong i source #{parts[1]}!") unless seqs.last.source == parts[1]
@@ -423,10 +296,19 @@ module Bio
             parse_error "unexpected line: '#{line}'"
           end
         end
-        return Block.new(block_vars,
-                         seqs,
-                         block_offset,
-                         s.pos - block_start_pos)
+        block = Block.new(block_vars,
+                          seqs,
+                          block_offset,
+                          s.pos - block_start_pos,
+                          filtered)
+        postprocess_block(block)
+      end
+      def postprocess_block(block)
+        # Gap removal is only attempted on blocks that lost sequences to
+        # filtering, and only when the :remove_gaps option is set.
+        wants_gap_removal = block.filtered? && opts[:remove_gaps]
+        block.remove_gaps! if wants_gap_removal
+        block
       end
       # Parse an 's' line.
@@ -503,12 +385,13 @@ module Bio
     # A MAF parsing context, used for random-access parsing.
     class ParseContext
       include MAFParsing
-      attr_accessor :f, :s, :cr, :parser
+      attr_accessor :f, :s, :cr, :parser, :opts
       attr_accessor :chunk_start, :last_block_pos, :at_end
-      def initialize(fd, chunk_size, parser, opts)
+      def initialize(fd, chunk_size, parser)
         @f = fd
         @parser = parser
+        @opts = parser.opts
         reader = opts[:chunk_reader] || ChunkReader
         @cr = reader.new(@f, chunk_size)
         @last_block_pos = -1
@@ -580,6 +463,7 @@ module Bio
     #
     #  * `:parse_extended`: whether to parse 'i' and 'q' lines
     #  * `:parse_empty`: whether to parse 'e' lines
+    #  * `:remove_gaps`: remove gaps left after filtering sequences
     #  * `:chunk_size`: read MAF file in chunks of this many bytes
     #  * `:random_chunk_size`: as above, but for random access ({#fetch_blocks})
     #  * `:merge_max`: merge up to this many bytes of blocks for
@@ -611,9 +495,6 @@ module Bio
       attr_reader :chunk_start
       # @return [Integer] offset of the last block start in this chunk.
       attr_reader :last_block_pos
-      # Sequence filter to apply.
-      # @api public
-      attr_accessor :sequence_filter
       # @api private
       attr_accessor :parse_extended
@@ -630,6 +511,9 @@ module Bio
       # @api public
       def initialize(file_spec, opts={})
         @opts = opts
+        if RUBY_PLATFORM == 'java'
+          opts[:threads] ||= java.lang.Runtime.runtime.availableProcessors
+        end
         chunk_size = opts[:chunk_size] || SEQ_CHUNK_SIZE
         @random_access_chunk_size = opts[:random_chunk_size] || RANDOM_CHUNK_SIZE
         @merge_max = opts[:merge_max] || MERGE_MAX
@@ -654,7 +538,7 @@ module Bio
       def context(chunk_size)
         # IO#dup calls dup(2) internally, but seems broken on JRuby...
         fd = File.open(file_spec)
-        ParseContext.new(fd, chunk_size, self, @opts)
+        ParseContext.new(fd, chunk_size, self)
       end
       # Execute the given block with a {ParseContext} using the given
@@ -671,6 +555,20 @@ module Bio
         end
       end
+      # Sequence filter to apply.
+      # @api public
+      # @return [Hash]
+      def sequence_filter
+        # Created lazily so callers can always index into the filter.
+        @sequence_filter = {} unless @sequence_filter
+        @sequence_filter
+      end
+      # Set the sequence filter.
+      # @api public
+      # @param [Hash] filter the new filter
+      attr_writer :sequence_filter
       # Fetch and parse blocks given by `fetch_list`.
       #
       # `fetch_list` should be an array of `[offset, length]` tuples.
@@ -723,25 +621,22 @@ module Bio
           # TODO: break entries up into longer runs for more
           # sequential I/O
           jobs = java.util.concurrent.ConcurrentLinkedQueue.new(fetch_list)
-          completed = java.util.concurrent.LinkedBlockingQueue.new(128)
+          ct = CompletionTracker.new(fetch_list)
+          completed = ct.queue
           threads = []
-          n_threads.times { threads << make_worker(jobs, completed) }
-          n_completed = 0
-          while (n_completed < fetch_list.size)
-            c = completed.poll(5, java.util.concurrent.TimeUnit::SECONDS)
-            if c.nil?
-              if threads.find { |t| t.alive? }
-                next
-              else
-                raise "No threads alive, completed #{n_completed}/#{fetch_list.size} jobs!"
-              end
+          n_threads.times { threads << make_worker(jobs, ct) }
+          n_res = 0
+          while n_res < fetch_list.size
+            c = completed.poll(1, java.util.concurrent.TimeUnit::SECONDS)
+            unless c
+              raise "Worker failed!" if threads.find { |t| t.status.nil? }
+              next
             end
-            raise "worker failed: #{c}" if c.is_a? Exception
             c.each do |block|
               y << block
             end
-            n_completed += 1
+            n_res += 1
           end
           threads.each { |t| t.join }
           elapsed = Time.now - start
@@ -758,26 +653,25 @@ module Bio
       # Create a worker thread for parallel parsing.
       #
       # @see #fetch_blocks_merged_parallel
-      def make_worker(jobs, completed)
+      def make_worker(jobs, ct)
         Thread.new do
-          with_context(@random_access_chunk_size) do |ctx|
-            while true
-              req = jobs.poll
-              break unless req
-              begin
+          begin
+            with_context(@random_access_chunk_size) do |ctx|
+              while true
+                req = jobs.poll
+                break unless req
                 n_blocks = req[2].size
                 blocks = ctx.fetch_blocks(*req).to_a
                 if blocks.size != n_blocks
                   raise "expected #{n_blocks}, got #{blocks.size}: #{e.inspect}"
                 end
-                completed.put(blocks)
-              rescue Exception => e
-                completed.put(e)
-                $stderr.puts "Worker failing: #{e.class}: #{e}"
-                $stderr.puts e.backtrace.join("\n")
-                raise e
+                ct << blocks
               end
             end
+          rescue Exception => e
+            $stderr.puts "Worker failing: #{e.class}: #{e}"
+            $stderr.puts e.backtrace.join("\n")
+            raise e
           end
         end
       end
@@ -860,14 +754,19 @@ module Bio
         end
         Enumerator.new do |y|
           saw_eof = false
-          while worker.alive?
+          n_final_poll = 0
+          while true
             block = queue.poll(1, java.util.concurrent.TimeUnit::SECONDS)
             if block == :eof
               saw_eof = true
               break
             elsif block
               y << block
+            else
+              # timed out
+              n_final_poll += 1 unless worker.alive?
             end
+            break if n_final_poll > 1
           end
           unless saw_eof
             raise "worker exited unexpectedly!"
@@ -883,6 +782,47 @@ module Bio
     end
+    # Hands completed block lists to a queue in the order given by the
+    # fetch list, holding back any that arrive ahead of their turn.
+    class CompletionTracker
+      attr_reader :queue, :offsets, :delayed
+      # @param [Array] fetch_list entries whose first element is the
+      #   offset identifying the entry
+      def initialize(fetch_list)
+        @sem = Mutex.new
+        @delayed = {}
+        @queue = java.util.concurrent.LinkedBlockingQueue.new(128)
+        @offsets = fetch_list.map { |entry| entry[0] }
+      end
+      # Offset of the entry that must be queued next, or nil if none
+      # remain.
+      def next_expected
+        offsets.first
+      end
+      # Submit a completed list of blocks, identified by the offset of
+      # its first block.
+      def <<(blocks)
+        @sem.synchronize do
+          first_offset = blocks.first.offset
+          if first_offset != next_expected
+            # arrived early; park it until its turn comes
+            delayed[first_offset] = blocks
+          else
+            offsets.shift
+            queue.put(blocks)
+            drain_delayed
+          end
+        end
+      end
+      # Queue every parked result that has now become the next expected.
+      def drain_delayed
+        loop do
+          ready = delayed.delete(next_expected)
+          break unless ready
+          offsets.shift
+          queue.put(ready)
+        end
+      end
+    end
+    # Bare host class for MAFParsing; exposes parser internals for unit tests.
+    class DummyParser
+      include MAFParsing
+    end
   end
 end

data/lib/bio/maf/tiler.rb ADDED Viewed

@@ -0,0 +1,167 @@
+require 'zlib'
+module Bio::MAF
+  # Tiles a given genomic interval.
+  # Inspired by: lib/bx/align/tools/tile.py in bx-python
+  class Tiler
+    # Index queried with +find+ for blocks overlapping the interval.
+    attr_accessor :index
+    # Parser passed to the index; #tile sets its :only_species filter.
+    attr_accessor :parser
+    # Optional reference sequence source: a String, or an object
+    # responding to +read_interval+.
+    attr_accessor :reference
+    # GenomicInterval
+    attr_accessor :interval
+    # Species to tile, in output order. The first entry receives the
+    # reference text for regions no block covers.
+    attr_accessor :species
+    # Optional mapping of species to the name used by #write_fasta.
+    attr_accessor :species_map
+    def initialize
+      @species_map = {}
+    end
+    # Fetch reference data for the given range of positions.
+    # @param [Range] range positions to read
+    # @return [String, nil] reference data, or nil if no reference is set
+    def ref_data(range)
+      if reference
+        if reference.respond_to? :read_interval
+          reference.read_interval(range.begin, range.end)
+        elsif reference.is_a? String
+          reference.slice(range)
+        else
+          raise "Unhandled reference data source: #{reference}"
+        end
+      else
+        nil
+      end
+    end
+    # Build one string of text per species covering the whole interval.
+    #
+    # Blocks are applied in ascending order of their :score, so where
+    # blocks overlap the one sorted last determines the text. Positions
+    # covered by no block get reference data (or 'N' when no reference
+    # is set) for the first species and '*' for all others.
+    #
+    # As a side effect, sets the parser's :only_species filter.
+    # @return [Array<String>] text for each species, in species order
+    def tile
+      parser.sequence_filter[:only_species] = @species
+      # TODO: remove gaps
+      blocks = index.find([interval], parser).sort_by { |b| b.vars[:score] }
+      # One mask entry per interval position: :ref or the covering block.
+      mask = Array.new(interval.length, :ref)
+      i_start = interval.zero_start
+      i_end = interval.zero_end
+      if reference
+        ref_region = ref_data(i_start...i_end)
+      end
+      blocks.each do |block|
+        ref = block.ref_seq
+        # Clip the block to the interval before marking the mask.
+        slice_start = [i_start, ref.start].max
+        slice_end = [i_end, ref.end].min
+        mask.fill(block,
+                  (slice_start - i_start)...(slice_end - i_start))
+      end
+      text = species.map { '' }
+      nonref_text = text[1...text.size]
+      runs(mask) do |range, block|
+        # range is relative to the interval; g_range is the same span
+        # shifted by the interval's zero-based start.
+        g_range = (range.begin + i_start)...(range.end + i_start)
+        if block == :ref
+          # not covered by an alignment block
+          # use the reference sequence if given, otherwise 'N'
+          range_size = range.end - range.begin
+          text[0] << if ref_region
+                       ref_region.slice(range)
+                     else
+                       'N' * range_size
+                     end
+          stars = '*' * range_size
+          nonref_text.each { |t| t << stars }
+        else
+          # covered by an alignment block
+          t_range = block.ref_seq.text_range(g_range)
+          species.each_with_index do |sp_name, i|
+            sp_text = text[i]
+            seq = block.sequences.find { |s| s.source == sp_name || s.species == sp_name }
+            if seq
+              # got alignment text
+              sp_text << seq.text.slice(t_range)
+            else
+              # no alignment for this one here, use '*'
+              sp_text << '*' * (t_range.end - t_range.begin)
+            end
+          end
+        end
+      end
+      text
+    end
+    # Write the tiled text to +f+ as FASTA, one record per species,
+    # using the name from species_map where one is given.
+    def write_fasta(f)
+      species.zip(tile()) do |sp_name, text|
+        sp_out = species_map[sp_name] || sp_name
+        f.puts ">#{sp_out}"
+        f.puts text
+      end
+    end
+    # Yield (range, obj) for each maximal run of consecutive mask
+    # entries that are the same object (compared with equal?).
+    def runs(mask)
+      # An empty mask has no runs; the trailing yield below would
+      # otherwise have no valid run to report.
+      return if mask.empty?
+      cur = nil
+      cur_start = nil
+      mask.each_with_index do |obj, i|
+        if ! cur.equal?(obj)
+          yield(cur_start...i, cur) if cur
+          cur = obj
+          cur_start = i
+        end
+      end
+      yield(cur_start...mask.size, cur)
+    end
+  end
+  # Reads zero-based ranges of sequence data from FASTA input, given
+  # either as an already-open seekable object or as a path, which may
+  # name a gzipped file.
+  class FASTARangeReader
+    # f is the underlying handle; pos is the zero-based sequence offset
+    # of the start of the next line to be examined.
+    attr_reader :f, :pos
+    # @param fspec an object responding to +seek+, or a path; paths
+    #   ending in ".gz" are opened with Zlib::GzipReader
+    def initialize(fspec)
+      if fspec.respond_to? :seek
+        @f = fspec
+      else
+        # Escape the dot and anchor at end of string so only a real
+        # ".gz" suffix selects the gzip reader.
+        reader_class = if fspec =~ /\.gz\z/
+                         Zlib::GzipReader
+                       else
+                         File
+                       end
+        @f = reader_class.open(fspec)
+      end
+      position_at_start
+    end
+    GT = '>'.getbyte(0)
+    # Consume the description line at the current handle position and
+    # reset the sequence offset to zero.
+    # @raise [RuntimeError] if the line read does not start with '>'
+    def position_at_start
+      first = f.readline
+      raise "expected FASTA comment" unless first =~ /^>/
+      @pos = 0
+      @line = nil
+    end
+    # Read the sequence data in the zero-based, half-open interval
+    # [z_start, z_end).
+    #
+    # Intervals may be requested in any order; a request starting
+    # before the current position rewinds the handle first.
+    # @param [Integer] z_start zero-based start position
+    # @param [Integer] z_end zero-based end position (exclusive)
+    # @return [String] the data read; shorter than requested if the
+    #   input ends first, and empty for an empty interval
+    # @raise [RuntimeError] if a second description line is encountered
+    def read_interval(z_start, z_end)
+      region_size = z_end - z_start
+      data = ''
+      return data unless region_size > 0
+      if z_start < pos
+        # Lines are only read forwards, so start over from the top.
+        f.rewind
+        position_at_start
+      end
+      loop do
+        line = current_line
+        break unless line
+        if z_start < pos + line.size
+          # Only the first contributing line starts mid-line.
+          offset = [z_start - pos, 0].max
+          data << line.slice(offset, region_size - data.size)
+          # Leave this line buffered: the next interval may begin in it.
+          break if data.size >= region_size
+        end
+        @pos += line.size
+        @line = nil
+      end
+      data
+    end
+    private
+    # The stripped sequence line beginning at pos, read on demand and
+    # kept until read_interval moves past it; nil at end of input.
+    def current_line
+      return @line if @line
+      line_raw = f.gets
+      return nil unless line_raw
+      if line_raw.getbyte(0) == GT
+        raise "unexpected description line: #{line_raw.inspect}"
+      end
+      @line = line_raw.strip
+    end
+  end
+end

data/lib/bio/maf.rb CHANGED Viewed

@@ -1,4 +1,6 @@
+require 'bio/maf/maf'
 require 'bio/maf/struct'
 require 'bio/maf/index'
 require 'bio/maf/parser'
 require 'bio/maf/writer'
+require 'bio/maf/tiler'

data/man/maf_tile.1 ADDED Viewed

@@ -0,0 +1,108 @@
+.\" generated with Ronn/v0.7.3
+.\" http://github.com/rtomayko/ronn/tree/0.7.3
+.
+.TH "MAF_TILE" "1" "June 2012" "Clayton Wheeler" "BioRuby Manual"
+.
+.SH "NAME"
+\fBmaf_tile\fR \- synthesize an alignment for a given region
+.
+.SH "SYNOPSIS"
+\fBmaf_tile\fR [\fIoptions\fR] \-i BEGIN:END [\-s SPECIES[:NAME] \.\.\.] \fImaf\fR \fIindex\fR
+.
+.P
+\fBmaf_tile\fR [\fIoptions\fR] \-\-bed BED \-o BASE [\-s SPECIES[:NAME] \.\.\.] \fImaf\fR \fIindex\fR
+.
+.SH "DESCRIPTION"
+\fBmaf_tile\fR takes a MAF file with index (generated by maf_index(1)), extracts alignment blocks overlapping the given genomic interval, and constructs a single alignment block covering the entire interval for the specified species\. Optionally, any gaps in coverage of the MAF file\'s reference sequence can be filled in from a FASTA sequence file\.
+.
+.P
+If a single interval is specified, the output will be written to stdout in FASTA format\. If the \fB\-\-output\-base\fR option is specified, \fB_<start>:<end>\.fa\fR will be appended to the given \fIBASE\fR parameter and used to construct the output path\. If a BED file is specified with \fB\-\-bed\fR, \fB\-\-output\-base\fR is also required\.
+.
+.P
+Species can be renamed for output by specifying them as SPECIES:NAME; the first component will be used to select the species from the MAF file, and the second will be used in the FASTA description line for output\.
+.
+.SH "OPTIONS"
+.
+.TP
+\fB\-r\fR, \fB\-\-reference SEQ\fR
+The FASTA reference sequence file given, which may be gzipped, will be used to fill in any gaps between alignment blocks\.
+.
+.TP
+\fB\-i\fR, \fB\-\-interval BEGIN:END\fR
+The given zero\-based genomic interval will be used to select alignment blocks from the MAF file\.
+.
+.TP
+\fB\-s\fR, \fB\-\-species SPECIES[:NAME]\fR
+The given species will be selected for output\. If given as \fBspecies:name\fR, it will appear in the FASTA output as \fIname\fR\.
+.
+.TP
+\fB\-b\fR, \fB\-\-bed BED\fR
+The given BED file will be used to provide a list of intervals to process\. If present, \fB\-\-interval\fR will be ignored and \fB\-\-output\-base\fR must be given as well\.
+.
+.TP
+\fB\-o\fR, \fB\-\-output\-base BASE\fR
+The given path will be used as the base name for output files, as described above\.
+.
+.SH "EXAMPLES"
+Generate an alignment of the \fBhg19\fR, \fBpetMar1\fR, and \fBornAna1\fR sequences from \fBchrY\.maf\fR over the interval 14400 to 15000 on the reference sequence of the MAF file\. Fills in gaps from \fBchrY\.refseq\.fa\.gz\fR\. Writes FASTA output to stdout\.
+.
+.IP "" 4
+.
+.nf
+$ maf_tile \-\-reference ~/maf/chrY\.refseq\.fa\.gz \e
+  \-\-interval 14400:15000 \e
+  \-s hg19:human \-s petMar1 \-s ornAna1 \e
+  chrY\.maf chrY\.kct
+>human
+GGGTGACGAAAAGAGCCGA\-\-\-\-\-[\.\.\.]
+>petMar1
+gagtgccggggagtgccggggagt[\.\.\.]
+>ornAna1
+AGGGATCTGGGAATTCTGG\-\-\-\-\-[\.\.\.]
+.
+.fi
+.
+.IP "" 0
+.
+.P
+Write out a FASTA file for each interval in the given BED file, prefixed with \fB/tmp/mm8\fR, and without filling in data from a reference sequence:
+.
+.IP "" 4
+.
+.nf
+$ maf_tile \-\-bed /tmp/mm8\.bed \-\-output\-base /tmp/mm8 \e
+  \-s mm8:mouse \-s rn4:rat \-s hg18:human \e
+  mm8_chr7_tiny\.maf mm8_chr7_tiny\.kct
+.
+.fi
+.
+.IP "" 0
+.
+.SH "FILES"
+The output is generated in FASTA format, with one sequence per species\.
+.
+.P
+The input \fImaf\fR file must be a Multiple Alignment Format file\.
+.
+.P
+The \fIindex\fR must be a MAF index built with maf_index(1)\.
+.
+.P
+If \fB\-\-bed\fR \fIbed\fR is specified, its argument must be a BED file\. Only the second and third columns will be used, to specify the zero\-based start and end positions of intervals\.
+.
+.SH "ENVIRONMENT"
+\fBmaf_tile\fR is a Ruby program and relies on ordinary Ruby environment variables\.
+.
+.SH "COPYRIGHT"
+\fBmaf_tile\fR is copyright (C) 2012 Clayton Wheeler\.
+.
+.SH "SEE ALSO"
+maf_index(1), ruby(1)
+.
+.IP "\(bu" 4
+\fIhttps://github\.com/csw/bioruby\-maf/\fR
+.
+.IP "" 0