bio-maf 0.3.2 → 1.0.0
- data/Gemfile +3 -0
- data/README.md +34 -3
- data/bin/maf_bgzip +56 -0
- data/bin/maf_index +0 -6
- data/bio-maf.gemspec +3 -2
- data/features/bgzf.feature +62 -0
- data/features/maf-indexing.feature +10 -0
- data/features/maf-querying.feature +9 -0
- data/features/step_definitions/convert_steps.rb +1 -1
- data/features/support/env.rb +2 -0
- data/lib/bio/maf/index.rb +35 -13
- data/lib/bio/maf/maf.rb +3 -1
- data/lib/bio/maf/parser.rb +135 -39
- data/lib/bio/maf/writer.rb +4 -4
- data/man/maf_bgzip.1 +101 -0
- data/man/maf_bgzip.1.ronn +85 -0
- data/man/maf_extract.1 +5 -2
- data/man/maf_extract.1.ronn +4 -1
- data/man/maf_index.1 +3 -3
- data/man/maf_index.1.ronn +3 -2
- data/man/maf_tile.1 +5 -2
- data/man/maf_tile.1.ronn +4 -1
- data/man/maf_to_fasta.1 +1 -1
- data/spec/bio/maf/index_spec.rb +7 -0
- data/spec/bio/maf/parser_spec.rb +13 -0
- data/spec/spec_helper.rb +2 -0
- data/test/data/gap-1.kct +0 -0
- data/test/data/mm8.chrM.maf +2421 -0
- data/test/data/mm8.chrM.maf.bgz +0 -0
- data/test/data/mm8_chr7_tiny.kct +0 -0
- data/test/data/mm8_chr7_tiny.maf.gz +0 -0
- data/test/data/mm8_chrM_tiny.kct +0 -0
- metadata +31 -3
data/Gemfile
CHANGED
data/README.md
CHANGED
````diff
@@ -8,9 +8,9 @@ support for the
 (MAF), used in bioinformatics to store whole-genome sets of multiple
 sequence alignments.
 
-…
-…
-… MAF files.
+This library provides indexed and sequential access to MAF data, as
+well as performing various manipulations on it and writing modified
+MAF files.
 
 For more information, see the
 [project wiki](https://github.com/csw/bioruby-maf/wiki).
@@ -94,6 +94,20 @@ parser = Bio::MAF::Parser.new("test/data/mm8_chr7_tiny.maf")
 idx = Bio::MAF::KyotoIndex.build(parser, "/tmp/mm8_chr7_tiny.kct", false)
 ```
 
+### Compress and index a MAF file
+
+This library fully supports [BGZF][]-compressed MAF files, which
+combine gzip compression with blocking for efficient random
+access. These can be generated with blocking optimized for MAF access
+using the included
+[`maf_bgzip(1)`](http://csw.github.com/bioruby-maf/man/maf_bgzip.1.html)
+tool. This writes BGZF-compressed MAF files and optionally indexes
+them as well:
+
+    $ maf_bgzip --dir /tmp --index --all test/data/mm8.chrM.maf
+
+This is the easiest way to prepare MAF files for use with this library.
+
 ### Extract blocks from an indexed MAF file, by genomic interval
 
 Refer to [`mm8_chr7_tiny.maf`](https://github.com/csw/bioruby-maf/blob/master/test/data/mm8_chr7_tiny.maf).
@@ -352,10 +366,27 @@ access.tile(interval) do |tiler|
 end
 ```
 
+### Compression
+
+MAF files can optionally be compressed in the [BGZF][] format defined
+in the [SAM specification][]. This is best done with
+[`maf_bgzip(1)`](http://csw.github.com/bioruby-maf/man/maf_bgzip.1.html),
+but files compressed with the `bgzip(1)` tool from samtools will also
+work, though less efficiently.
+
+[BGZF]: http://blastedbio.blogspot.com/2011/11/bgzf-blocked-bigger-better-gzip.html
+[SAM specification]: http://samtools.sourceforge.net/SAM1.pdf
+
+MAF files compressed with plain gzip will be decompressed on the fly,
+but random access to these files will not be possible. However,
+gzipped MAF files are suitable as input to
+[`maf_bgzip(1)`](http://csw.github.com/bioruby-maf/man/maf_bgzip.1.html).
+
 ### Command line tools
 
 Man pages for command line tools:
 
+* [`maf_bgzip(1)`](http://csw.github.com/bioruby-maf/man/maf_bgzip.1.html)
 * [`maf_index(1)`](http://csw.github.com/bioruby-maf/man/maf_index.1.html)
 * [`maf_extract(1)`](http://csw.github.com/bioruby-maf/man/maf_extract.1.html)
 * [`maf_to_fasta(1)`](http://csw.github.com/bioruby-maf/man/maf_to_fasta.1.html)
````
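The new README sections above cover the full BGZF workflow. As a sketch of the same flow from Ruby (output paths assume the `maf_bgzip --dir /tmp --index` invocation shown above; method names are taken from this diff and the README, so treat it as illustrative rather than canonical):

```ruby
require 'bio-maf'

# Sequential access: the parser picks the BGZF reader based on the
# .maf.bgz extension (see the data/lib/bio/maf/parser.rb changes below).
parser = Bio::MAF::Parser.new("/tmp/mm8.chrM.maf.bgz")
parser.each_block do |block|
  # each block is one parsed MAF alignment block
  puts block.sequences.first.source
end

# Random access: reopen the index that maf_bgzip --index built.
# Its reference sequence and metadata are loaded from the index itself.
idx = Bio::MAF::KyotoIndex.open("/tmp/mm8.chrM.kct")
```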
data/bin/maf_bgzip
ADDED
@@ -0,0 +1,56 @@
```ruby
#!/usr/bin/env ruby

require 'optparse' # OptionParser is used below; this require is missing from the shipped script
require 'ostruct'

require 'bio-maf'
require 'bio-bgzf'

$options = OpenStruct.new
$options.dir = '.'
$options.ref_only = true

op = OptionParser.new do |opts|
  opts.banner = "Usage: maf_bgzip [options] [<maf> ...]"
  opts.separator ""
  opts.separator "Options:"
  opts.on("-d", "--dir DIR",
          "Directory to write compressed MAF to",
          "(default is current directory)") do |dir|
    $options.dir = dir
  end
  opts.on("-i", "--index", "Index MAF files after writing") do
    $options.index = true
  end
  opts.on("-a", "--all",
          "Index all sequences, not just reference seq",
          "(has no effect without --index)") do
    $options.ref_only = false
  end
end

op.parse!(ARGV)

until ARGV.empty?
  maf_path = ARGV.shift
  maf_base = File.basename(maf_path)
  base = maf_base.gsub(/\.maf.*/, '')
  bgz_path = "#{$options.dir}/#{base}.maf.bgz"
  p = Bio::MAF::Parser.new(maf_path,
                           :parse_extended => true,
                           :parse_empty => true)
  File.open(bgz_path, 'w') do |out_f|
    Bio::BGZF::Writer.new(out_f) do |bgz_w|
      maf_w = Bio::MAF::Writer.new(bgz_w)
      maf_w.write_header(p.header)
      p.each_block do |block|
        maf_w.write_block(block)
      end
    end
  end
  p.close
  if $options.index
    p2 = Bio::MAF::Parser.new(bgz_path)
    idx_path = "#{$options.dir}/#{base}.kct"
    Bio::MAF::KyotoIndex.build(p2, idx_path, $options.ref_only)
  end
end
```
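The script leans on `Bio::BGZF::Writer`'s block form to frame the compressed output. To spot-check what it wrote, the companion `Bio::BGZF::Reader` used elsewhere in this release can walk the file block by block; a minimal sketch, assuming only the three reader methods that appear in this diff (`tell`, `read_block`, `read_block_at`):

```ruby
require 'bio-bgzf'

File.open("mm8_chr7_tiny.maf.bgz") do |f|
  r = Bio::BGZF::Reader.new(f)
  vo = r.tell          # virtual offset of the next block
  data = r.read_block  # decompressed payload of one BGZF block
  puts "block at virtual offset #{vo}: #{data.bytesize} bytes"
end
```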
data/bin/maf_index
CHANGED
```diff
@@ -14,15 +14,9 @@ PRINTERS = {
 $options = OpenStruct.new
 $options.mode = :build
 $options.ref_only = true
-$options.reader = if RUBY_PLATFORM == 'java'
-                    Bio::MAF::ThreadedChunkReader
-                  else
-                    Bio::MAF::ChunkReader
-                  end
 
 def build_index(maf, index)
   parser = Bio::MAF::Parser.new(maf,
-                                :chunk_reader => $options.reader,
                                 :parse_extended => false)
   idx = Bio::MAF::KyotoIndex.build(parser, index, $options.ref_only)
   idx.close
```
data/bio-maf.gemspec
CHANGED
```diff
@@ -2,11 +2,11 @@
 
 Gem::Specification.new do |s|
   s.name = "bio-maf"
-  s.version = "0.3.2"
+  s.version = "1.0.0"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Clayton Wheeler"]
-  s.date = "2012-…"
+  s.date = "2012-08-02"
   s.description = "Multiple Alignment Format parser for BioRuby."
   s.email = "cswh@umich.edu"
   s.extra_rdoc_files = [
@@ -32,6 +32,7 @@ Gem::Specification.new do |s|
   end
 
   s.add_runtime_dependency('bio-alignment', ["~> 0.0.7"])
+  s.add_runtime_dependency('bio-bgzf', ["~> 0.2.0"])
   s.add_runtime_dependency('bio-genomic-interval', ["~> 0.1.2"])
   s.add_runtime_dependency('bio-logger', ["~> 1.0.1"])
   if RUBY_PLATFORM == 'java'
```
data/features/bgzf.feature
ADDED
@@ -0,0 +1,62 @@
```gherkin
Feature: BGZF compression
  Because MAF files are large
  We need random access
  But we would also like to compress them
  Yet common compression formats don't facilitate random access
  So we use BGZF compression to support random access
  To 64 KB chunks

  @no_jruby
  Scenario: Compress a MAF file
    Given test files:
      | mm8_chr7_tiny.maf |
    When I run `maf_bgzip mm8_chr7_tiny.maf`
    Then it should pass with:
      """
      """
    And a file named "mm8_chr7_tiny.maf.bgz" should exist

  @no_jruby
  Scenario: Compress and index a MAF file
    Given test files:
      | mm8_chr7_tiny.maf |
    When I run `maf_bgzip -i mm8_chr7_tiny.maf`
    Then it should pass with:
      """
      """
    And a file named "mm8_chr7_tiny.maf.bgz" should exist
    And a file named "mm8_chr7_tiny.kct" should exist

  @no_jruby
  Scenario: Compress a gzipped MAF file
    Given test files:
      | mm8_chr7_tiny.maf.gz |
    When I run `maf_bgzip mm8_chr7_tiny.maf.gz`
    Then it should pass with:
      """
      """
    And a file named "mm8_chr7_tiny.maf.bgz" should exist

  @no_jruby
  Scenario: Compress and index a gzipped MAF file
    Given test files:
      | mm8_chr7_tiny.maf.gz |
    When I run `maf_bgzip -i mm8_chr7_tiny.maf.gz`
    Then it should pass with:
      """
      """
    And a file named "mm8_chr7_tiny.maf.bgz" should exist
    And a file named "mm8_chr7_tiny.kct" should exist

  @no_jruby
  Scenario: Compress multiple MAF files
    Given test files:
      | mm8_chr7_tiny.maf |
      | mm8.chrM.maf |
    When I run `maf_bgzip mm8_chr7_tiny.maf mm8.chrM.maf`
    Then it should pass with:
      """
      """
    And a file named "mm8_chr7_tiny.maf.bgz" should exist
    And a file named "mm8.chrM.maf.bgz" should exist
```
data/features/maf-indexing.feature
CHANGED
```diff
@@ -49,6 +49,16 @@ Feature: Indexed access to MAF files
       """
     And a file named "mm8_chr7_tiny.kct" should exist
 
+  @no_jruby
+  Scenario: Build MAF index on BGZF file with CLI tool
+    Given test files:
+      | mm8.chrM.maf.bgz |
+    When I run `maf_index mm8.chrM.maf.bgz mm8.chrM.kct`
+    Then it should pass with:
+      """
+      """
+    And a file named "mm8.chrM.kct" should exist
+
   @no_jruby
   Scenario: Build MAF index on all sequences with CLI tool
     Given test files:
```
data/features/maf-querying.feature
CHANGED
```diff
@@ -73,3 +73,12 @@ Feature: Filter results from MAF files
     And search for blocks between positions 0 and 80100000 of mm8.chr7
     Then 3 blocks are obtained
 
+  @no_jruby
+  Scenario: Parse blocks from a BGZF-compressed file
+    Given test files:
+      | mm8.chrM.maf |
+      | mm8.chrM.maf.bgz |
+    When I run `maf_extract -m mm8.chrM.maf --interval mm8.chrM:6938-13030 -o m1.maf`
+    And I run `maf_extract -m mm8.chrM.maf.bgz --interval mm8.chrM:6938-13030 -o m2.maf`
+    And I run `diff m1.maf m2.maf`
+    Then the exit status should be 0
```
data/features/support/env.rb
CHANGED
data/lib/bio/maf/index.rb
CHANGED
```diff
@@ -1,5 +1,6 @@
 require 'kyotocabinet'
 require 'jruby/profiler' if RUBY_PLATFORM == 'java'
+require 'bio-bgzf'
 
 #require 'bio-ucsc-api'
 require 'bio-genomic-interval'
@@ -189,18 +190,20 @@ module Bio
       @indices = {}
       @maf_by_chrom = {}
       if options[:dir]
-        …
-        @maf_files = Dir.glob("#{@dir}/*.maf")
+        scan_dir(options[:dir])
       elsif options[:maf]
-        @maf_files = [options[:maf]]
         if options[:index]
           register_index(KyotoIndex.open(options[:index]),
                          options[:maf])
+        else
+          idx = find_index_file(options[:maf])
+          if idx
+            register_index(KyotoIndex.open(idx), options[:maf])
+          end
         end
       else
         raise "Must specify :dir or :maf!"
       end
-      scan_indices!
       if options[:maf] && @indices.empty?
         # MAF file explicitly given but no index
         # build a temporary one
@@ -215,23 +218,27 @@ module Bio
 
     # @api private
     def find_index_file(maf)
-      …
-      …
-      …
+      dir = File.dirname(maf)
+      base = File.basename(maf)
+      noext = base.gsub(/\.maf.*/, '')
+      idx = [base, noext].collect { |n| "#{dir}/#{n}.kct" }.find { |path| File.exist? path }
     end
 
     # @api private
     def register_index(index, maf)
+      unless index.maf_file == File.basename(maf)
+        raise "Index #{index.path} was created for #{index.maf_file}, not #{File.basename(maf)}!"
+      end
       @indices[index.ref_seq] = index
       @maf_by_chrom[index.ref_seq] = maf
     end
 
     # @api private
-    def …
-      …
-      …
-      …
-      …
+    def scan_dir(dir)
+      Dir.glob("#{dir}/*.kct").each do |index_f|
+        index = KyotoIndex.open(index_f)
+        maf = "#{dir}/#{index.maf_file}"
+        if File.exist? maf
           register_index(index, maf)
         end
       end
```
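The new `find_index_file` encodes a simple naming convention: for `foo.maf.bgz` it tries `foo.maf.bgz.kct` first, then `foo.kct`. A standalone sketch of that lookup (paths hypothetical, logic copied from the hunk above):

```ruby
def candidate_indexes(maf)
  dir = File.dirname(maf)
  base = File.basename(maf)         # e.g. "mm8.chrM.maf.bgz"
  noext = base.gsub(/\.maf.*/, '')  # e.g. "mm8.chrM"
  # Checked in order; the first existing path wins.
  [base, noext].collect { |n| "#{dir}/#{n}.kct" }
end

candidate_indexes("/data/mm8.chrM.maf.bgz")
# => ["/data/mm8.chrM.maf.bgz.kct", "/data/mm8.chrM.kct"]
```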
```diff
@@ -262,9 +269,12 @@ module Bio
     class KyotoIndex
       include KVHelpers
 
-      attr_reader :db, :species, :species_max_id, :ref_only
+      attr_reader :db, :species, :species_max_id, :ref_only, :path
+      attr_reader :maf_file
       attr_accessor :index_sequences, :ref_seq
 
+      COMPRESSION_KEY = 'bio-maf:compression'
+      FILE_KEY = 'bio-maf:file'
       FORMAT_VERSION_KEY = 'bio-maf:index-format-version'
       FORMAT_VERSION = 2
       REF_SEQ_KEY = 'bio-maf:reference-sequence'
@@ -438,6 +448,7 @@ module Bio
           raise "Could not open DB file!"
         end
         if mode == KyotoCabinet::DB::OREADER
+          @maf_file = db[FILE_KEY]
           self.ref_seq = db[REF_SEQ_KEY]
           load_index_sequences
           load_species
@@ -450,6 +461,7 @@ module Bio
       end
 
       def dump(stream=$stdout)
+        bgzf = (db[COMPRESSION_KEY] == 'bgzf')
         stream.puts "KyotoIndex dump: #{@path}"
         stream.puts
         if db.count == 0
@@ -474,6 +486,11 @@ module Bio
           offset, len, text_size, n_seq, species_vec = pair[1].unpack(VAL_FMT)
           stream.puts "#{chr} [bin #{bin}] #{s_start}:#{s_end}"
           stream.puts "  offset #{offset}, length #{len}"
+          if bgzf
+            block = Bio::BGZF.vo_block_offset(offset)
+            data = Bio::BGZF.vo_data_offset(offset)
+            stream.puts "  BGZF block offset #{block}, data offset #{data}"
+          end
           stream.puts "  text size: #{text_size}"
           stream.puts "  sequences in block: #{n_seq}"
           stream.printf("  species vector: %016x\n", species_vec)
@@ -660,6 +677,11 @@ module Bio
       end
 
       def build(parser, ref_only=true)
+        db[FILE_KEY] = File.basename(parser.file_spec)
+        @maf_file = db[FILE_KEY]
+        if parser.compression
+          db[COMPRESSION_KEY] = parser.compression.to_s
+        end
         first_block = parser.parse_block
         self.ref_seq = first_block.sequences.first.source
         @ref_only = ref_only
```
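With `build` now recording the source file name and compression method under the `bio-maf:file` and `bio-maf:compression` keys, an index can be inspected without going through the library; a hedged sketch using the kyotocabinet gem's own API (the same calls `KyotoIndex` makes above):

```ruby
require 'kyotocabinet'

db = KyotoCabinet::DB.new
db.open("mm8.chrM.kct", KyotoCabinet::DB::OREADER)
puts db['bio-maf:file']         # e.g. "mm8.chrM.maf.bgz"
puts db['bio-maf:compression']  # "bgzf", or nil for uncompressed MAF
db.close
```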
data/lib/bio/maf/maf.rb
CHANGED
```diff
@@ -103,8 +103,9 @@ module Bio
 
       GAP = /-+/
 
-      # …
+      # Find gaps present in all sequences. These would generally
       # occur when some sequences have been filtered out.
+      #
       # @see #remove_gaps!
       # @see Parser#sequence_filter
       def find_gaps
@@ -126,6 +127,7 @@ module Bio
 
       # Remove gaps present in all sequences. These would generally
       # occur when some sequences have been filtered out.
+      #
       # @see #find_gaps
       # @see Parser#sequence_filter
       def remove_gaps!
```
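The clarified comments explain when all-gap columns arise: a sequence filter can remove the only sequences occupying some columns. A usage sketch tying the methods to the filter (`remove_gaps!` is from this diff; the `:only_species` filter syntax follows the project README and should be checked against it):

```ruby
require 'bio-maf'

parser = Bio::MAF::Parser.new("test/data/mm8_chr7_tiny.maf")
parser.sequence_filter = { :only_species => %w(mm8 rn4) }
parser.each_block do |block|
  # drop columns that are now gaps in every remaining sequence
  block.remove_gaps!
end
```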
data/lib/bio/maf/parser.rb
CHANGED
```diff
@@ -1,5 +1,6 @@
 require 'strscan'
 require 'java' if RUBY_PLATFORM == 'java'
+require 'bio-bgzf'
 
 # @api public
 module Bio
@@ -56,8 +57,7 @@ module Bio
 
       # Reads a chunk of the file.
       #
-      # Currently always reads size_hint bytes
-      # with BGZF support.
+      # Currently always reads size_hint bytes.
       #
       # @param [Integer] offset file offset to read from.
       # @param [Integer] size_hint desired size of chunk.
```
```diff
@@ -70,58 +70,97 @@ module Bio
       end
     end
 
-    # …
-    # …
-    # parsing.
-    #
-    # Only beneficial on JRuby.
-    class ThreadedChunkReader < ChunkReader
+    class BGZFChunkReader
+      attr_reader :f, :r
 
-      def initialize(f, …)
-        …
-        @…
-        @eof_reached = false
-        start_read_ahead
+      def initialize(f, _chunk_size)
+        @f = f
+        @r = Bio::BGZF::Reader.new(f)
       end
 
+      def pos
+        r.tell
+      end
+
+      def read_chunk
+        r.read_block
+      end
+
+      def read_chunk_at(vo, _size)
+        r.read_block_at(vo)
+      end
+    end
+
+    class ThreadedChunkReaderWrapper
+
+      attr_reader :cr, :pos
+
+      def initialize(cr, buffer_size=64)
+        @cr = cr
+        @buffer = java.util.concurrent.LinkedBlockingQueue.new(buffer_size)
+        @eof_reached = false
+        @first_seq_read = false
+      end
+
       # Spawn a read-ahead thread. Called from {#initialize}.
       def start_read_ahead
         @read_thread = Thread.new { read_ahead }
       end
 
+      def f
+        cr.f
+      end
+
       # Read ahead into queue.
       def read_ahead
         # n = 0
         begin
-          f_pos = 0
           until f.eof?
-            chunk = …
-            …
-            …
-            # n += 1
-            # if (n % 100) == 0
-            #   $stderr.puts "buffer size: #{@buffer.size}"
-            # end
+            chunk = cr.read_chunk
+            c_pos = cr.pos
+            @buffer.put([c_pos, chunk])
           end
-          @…
+          @buffer.put(:eof)
+          # @eof_reached = true
         rescue Exception
           @read_ahead_ex = $!
-          …
+          LOG.error $!
+          @buffer.put($!)
         end
       end
 
-      # (see ChunkReader#read_chunk)
       def read_chunk
-        …
-        …
+        if ! @first_seq_read
+          # this is the first read_chunk call to read the header
+          # not necessarily indicative of sequential access
+          @first_seq_read = true
+          chunk = cr.read_chunk
+          @pos = cr.pos
+          return chunk
+        elsif @read_ahead_ex
+          raise @read_ahead_ex
+        elsif @eof_reached
           return nil
         else
-          …
-          …
-          …
+          start_read_ahead if @read_thread.nil?
+          e = @buffer.take
+          case
+          when e == :eof
+            @eof_reached = nil
+            return nil
+          when e.is_a?(Exception)
+            raise e
+          else
+            c_pos, chunk = e
+            @pos = c_pos
+            return chunk
+          end
         end
       end
 
+      def read_chunk_at(*args)
+        cr.read_chunk_at(*args)
+      end
     end
 
     # MAF parsing code useful for sequential and random-access parsing.
```
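`BGZFChunkReader#read_chunk_at` takes a BGZF *virtual offset*, not a raw file offset. Per the BGZF definition in the SAM spec, a virtual offset packs the compressed block's file offset into the high 48 bits and the position within the decompressed block into the low 16; a self-contained sketch (the real code uses bio-bgzf's `vo_block_offset`/`vo_data_offset` helpers instead):

```ruby
# Pack a (block offset, in-block offset) pair into one 64-bit value.
def make_vo(block_offset, data_offset)
  (block_offset << 16) | data_offset
end

# Recover the two components of a virtual offset.
def split_vo(vo)
  [vo >> 16, vo & 0xFFFF]
end

vo = make_vo(6938, 130)
split_vo(vo)  # => [6938, 130]
```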
```diff
@@ -385,8 +424,7 @@ module Bio
         @f = fd
         @parser = parser
         @opts = parser.opts
-        …
-        @cr = reader.new(@f, chunk_size)
+        @cr = parser.base_reader.new(@f, chunk_size)
         @last_block_pos = -1
       end
 
@@ -413,6 +451,7 @@ module Bio
       # @return [Array<Block>]
       def fetch_blocks(offset, len, block_offsets)
         if block_given?
+          LOG.debug { "fetching blocks from #{offset} (length #{len}): #{block_offsets.inspect}" }
           start_chunk_read_if_needed(offset, len)
           # read chunks until we have the entire merged set of
           # blocks ready to parse
@@ -420,6 +459,13 @@ module Bio
           append_chunks_to(len)
           # parse the blocks
           block_offsets.each do |expected_offset|
+            # skip ahead, in case there is a gap resulting from a
+            # block that is not being parsed
+            rel_offset = expected_offset - offset
+            if s.pos < rel_offset
+              s.pos = rel_offset
+            end
+            # now actually parse the block data
             block = _parse_block
             parse_error("expected a block at offset #{expected_offset} but could not parse one!") unless block
             parse_error("got block with offset #{block.offset}, expected #{expected_offset}!") unless block.offset == expected_offset
```
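The skip-ahead added to `fetch_blocks` is plain offset arithmetic within the merged read; a worked example with made-up numbers:

```ruby
offset          = 1000  # file offset where the merged read began
expected_offset = 1450  # next block we actually want to parse
rel_offset = expected_offset - offset   # => 450, position within the buffer

s_pos = 300             # scanner stopped before a 150-byte unwanted block
s_pos = rel_offset if s_pos < rel_offset
s_pos                   # => 450; parsing resumes at the wanted block
```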
```diff
@@ -444,7 +490,6 @@ module Bio
       end
 
       def append_chunks_to(len)
-        # XXX: need to rethink this for BGZF; prefetching ChunkReader
         while s.string.size < len
           s.string << cr.read_chunk()
         end
@@ -463,8 +508,6 @@ module Bio
     # * `:random_chunk_size`: as above, but for random access ({#fetch_blocks})
     # * `:merge_max`: merge up to this many bytes of blocks for
     #   random access
-    # * `:chunk_reader`: use the specified class to read
-    #   chunks. (Only useful with {ThreadedChunkReader}).
     # * `:threads`: number of threads to use for parallel
     #   parsing. Only useful under JRuby.
     # @api public
@@ -482,6 +525,9 @@ module Bio
       attr_reader :s
       # @return [ChunkReader] ChunkReader.
       attr_reader :cr
+      # @return [Class] ChunkReader class to use for random access
+      # @see ParseContext
+      attr_reader :base_reader
       # @return [Boolean] whether EOF has been reached.
       attr_reader :at_end
       # @return [Hash] parser options.
@@ -490,6 +536,8 @@ module Bio
       attr_reader :chunk_start
       # @return [Integer] offset of the last block start in this chunk.
       attr_reader :last_block_pos
+      # @return [Symbol] compression method used for this file, or nil
+      attr_reader :compression
 
       # @api private
       attr_accessor :parse_extended
```
```diff
@@ -515,10 +563,29 @@ module Bio
         @parse_extended = opts[:parse_extended] || false
         @parse_empty = opts[:parse_empty] || false
         @chunk_start = 0
-        …
-        …
-        …
-        …
+        if file_spec.respond_to? :flush
+          # guess what, Pathnames respond to :read...
+          @f = file_spec
+          @file_spec = @f.path if @f.respond_to?(:path)
+          # TODO: gzip?
+        else
+          @file_spec = file_spec
+          if file_spec.to_s.end_with?(".maf.gz")
+            @f = IO.popen("gzip -dc #{file_spec}")
+          else
+            @f = File.open(file_spec)
+          end
+        end
+        if @file_spec.to_s =~ /\.bgzf?$/
+          @base_reader = BGZFChunkReader
+          @compression = :bgzf
+        else
+          @base_reader = ChunkReader
+        end
+        @cr = base_reader.new(@f, chunk_size)
+        if RUBY_PLATFORM == 'java'
+          @cr = ThreadedChunkReaderWrapper.new(@cr)
+        end
         @s = StringScanner.new(cr.read_chunk())
         set_last_block_pos!
         @at_end = false
```
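The rewritten constructor now dispatches on the file name alone: `.maf.gz` is piped through `gzip -dc` (sequential access only), a `.bgz`/`.bgzf` suffix selects `BGZFChunkReader` and sets `compression` to `:bgzf`, and anything else gets the plain `ChunkReader`, with `ThreadedChunkReaderWrapper` layered on under JRuby. A usage sketch (file names hypothetical):

```ruby
require 'bio-maf'

plain = Bio::MAF::Parser.new("chr7.maf")      # plain ChunkReader
gz    = Bio::MAF::Parser.new("chr7.maf.gz")   # gzip -dc pipe; no random access
bgz   = Bio::MAF::Parser.new("chr7.maf.bgz")  # BGZFChunkReader

bgz.compression  # => :bgzf
gz.compression   # => nil
```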
```diff
@@ -536,7 +603,11 @@ module Bio
       # @api private
       def context(chunk_size)
         # IO#dup calls dup(2) internally, but seems broken on JRuby...
-        …
+        if file_spec
+          fd = File.open(file_spec)
+        else
+          fd = f.dup
+        end
         ParseContext.new(fd, chunk_size, self)
       end
 
@@ -679,6 +750,15 @@ module Bio
       #
       # Returns `[offset, size, [offset1, offset2, ...]]` tuples.
       def merge_fetch_list(orig_fl)
+        case compression
+        when nil
+          _merge_fetch_list(orig_fl)
+        when :bgzf
+          _merge_bgzf_fetch_list(orig_fl)
+        end
+      end
+
+      def _merge_fetch_list(orig_fl)
         fl = orig_fl.dup
         r = []
         until fl.empty? do
@@ -698,6 +778,22 @@ module Bio
         return r
       end
 
+      # Build a merged fetch list in a BGZF-aware way. This will
+      # group together all MAF blocks from a single BGZF block. These
+      # MAF blocks may not be consecutive.
+      def _merge_bgzf_fetch_list(orig_fl)
+        block_e = orig_fl.chunk { |entry|
+          Bio::BGZF::vo_block_offset(entry[0])
+        }
+        block_e.collect do |bgzf_block, fl|
+          # text size to read from disk, from the start of the first
+          # block to the end of the last block
+          text_size = fl.last[0] + fl.last[1] - fl.first[0]
+          offsets = fl.collect { |e| e[0] }
+          [fl.first[0], text_size, offsets]
+        end
+      end
+
       # Parse the header of the MAF file.
       def _parse_header
         parse_error("not a MAF file") unless s.scan(/##maf\s*/)
```
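`_merge_bgzf_fetch_list` relies on `Enumerable#chunk`, which groups *consecutive* elements sharing a key, so a fetch list ordered by virtual offset collapses to one read per BGZF block. A toy run with hand-packed virtual offsets (`<< 16` mirrors the packing sketched earlier):

```ruby
# [virtual_offset, length] pairs: two MAF blocks in BGZF block 1,
# one in BGZF block 5.
fl = [[(1 << 16) | 0,   100],
      [(1 << 16) | 200,  80],
      [(5 << 16) | 0,   120]]

merged = fl.chunk { |off, _| off >> 16 }.collect do |bgzf_block, group|
  text_size = group.last[0] + group.last[1] - group.first[0]
  [group.first[0], text_size, group.collect(&:first)]
end
merged
# => [[65536, 280, [65536, 65736]], [327680, 120, [327680]]]
```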