RubyGems - bio-maf - Versions diffs - 0.3.2-java → 1.0.0-java - Mend

bio-maf 0.3.2-java → 1.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33)

data/Gemfile +3 -0
data/README.md +34 -3
data/bin/maf_bgzip +56 -0
data/bin/maf_index +0 -6
data/bio-maf.gemspec +3 -2
data/features/bgzf.feature +62 -0
data/features/maf-indexing.feature +10 -0
data/features/maf-querying.feature +9 -0
data/features/step_definitions/convert_steps.rb +1 -1
data/features/support/env.rb +2 -0
data/lib/bio/maf/index.rb +35 -13
data/lib/bio/maf/maf.rb +3 -1
data/lib/bio/maf/parser.rb +135 -39
data/lib/bio/maf/writer.rb +4 -4
data/man/maf_bgzip.1 +101 -0
data/man/maf_bgzip.1.ronn +85 -0
data/man/maf_extract.1 +5 -2
data/man/maf_extract.1.ronn +4 -1
data/man/maf_index.1 +3 -3
data/man/maf_index.1.ronn +3 -2
data/man/maf_tile.1 +5 -2
data/man/maf_tile.1.ronn +4 -1
data/man/maf_to_fasta.1 +1 -1
data/spec/bio/maf/index_spec.rb +7 -0
data/spec/bio/maf/parser_spec.rb +13 -0
data/spec/spec_helper.rb +2 -0
data/test/data/gap-1.kct +0 -0
data/test/data/mm8.chrM.maf +2421 -0
data/test/data/mm8.chrM.maf.bgz +0 -0
data/test/data/mm8_chr7_tiny.kct +0 -0
data/test/data/mm8_chr7_tiny.maf.gz +0 -0
data/test/data/mm8_chrM_tiny.kct +0 -0
metadata +30 -2

data/Gemfile CHANGED Viewed

@@ -3,6 +3,9 @@ source "http://rubygems.org"
 gemspec
+## for local development
+#gem "bio-bgzf", :path => "../bioruby-bgzf"
 # Add dependencies to develop your gem here.
 # Include everything needed to run rake, tests, features, etc.
 group :development do

data/README.md CHANGED Viewed

@@ -8,9 +8,9 @@ support for the
 (MAF), used in bioinformatics to store whole-genome sets of multiple
 sequence alignments.
-Ultimately it will provide indexed and sequential access to MAF data,
-as well as performing various manipulations on it and writing modified
-MAF files. So far, it only supports simple sequential parsing.
+This library provides indexed and sequential access to MAF data, as
+well as performing various manipulations on it and writing modified
+MAF files.
 For more information, see the
 [project wiki](https://github.com/csw/bioruby-maf/wiki).
@@ -94,6 +94,20 @@ parser = Bio::MAF::Parser.new("test/data/mm8_chr7_tiny.maf")
 idx = Bio::MAF::KyotoIndex.build(parser, "/tmp/mm8_chr7_tiny.kct", false)
 ```
+### Compress and index a MAF file
+This library fully supports [BGZF][]-compressed MAF files, which
+combine gzip compression with blocking for efficient random
+access. These can be generated with blocking optimized for MAF access
+using the included
+[`maf_bgzip(1)`](http://csw.github.com/bioruby-maf/man/maf_bgzip.1.html)
+tool. This writes BGZF-compressed MAF files and optionally indexes
+them as well:
+    $ maf_bgzip --dir /tmp --index --all test/data/mm8.chrM.maf
+This is the easiest way to prepare MAF files for use with this library.
 ### Extract blocks from an indexed MAF file, by genomic interval
 Refer to [`mm8_chr7_tiny.maf`](https://github.com/csw/bioruby-maf/blob/master/test/data/mm8_chr7_tiny.maf).
@@ -352,10 +366,27 @@ access.tile(interval) do |tiler|
 end
 ```
+### Compression
+MAF files can optionally be compressed in the [BGZF][] format defined
+in the [SAM specification][]. This is best done with
+[`maf_bgzip(1)`](http://csw.github.com/bioruby-maf/man/maf_bgzip.1.html),
+but files compressed with the `bgzip(1)` tool from samtools will also
+work, though less efficiently.
+[BGZF]: http://blastedbio.blogspot.com/2011/11/bgzf-blocked-bigger-better-gzip.html
+[SAM specification]: http://samtools.sourceforge.net/SAM1.pdf
+MAF files compressed with plain gzip will be decompressed on the fly,
+but random access to these files will not be possible. However,
+gzipped MAF files are suitable as input to
+[`maf_bgzip(1)`](http://csw.github.com/bioruby-maf/man/maf_bgzip.1.html).
 ### Command line tools
 Man pages for command line tools:
+* [`maf_bgzip(1)`](http://csw.github.com/bioruby-maf/man/maf_bgzip.1.html)
 * [`maf_index(1)`](http://csw.github.com/bioruby-maf/man/maf_index.1.html)
 * [`maf_extract(1)`](http://csw.github.com/bioruby-maf/man/maf_extract.1.html)
 * [`maf_to_fasta(1)`](http://csw.github.com/bioruby-maf/man/maf_to_fasta.1.html)

data/bin/maf_bgzip ADDED Viewed

@@ -0,0 +1,56 @@
+#!/usr/bin/env ruby
+require 'ostruct'
+require 'bio-maf'
+require 'bio-bgzf'
+$options = OpenStruct.new
+$options.dir = '.'
+$options.ref_only = true
+op = OptionParser.new do |opts|
+  opts.banner = "Usage: maf_bgzip [options] [<maf> ...]"
+  opts.separator ""
+  opts.separator "Options:"
+  opts.on("-d", "--dir DIR",
+          "Directory to write compressed MAF to",
+          "(default is current directory)") do |dir|
+    $options.dir = dir
+  end
+  opts.on("-i", "--index", "Index MAF files after writing") do
+    $options.index = true
+  end
+  opts.on("-a", "--all",
+          "Index all sequences, not just reference seq",
+          "(has no effect without --index)") do
+    $options.ref_only = false
+  end
+end
+op.parse!(ARGV)
+until ARGV.empty?
+  maf_path = ARGV.shift
+  maf_base = File.basename(maf_path)
+  base = maf_base.gsub(/\.maf.*/, '')
+  bgz_path = "#{$options.dir}/#{base}.maf.bgz"
+  p = Bio::MAF::Parser.new(maf_path,
+                           :parse_extended => true,
+                           :parse_empty => true)
+  File.open(bgz_path, 'w') do |out_f|
+    Bio::BGZF::Writer.new(out_f) do |bgz_w|
+      maf_w = Bio::MAF::Writer.new(bgz_w)
+      maf_w.write_header(p.header)
+      p.each_block do |block|
+        maf_w.write_block(block)
+      end
+    end
+  end
+  p.close
+  if $options.index
+    p2 = Bio::MAF::Parser.new(bgz_path)
+    idx_path = "#{$options.dir}/#{base}.kct"
+    Bio::MAF::KyotoIndex.build(p2, idx_path, $options.ref_only)
+  end
+end

data/bin/maf_index CHANGED Viewed

@@ -14,15 +14,9 @@ PRINTERS = {
 $options = OpenStruct.new
 $options.mode = :build
 $options.ref_only = true
-$options.reader = if RUBY_PLATFORM == 'java'
-                    Bio::MAF::ThreadedChunkReader
-                  else
-                    Bio::MAF::ChunkReader
-                  end
 def build_index(maf, index)
   parser = Bio::MAF::Parser.new(maf,
-                                :chunk_reader => $options.reader,
                                 :parse_extended => false)
   idx = Bio::MAF::KyotoIndex.build(parser, index, $options.ref_only)
   idx.close

data/bio-maf.gemspec CHANGED Viewed

@@ -2,11 +2,11 @@
 Gem::Specification.new do |s|
   s.name = "bio-maf"
-  s.version = "0.3.2"
+  s.version = "1.0.0"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Clayton Wheeler"]
-  s.date = "2012-07-26"
+  s.date = "2012-08-02"
   s.description = "Multiple Alignment Format parser for BioRuby."
   s.email = "cswh@umich.edu"
   s.extra_rdoc_files = [
@@ -32,6 +32,7 @@ Gem::Specification.new do |s|
   end
   s.add_runtime_dependency('bio-alignment', ["~> 0.0.7"])
+  s.add_runtime_dependency('bio-bgzf', ["~> 0.2.0"])
   s.add_runtime_dependency('bio-genomic-interval', ["~> 0.1.2"])
   s.add_runtime_dependency('bio-logger', ["~> 1.0.1"])
   if RUBY_PLATFORM == 'java'

data/features/bgzf.feature ADDED Viewed

@@ -0,0 +1,62 @@
+Feature: BGZF compression
+  Because MAF files are large
+  We need random access
+  But we would also like to compress them
+  Yet common compression formats don't facilitate random access
+  So we use BGZF compression to support random access
+  To 64 KB chunks
+  @no_jruby
+  Scenario: Compress a MAF file
+    Given test files:
+    | mm8_chr7_tiny.maf |
+    When I run `maf_bgzip mm8_chr7_tiny.maf`
+    Then it should pass with:
+    """
+    """
+    And a file named "mm8_chr7_tiny.maf.bgz" should exist
+  @no_jruby
+  Scenario: Compress and index a MAF file
+    Given test files:
+    | mm8_chr7_tiny.maf |
+    When I run `maf_bgzip -i mm8_chr7_tiny.maf`
+    Then it should pass with:
+    """
+    """
+    And a file named "mm8_chr7_tiny.maf.bgz" should exist
+    And a file named "mm8_chr7_tiny.kct" should exist
+  @no_jruby
+  Scenario: Compress a gzipped MAF file
+    Given test files:
+    | mm8_chr7_tiny.maf.gz |
+    When I run `maf_bgzip mm8_chr7_tiny.maf.gz`
+    Then it should pass with:
+    """
+    """
+    And a file named "mm8_chr7_tiny.maf.bgz" should exist
+  @no_jruby
+  Scenario: Compress and index a gzipped MAF file
+    Given test files:
+    | mm8_chr7_tiny.maf.gz |
+    When I run `maf_bgzip -i mm8_chr7_tiny.maf.gz`
+    Then it should pass with:
+    """
+    """
+    And a file named "mm8_chr7_tiny.maf.bgz" should exist
+    And a file named "mm8_chr7_tiny.kct" should exist
+  @no_jruby
+  Scenario: Compress multiple MAF files
+    Given test files:
+    | mm8_chr7_tiny.maf |
+    | mm8.chrM.maf      |
+    When I run `maf_bgzip mm8_chr7_tiny.maf mm8.chrM.maf`
+    Then it should pass with:
+    """
+    """
+    And a file named "mm8_chr7_tiny.maf.bgz" should exist
+    And a file named "mm8.chrM.maf.bgz" should exist

data/features/maf-indexing.feature CHANGED Viewed

@@ -49,6 +49,16 @@ Feature: Indexed access to MAF files
     """
     And a file named "mm8_chr7_tiny.kct" should exist
+  @no_jruby
+  Scenario: Build MAF index on BGZF file with CLI tool
+    Given test files:
+    | mm8.chrM.maf.bgz |
+    When I run `maf_index mm8.chrM.maf.bgz mm8.chrM.kct`
+    Then it should pass with:
+    """
+    """
+    And a file named "mm8.chrM.kct" should exist
   @no_jruby
   Scenario: Build MAF index on all sequences with CLI tool
     Given test files:

data/features/maf-querying.feature CHANGED Viewed

@@ -73,3 +73,12 @@ Feature: Filter results from MAF files
     And search for blocks between positions 0 and 80100000 of mm8.chr7
     Then 3 blocks are obtained
+  @no_jruby
+  Scenario: Parse blocks from a BGZF-compressed file
+    Given test files:
+    | mm8.chrM.maf     |
+    | mm8.chrM.maf.bgz |
+    When I run `maf_extract -m mm8.chrM.maf --interval mm8.chrM:6938-13030 -o m1.maf`
+    And I run `maf_extract -m mm8.chrM.maf.bgz --interval mm8.chrM:6938-13030 -o m2.maf`
+    And I run `diff m1.maf m2.maf`
+    Then the exit status should be 0

data/features/step_definitions/convert_steps.rb CHANGED Viewed

@@ -6,7 +6,7 @@ end
 Given /^MAF data:$/ do |string|
   @src_f = Tempfile.new(['rspec', '.maf'])
   @src_f.write(string)
-  @src_f.close
+  @src_f.rewind
 end
 When /^I select FASTA output$/ do

data/features/support/env.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+require 'bundler/setup'
 unless ENV.has_key?('TRAVIS') || RUBY_PLATFORM == 'java'
   begin
     require 'simplecov'

data/lib/bio/maf/index.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 require 'kyotocabinet'
 require 'jruby/profiler' if RUBY_PLATFORM == 'java'
+require 'bio-bgzf'
 #require 'bio-ucsc-api'
 require 'bio-genomic-interval'
@@ -189,18 +190,20 @@ module Bio
         @indices = {}
         @maf_by_chrom = {}
         if options[:dir]
-          @dir = options[:dir]
-          @maf_files = Dir.glob("#{@dir}/*.maf")
+          scan_dir(options[:dir])
         elsif options[:maf]
-          @maf_files = [options[:maf]]
           if options[:index]
             register_index(KyotoIndex.open(options[:index]),
                            options[:maf])
+          else
+            idx = find_index_file(options[:maf])
+            if idx
+              register_index(KyotoIndex.open(idx), options[:maf])
+            end
           end
         else
           raise "Must specify :dir or :maf!"
         end
-        scan_indices!
         if options[:maf] && @indices.empty?
           # MAF file explicitly given but no index
           # build a temporary one
@@ -215,23 +218,27 @@ module Bio
       # @api private
       def find_index_file(maf)
-        base = File.basename(maf, '.maf')
-        index_f = "#{@dir}/#{base}.kct"
-        File.exists?(index_f) ? index_f : nil
+        dir = File.dirname(maf)
+        base = File.basename(maf)
+        noext = base.gsub(/\.maf.*/, '')
+        idx = [base, noext].collect { |n| "#{dir}/#{n}.kct" }.find { |path| File.exist? path }
       end
       # @api private
       def register_index(index, maf)
+        unless index.maf_file == File.basename(maf)
+          raise "Index #{index.path} was created for #{index.maf_file}, not #{File.basename(maf)}!"
+        end
         @indices[index.ref_seq] = index
         @maf_by_chrom[index.ref_seq] = maf
       end
       # @api private
-      def scan_indices!
-        @maf_files.each do |maf|
-          index_f = find_index_file(maf)
-          if index_f
-            index = KyotoIndex.open(index_f)
+      def scan_dir(dir)
+        Dir.glob("#{dir}/*.kct").each do |index_f|
+          index = KyotoIndex.open(index_f)
+          maf = "#{dir}/#{index.maf_file}"
+          if File.exist? maf
             register_index(index, maf)
           end
         end
@@ -262,9 +269,12 @@ module Bio
     class KyotoIndex
       include KVHelpers
-      attr_reader :db, :species, :species_max_id, :ref_only
+      attr_reader :db, :species, :species_max_id, :ref_only, :path
+      attr_reader :maf_file
       attr_accessor :index_sequences, :ref_seq
+      COMPRESSION_KEY = 'bio-maf:compression'
+      FILE_KEY = 'bio-maf:file'
       FORMAT_VERSION_KEY = 'bio-maf:index-format-version'
       FORMAT_VERSION = 2
       REF_SEQ_KEY = 'bio-maf:reference-sequence'
@@ -438,6 +448,7 @@ module Bio
           raise "Could not open DB file!"
         end
         if mode == KyotoCabinet::DB::OREADER
+          @maf_file = db[FILE_KEY]
           self.ref_seq = db[REF_SEQ_KEY]
           load_index_sequences
           load_species
@@ -450,6 +461,7 @@ module Bio
       end
       def dump(stream=$stdout)
+        bgzf = (db[COMPRESSION_KEY] == 'bgzf')
         stream.puts "KyotoIndex dump: #{@path}"
         stream.puts
         if db.count == 0
@@ -474,6 +486,11 @@ module Bio
             offset, len, text_size, n_seq, species_vec = pair[1].unpack(VAL_FMT)
             stream.puts "#{chr} [bin #{bin}] #{s_start}:#{s_end}"
             stream.puts "  offset #{offset}, length #{len}"
+            if bgzf
+              block = Bio::BGZF.vo_block_offset(offset)
+              data = Bio::BGZF.vo_data_offset(offset)
+              stream.puts "  BGZF block offset #{block}, data offset #{data}"
+            end
             stream.puts "  text size: #{text_size}"
             stream.puts "  sequences in block: #{n_seq}"
             stream.printf("  species vector: %016x\n", species_vec)
@@ -660,6 +677,11 @@ module Bio
       end
       def build(parser, ref_only=true)
+        db[FILE_KEY] = File.basename(parser.file_spec)
+        @maf_file = db[FILE_KEY]
+        if parser.compression
+          db[COMPRESSION_KEY] = parser.compression.to_s
+        end
         first_block = parser.parse_block
         self.ref_seq = first_block.sequences.first.source
         @ref_only = ref_only

data/lib/bio/maf/maf.rb CHANGED Viewed

@@ -103,8 +103,9 @@ module Bio
       GAP = /-+/
-      # Remove gaps present in all sequences. These would generally
+      # Find gaps present in all sequences. These would generally
       # occur when some sequences have been filtered out.
+      #
       # @see #remove_gaps!
       # @see Parser#sequence_filter
       def find_gaps
@@ -126,6 +127,7 @@ module Bio
       # Remove gaps present in all sequences. These would generally
       # occur when some sequences have been filtered out.
+      #
       # @see #find_gaps
       # @see Parser#sequence_filter
       def remove_gaps!

data/lib/bio/maf/parser.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 require 'strscan'
 require 'java' if RUBY_PLATFORM == 'java'
+require 'bio-bgzf'
 # @api public
 module Bio
@@ -56,8 +57,7 @@ module Bio
       # Reads a chunk of the file.
       #
-      # Currently always reads size_hint bytes but this may change
-      # with BGZF support.
+      # Currently always reads size_hint bytes.
       #
       # @param [Integer] offset file offset to read from.
       # @param [Integer] size_hint desired size of chunk.
@@ -70,58 +70,97 @@ module Bio
       end
     end
-    # Variant ChunkReader using a read-ahead thread with internal
-    # queue for sequential parsing. Not useful for random-access
-    # parsing.
-    #
-    # Only beneficial on JRuby.
-    class ThreadedChunkReader < ChunkReader
+    class BGZFChunkReader
+      attr_reader :f, :r
-      def initialize(f, chunk_size, buffer_size=64)
-        super(f, chunk_size)
-        @buffer = SizedQueue.new(buffer_size)
-        @eof_reached = false
-        start_read_ahead
+      def initialize(f, _chunk_size)
+        @f = f
+        @r = Bio::BGZF::Reader.new(f)
       end
+      def pos
+        r.tell
+      end
+      def read_chunk
+        r.read_block
+      end
+      def read_chunk_at(vo, _size)
+        r.read_block_at(vo)
+      end
+    end
+    class ThreadedChunkReaderWrapper
+      attr_reader :cr, :pos
+      def initialize(cr, buffer_size=64)
+        @cr = cr
+        @buffer = java.util.concurrent.LinkedBlockingQueue.new(buffer_size)
+        @eof_reached = false
+        @first_seq_read = false
+      end
       # Spawn a read-ahead thread. Called from {#initialize}.
       def start_read_ahead
         @read_thread = Thread.new { read_ahead }
       end
+      def f
+        cr.f
+      end
       # Read ahead into queue.
       def read_ahead
         # n = 0
         begin
-          f_pos = 0
           until f.eof?
-            chunk = f.read(@chunk_size)
-            @buffer << [f_pos, chunk]
-            f_pos += chunk.bytesize
-            # n += 1
-            # if (n % 100) == 0
-            #   $stderr.puts "buffer size: #{@buffer.size}"
-            # end
+            chunk = cr.read_chunk
+            c_pos = cr.pos
+            @buffer.put([c_pos, chunk])
           end
-          @eof_reached = true
+          @buffer.put(:eof)
+          # @eof_reached = true
         rescue Exception
           @read_ahead_ex = $!
-          $stderr.puts "read_ahead aborting: #{$!}"
+          LOG.error $!
+          @buffer.put($!)
         end
       end
-      # (see ChunkReader#read_chunk)
       def read_chunk
-        raise "readahead failed: #{@read_ahead_ex}" if @read_ahead_ex
-        if @eof_reached && @buffer.empty?
+        if ! @first_seq_read
+          # this is the first read_chunk call to read the header
+          # not necessarily indicative of sequential access
+          @first_seq_read = true
+          chunk = cr.read_chunk
+          @pos = cr.pos
+          return chunk
+        elsif @read_ahead_ex
+          raise @read_ahead_ex
+        elsif @eof_reached
           return nil
         else
-          c_pos, chunk = @buffer.shift()
-          @pos = c_pos
-          return chunk
+          start_read_ahead if @read_thread.nil?
+          e = @buffer.take
+          case
+          when e == :eof
+            @eof_reached = nil
+            return nil
+          when e.is_a?(Exception)
+            raise e
+          else
+            c_pos, chunk = e
+            @pos = c_pos
+            return chunk
+          end
         end
       end
+      def read_chunk_at(*args)
+        cr.read_chunk_at(*args)
+      end
     end
     # MAF parsing code useful for sequential and random-access parsing.
@@ -385,8 +424,7 @@ module Bio
         @f = fd
         @parser = parser
         @opts = parser.opts
-        reader = opts[:chunk_reader] || ChunkReader
-        @cr = reader.new(@f, chunk_size)
+        @cr = parser.base_reader.new(@f, chunk_size)
         @last_block_pos = -1
       end
@@ -413,6 +451,7 @@ module Bio
       # @return [Array<Block>]
       def fetch_blocks(offset, len, block_offsets)
         if block_given?
+          LOG.debug { "fetching blocks from #{offset} (length #{len}): #{block_offsets.inspect}" }
           start_chunk_read_if_needed(offset, len)
           # read chunks until we have the entire merged set of
           # blocks ready to parse
@@ -420,6 +459,13 @@ module Bio
           append_chunks_to(len)
           # parse the blocks
           block_offsets.each do |expected_offset|
+            # skip ahead, in case there is a gap resulting from a
+            # block that is not being parsed
+            rel_offset = expected_offset - offset
+            if s.pos < rel_offset
+              s.pos = rel_offset
+            end
+            # now actually parse the block data
             block = _parse_block
             parse_error("expected a block at offset #{expected_offset} but could not parse one!") unless block
             parse_error("got block with offset #{block.offset}, expected #{expected_offset}!") unless block.offset == expected_offset
@@ -444,7 +490,6 @@ module Bio
       end
       def append_chunks_to(len)
-        # XXX: need to rethink this for BGZF; prefetching ChunkReader
         while s.string.size < len
           s.string << cr.read_chunk()
         end
@@ -463,8 +508,6 @@ module Bio
     #  * `:random_chunk_size`: as above, but for random access ({#fetch_blocks})
     #  * `:merge_max`: merge up to this many bytes of blocks for
     #    random access
-    #  * `:chunk_reader`: use the specified class to read
-    #    chunks. (Only useful with {ThreadedChunkReader}).
     #  * `:threads`: number of threads to use for parallel
     #    parsing. Only useful under JRuby.
     # @api public
@@ -482,6 +525,9 @@ module Bio
       attr_reader :s
       # @return [ChunkReader] ChunkReader.
       attr_reader :cr
+      # @return [Class] ChunkReader class to use for random access
+      # @see ParseContext
+      attr_reader :base_reader
       # @return [Boolean] whether EOF has been reached.
       attr_reader :at_end
       # @return [Hash] parser options.
@@ -490,6 +536,8 @@ module Bio
       attr_reader :chunk_start
       # @return [Integer] offset of the last block start in this chunk.
       attr_reader :last_block_pos
+      # @return [Symbol] compression method used for this file, or nil
+      attr_reader :compression
       # @api private
       attr_accessor :parse_extended
@@ -515,10 +563,29 @@ module Bio
         @parse_extended = opts[:parse_extended] || false
         @parse_empty = opts[:parse_empty] || false
         @chunk_start = 0
-        @file_spec = file_spec
-        @f = File.open(file_spec)
-        reader = opts[:chunk_reader] || ChunkReader
-        @cr = reader.new(@f, chunk_size)
+        if file_spec.respond_to? :flush
+          # guess what, Pathnames respond to :read...
+          @f = file_spec
+          @file_spec = @f.path if @f.respond_to?(:path)
+          # TODO: gzip?
+        else
+          @file_spec = file_spec
+          if file_spec.to_s.end_with?(".maf.gz")
+            @f = IO.popen("gzip -dc #{file_spec}")
+          else
+            @f = File.open(file_spec)
+          end
+        end
+        if @file_spec.to_s =~ /\.bgzf?$/
+          @base_reader = BGZFChunkReader
+          @compression = :bgzf
+        else
+          @base_reader = ChunkReader
+        end
+        @cr = base_reader.new(@f, chunk_size)
+        if RUBY_PLATFORM == 'java'
+          @cr = ThreadedChunkReaderWrapper.new(@cr)
+        end
         @s = StringScanner.new(cr.read_chunk())
         set_last_block_pos!
         @at_end = false
@@ -536,7 +603,11 @@ module Bio
       # @api private
       def context(chunk_size)
         # IO#dup calls dup(2) internally, but seems broken on JRuby...
-        fd = File.open(file_spec)
+        if file_spec
+          fd = File.open(file_spec)
+        else
+          fd = f.dup
+        end
         ParseContext.new(fd, chunk_size, self)
       end
@@ -679,6 +750,15 @@ module Bio
       #
       # Returns `[offset, size, [offset1, offset2, ...]]` tuples.
       def merge_fetch_list(orig_fl)
+        case compression
+        when nil
+          _merge_fetch_list(orig_fl)
+        when :bgzf
+          _merge_bgzf_fetch_list(orig_fl)
+        end
+      end
+      def _merge_fetch_list(orig_fl)
         fl = orig_fl.dup
         r = []
         until fl.empty? do
@@ -698,6 +778,22 @@ module Bio
         return r
       end
+      # Build a merged fetch list in a BGZF-aware way.  This will
+      # group together all MAF blocks from a single BGZF block. These
+      # MAF blocks may not be consecutive.
+      def _merge_bgzf_fetch_list(orig_fl)
+        block_e = orig_fl.chunk { |entry|
+          Bio::BGZF::vo_block_offset(entry[0])
+        }
+        block_e.collect do |bgzf_block, fl|
+          # text size to read from disk, from the start of the first
+          # block to the end of the last block
+          text_size = fl.last[0] + fl.last[1] - fl.first[0]
+          offsets = fl.collect { |e| e[0] }
+          [fl.first[0], text_size, offsets]
+        end
+      end
       # Parse the header of the MAF file.
       def _parse_header
         parse_error("not a MAF file") unless s.scan(/##maf\s*/)