bio-maf 0.3.2-java → 1.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile CHANGED
@@ -3,6 +3,9 @@ source "http://rubygems.org"
3
3
 
4
4
  gemspec
5
5
 
6
+ ## for local development
7
+ #gem "bio-bgzf", :path => "../bioruby-bgzf"
8
+
6
9
  # Add dependencies to develop your gem here.
7
10
  # Include everything needed to run rake, tests, features, etc.
8
11
  group :development do
data/README.md CHANGED
@@ -8,9 +8,9 @@ support for the
8
8
  (MAF), used in bioinformatics to store whole-genome sets of multiple
9
9
  sequence alignments.
10
10
 
11
- Ultimately it will provide indexed and sequential access to MAF data,
12
- as well as performing various manipulations on it and writing modified
13
- MAF files. So far, it only supports simple sequential parsing.
11
+ This library provides indexed and sequential access to MAF data, and
12
+ can perform various manipulations on it and write modified
13
+ MAF files.
14
14
 
15
15
  For more information, see the
16
16
  [project wiki](https://github.com/csw/bioruby-maf/wiki).
@@ -94,6 +94,20 @@ parser = Bio::MAF::Parser.new("test/data/mm8_chr7_tiny.maf")
94
94
  idx = Bio::MAF::KyotoIndex.build(parser, "/tmp/mm8_chr7_tiny.kct", false)
95
95
  ```
96
96
 
97
+ ### Compress and index a MAF file
98
+
99
+ This library fully supports [BGZF][]-compressed MAF files, which
100
+ combine gzip compression with blocking for efficient random
101
+ access. These can be generated with blocking optimized for MAF access
102
+ using the included
103
+ [`maf_bgzip(1)`](http://csw.github.com/bioruby-maf/man/maf_bgzip.1.html)
104
+ tool. This writes BGZF-compressed MAF files and optionally indexes
105
+ them as well:
106
+
107
+ $ maf_bgzip --dir /tmp --index --all test/data/mm8.chrM.maf
108
+
109
+ This is the easiest way to prepare MAF files for use with this library.
110
+
97
111
  ### Extract blocks from an indexed MAF file, by genomic interval
98
112
 
99
113
  Refer to [`mm8_chr7_tiny.maf`](https://github.com/csw/bioruby-maf/blob/master/test/data/mm8_chr7_tiny.maf).
@@ -352,10 +366,27 @@ access.tile(interval) do |tiler|
352
366
  end
353
367
  ```
354
368
 
369
+ ### Compression
370
+
371
+ MAF files can optionally be compressed in the [BGZF][] format defined
372
+ in the [SAM specification][]. This is best done with
373
+ [`maf_bgzip(1)`](http://csw.github.com/bioruby-maf/man/maf_bgzip.1.html),
374
+ but files compressed with the `bgzip(1)` tool from samtools will also
375
+ work, though less efficiently.
376
+
377
+ [BGZF]: http://blastedbio.blogspot.com/2011/11/bgzf-blocked-bigger-better-gzip.html
378
+ [SAM specification]: http://samtools.sourceforge.net/SAM1.pdf
379
+
380
+ MAF files compressed with plain gzip will be decompressed on the fly,
381
+ but random access to these files will not be possible. However,
382
+ gzipped MAF files are suitable as input to
383
+ [`maf_bgzip(1)`](http://csw.github.com/bioruby-maf/man/maf_bgzip.1.html).
384
+
355
385
  ### Command line tools
356
386
 
357
387
  Man pages for command line tools:
358
388
 
389
+ * [`maf_bgzip(1)`](http://csw.github.com/bioruby-maf/man/maf_bgzip.1.html)
359
390
  * [`maf_index(1)`](http://csw.github.com/bioruby-maf/man/maf_index.1.html)
360
391
  * [`maf_extract(1)`](http://csw.github.com/bioruby-maf/man/maf_extract.1.html)
361
392
  * [`maf_to_fasta(1)`](http://csw.github.com/bioruby-maf/man/maf_to_fasta.1.html)
data/bin/maf_bgzip ADDED
@@ -0,0 +1,56 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'ostruct'
4
+
5
+ require 'bio-maf'
6
+ require 'bio-bgzf'
7
+
8
+ $options = OpenStruct.new
9
+ $options.dir = '.'
10
+ $options.ref_only = true
11
+
12
+ op = OptionParser.new do |opts|
13
+ opts.banner = "Usage: maf_bgzip [options] [<maf> ...]"
14
+ opts.separator ""
15
+ opts.separator "Options:"
16
+ opts.on("-d", "--dir DIR",
17
+ "Directory to write compressed MAF to",
18
+ "(default is current directory)") do |dir|
19
+ $options.dir = dir
20
+ end
21
+ opts.on("-i", "--index", "Index MAF files after writing") do
22
+ $options.index = true
23
+ end
24
+ opts.on("-a", "--all",
25
+ "Index all sequences, not just reference seq",
26
+ "(has no effect without --index)") do
27
+ $options.ref_only = false
28
+ end
29
+ end
30
+
31
+ op.parse!(ARGV)
32
+
33
+ until ARGV.empty?
34
+ maf_path = ARGV.shift
35
+ maf_base = File.basename(maf_path)
36
+ base = maf_base.gsub(/\.maf.*/, '')
37
+ bgz_path = "#{$options.dir}/#{base}.maf.bgz"
38
+ p = Bio::MAF::Parser.new(maf_path,
39
+ :parse_extended => true,
40
+ :parse_empty => true)
41
+ File.open(bgz_path, 'w') do |out_f|
42
+ Bio::BGZF::Writer.new(out_f) do |bgz_w|
43
+ maf_w = Bio::MAF::Writer.new(bgz_w)
44
+ maf_w.write_header(p.header)
45
+ p.each_block do |block|
46
+ maf_w.write_block(block)
47
+ end
48
+ end
49
+ end
50
+ p.close
51
+ if $options.index
52
+ p2 = Bio::MAF::Parser.new(bgz_path)
53
+ idx_path = "#{$options.dir}/#{base}.kct"
54
+ Bio::MAF::KyotoIndex.build(p2, idx_path, $options.ref_only)
55
+ end
56
+ end
data/bin/maf_index CHANGED
@@ -14,15 +14,9 @@ PRINTERS = {
14
14
  $options = OpenStruct.new
15
15
  $options.mode = :build
16
16
  $options.ref_only = true
17
- $options.reader = if RUBY_PLATFORM == 'java'
18
- Bio::MAF::ThreadedChunkReader
19
- else
20
- Bio::MAF::ChunkReader
21
- end
22
17
 
23
18
  def build_index(maf, index)
24
19
  parser = Bio::MAF::Parser.new(maf,
25
- :chunk_reader => $options.reader,
26
20
  :parse_extended => false)
27
21
  idx = Bio::MAF::KyotoIndex.build(parser, index, $options.ref_only)
28
22
  idx.close
data/bio-maf.gemspec CHANGED
@@ -2,11 +2,11 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = "bio-maf"
5
- s.version = "0.3.2"
5
+ s.version = "1.0.0"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Clayton Wheeler"]
9
- s.date = "2012-07-26"
9
+ s.date = "2012-08-02"
10
10
  s.description = "Multiple Alignment Format parser for BioRuby."
11
11
  s.email = "cswh@umich.edu"
12
12
  s.extra_rdoc_files = [
@@ -32,6 +32,7 @@ Gem::Specification.new do |s|
32
32
  end
33
33
 
34
34
  s.add_runtime_dependency('bio-alignment', ["~> 0.0.7"])
35
+ s.add_runtime_dependency('bio-bgzf', ["~> 0.2.0"])
35
36
  s.add_runtime_dependency('bio-genomic-interval', ["~> 0.1.2"])
36
37
  s.add_runtime_dependency('bio-logger', ["~> 1.0.1"])
37
38
  if RUBY_PLATFORM == 'java'
@@ -0,0 +1,62 @@
1
+ Feature: BGZF compression
2
+ Because MAF files are large
3
+ We need random access
4
+ But we would also like to compress them
5
+ Yet common compression formats don't facilitate random access
6
+ So we use BGZF compression to support random access
7
+ To 64 KB chunks
8
+
9
+ @no_jruby
10
+ Scenario: Compress a MAF file
11
+ Given test files:
12
+ | mm8_chr7_tiny.maf |
13
+ When I run `maf_bgzip mm8_chr7_tiny.maf`
14
+ Then it should pass with:
15
+ """
16
+ """
17
+ And a file named "mm8_chr7_tiny.maf.bgz" should exist
18
+
19
+ @no_jruby
20
+ Scenario: Compress and index a MAF file
21
+ Given test files:
22
+ | mm8_chr7_tiny.maf |
23
+ When I run `maf_bgzip -i mm8_chr7_tiny.maf`
24
+ Then it should pass with:
25
+ """
26
+ """
27
+ And a file named "mm8_chr7_tiny.maf.bgz" should exist
28
+ And a file named "mm8_chr7_tiny.kct" should exist
29
+
30
+ @no_jruby
31
+ Scenario: Compress a gzipped MAF file
32
+ Given test files:
33
+ | mm8_chr7_tiny.maf.gz |
34
+ When I run `maf_bgzip mm8_chr7_tiny.maf.gz`
35
+ Then it should pass with:
36
+ """
37
+ """
38
+ And a file named "mm8_chr7_tiny.maf.bgz" should exist
39
+
40
+ @no_jruby
41
+ Scenario: Compress and index a gzipped MAF file
42
+ Given test files:
43
+ | mm8_chr7_tiny.maf.gz |
44
+ When I run `maf_bgzip -i mm8_chr7_tiny.maf.gz`
45
+ Then it should pass with:
46
+ """
47
+ """
48
+ And a file named "mm8_chr7_tiny.maf.bgz" should exist
49
+ And a file named "mm8_chr7_tiny.kct" should exist
50
+
51
+ @no_jruby
52
+ Scenario: Compress multiple MAF files
53
+ Given test files:
54
+ | mm8_chr7_tiny.maf |
55
+ | mm8.chrM.maf |
56
+ When I run `maf_bgzip mm8_chr7_tiny.maf mm8.chrM.maf`
57
+ Then it should pass with:
58
+ """
59
+ """
60
+ And a file named "mm8_chr7_tiny.maf.bgz" should exist
61
+ And a file named "mm8.chrM.maf.bgz" should exist
62
+
@@ -49,6 +49,16 @@ Feature: Indexed access to MAF files
49
49
  """
50
50
  And a file named "mm8_chr7_tiny.kct" should exist
51
51
 
52
+ @no_jruby
53
+ Scenario: Build MAF index on BGZF file with CLI tool
54
+ Given test files:
55
+ | mm8.chrM.maf.bgz |
56
+ When I run `maf_index mm8.chrM.maf.bgz mm8.chrM.kct`
57
+ Then it should pass with:
58
+ """
59
+ """
60
+ And a file named "mm8.chrM.kct" should exist
61
+
52
62
  @no_jruby
53
63
  Scenario: Build MAF index on all sequences with CLI tool
54
64
  Given test files:
@@ -73,3 +73,12 @@ Feature: Filter results from MAF files
73
73
  And search for blocks between positions 0 and 80100000 of mm8.chr7
74
74
  Then 3 blocks are obtained
75
75
 
76
+ @no_jruby
77
+ Scenario: Parse blocks from a BGZF-compressed file
78
+ Given test files:
79
+ | mm8.chrM.maf |
80
+ | mm8.chrM.maf.bgz |
81
+ When I run `maf_extract -m mm8.chrM.maf --interval mm8.chrM:6938-13030 -o m1.maf`
82
+ And I run `maf_extract -m mm8.chrM.maf.bgz --interval mm8.chrM:6938-13030 -o m2.maf`
83
+ And I run `diff m1.maf m2.maf`
84
+ Then the exit status should be 0
@@ -6,7 +6,7 @@ end
6
6
  Given /^MAF data:$/ do |string|
7
7
  @src_f = Tempfile.new(['rspec', '.maf'])
8
8
  @src_f.write(string)
9
- @src_f.close
9
+ @src_f.rewind
10
10
  end
11
11
 
12
12
  When /^I select FASTA output$/ do
@@ -1,3 +1,5 @@
1
+ require 'bundler/setup'
2
+
1
3
  unless ENV.has_key?('TRAVIS') || RUBY_PLATFORM == 'java'
2
4
  begin
3
5
  require 'simplecov'
data/lib/bio/maf/index.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require 'kyotocabinet'
2
2
  require 'jruby/profiler' if RUBY_PLATFORM == 'java'
3
+ require 'bio-bgzf'
3
4
 
4
5
  #require 'bio-ucsc-api'
5
6
  require 'bio-genomic-interval'
@@ -189,18 +190,20 @@ module Bio
189
190
  @indices = {}
190
191
  @maf_by_chrom = {}
191
192
  if options[:dir]
192
- @dir = options[:dir]
193
- @maf_files = Dir.glob("#{@dir}/*.maf")
193
+ scan_dir(options[:dir])
194
194
  elsif options[:maf]
195
- @maf_files = [options[:maf]]
196
195
  if options[:index]
197
196
  register_index(KyotoIndex.open(options[:index]),
198
197
  options[:maf])
198
+ else
199
+ idx = find_index_file(options[:maf])
200
+ if idx
201
+ register_index(KyotoIndex.open(idx), options[:maf])
202
+ end
199
203
  end
200
204
  else
201
205
  raise "Must specify :dir or :maf!"
202
206
  end
203
- scan_indices!
204
207
  if options[:maf] && @indices.empty?
205
208
  # MAF file explicitly given but no index
206
209
  # build a temporary one
@@ -215,23 +218,27 @@ module Bio
215
218
 
216
219
  # @api private
217
220
  def find_index_file(maf)
218
- base = File.basename(maf, '.maf')
219
- index_f = "#{@dir}/#{base}.kct"
220
- File.exists?(index_f) ? index_f : nil
221
+ dir = File.dirname(maf)
222
+ base = File.basename(maf)
223
+ noext = base.gsub(/\.maf.*/, '')
224
+ idx = [base, noext].collect { |n| "#{dir}/#{n}.kct" }.find { |path| File.exist? path }
221
225
  end
222
226
 
223
227
  # @api private
224
228
  def register_index(index, maf)
229
+ unless index.maf_file == File.basename(maf)
230
+ raise "Index #{index.path} was created for #{index.maf_file}, not #{File.basename(maf)}!"
231
+ end
225
232
  @indices[index.ref_seq] = index
226
233
  @maf_by_chrom[index.ref_seq] = maf
227
234
  end
228
235
 
229
236
  # @api private
230
- def scan_indices!
231
- @maf_files.each do |maf|
232
- index_f = find_index_file(maf)
233
- if index_f
234
- index = KyotoIndex.open(index_f)
237
+ def scan_dir(dir)
238
+ Dir.glob("#{dir}/*.kct").each do |index_f|
239
+ index = KyotoIndex.open(index_f)
240
+ maf = "#{dir}/#{index.maf_file}"
241
+ if File.exist? maf
235
242
  register_index(index, maf)
236
243
  end
237
244
  end
@@ -262,9 +269,12 @@ module Bio
262
269
  class KyotoIndex
263
270
  include KVHelpers
264
271
 
265
- attr_reader :db, :species, :species_max_id, :ref_only
272
+ attr_reader :db, :species, :species_max_id, :ref_only, :path
273
+ attr_reader :maf_file
266
274
  attr_accessor :index_sequences, :ref_seq
267
275
 
276
+ COMPRESSION_KEY = 'bio-maf:compression'
277
+ FILE_KEY = 'bio-maf:file'
268
278
  FORMAT_VERSION_KEY = 'bio-maf:index-format-version'
269
279
  FORMAT_VERSION = 2
270
280
  REF_SEQ_KEY = 'bio-maf:reference-sequence'
@@ -438,6 +448,7 @@ module Bio
438
448
  raise "Could not open DB file!"
439
449
  end
440
450
  if mode == KyotoCabinet::DB::OREADER
451
+ @maf_file = db[FILE_KEY]
441
452
  self.ref_seq = db[REF_SEQ_KEY]
442
453
  load_index_sequences
443
454
  load_species
@@ -450,6 +461,7 @@ module Bio
450
461
  end
451
462
 
452
463
  def dump(stream=$stdout)
464
+ bgzf = (db[COMPRESSION_KEY] == 'bgzf')
453
465
  stream.puts "KyotoIndex dump: #{@path}"
454
466
  stream.puts
455
467
  if db.count == 0
@@ -474,6 +486,11 @@ module Bio
474
486
  offset, len, text_size, n_seq, species_vec = pair[1].unpack(VAL_FMT)
475
487
  stream.puts "#{chr} [bin #{bin}] #{s_start}:#{s_end}"
476
488
  stream.puts " offset #{offset}, length #{len}"
489
+ if bgzf
490
+ block = Bio::BGZF.vo_block_offset(offset)
491
+ data = Bio::BGZF.vo_data_offset(offset)
492
+ stream.puts " BGZF block offset #{block}, data offset #{data}"
493
+ end
477
494
  stream.puts " text size: #{text_size}"
478
495
  stream.puts " sequences in block: #{n_seq}"
479
496
  stream.printf(" species vector: %016x\n", species_vec)
@@ -660,6 +677,11 @@ module Bio
660
677
  end
661
678
 
662
679
  def build(parser, ref_only=true)
680
+ db[FILE_KEY] = File.basename(parser.file_spec)
681
+ @maf_file = db[FILE_KEY]
682
+ if parser.compression
683
+ db[COMPRESSION_KEY] = parser.compression.to_s
684
+ end
663
685
  first_block = parser.parse_block
664
686
  self.ref_seq = first_block.sequences.first.source
665
687
  @ref_only = ref_only
data/lib/bio/maf/maf.rb CHANGED
@@ -103,8 +103,9 @@ module Bio
103
103
 
104
104
  GAP = /-+/
105
105
 
106
- # Remove gaps present in all sequences. These would generally
106
+ # Find gaps present in all sequences. These would generally
107
107
  # occur when some sequences have been filtered out.
108
+ #
108
109
  # @see #remove_gaps!
109
110
  # @see Parser#sequence_filter
110
111
  def find_gaps
@@ -126,6 +127,7 @@ module Bio
126
127
 
127
128
  # Remove gaps present in all sequences. These would generally
128
129
  # occur when some sequences have been filtered out.
130
+ #
129
131
  # @see #find_gaps
130
132
  # @see Parser#sequence_filter
131
133
  def remove_gaps!
@@ -1,5 +1,6 @@
1
1
  require 'strscan'
2
2
  require 'java' if RUBY_PLATFORM == 'java'
3
+ require 'bio-bgzf'
3
4
 
4
5
  # @api public
5
6
  module Bio
@@ -56,8 +57,7 @@ module Bio
56
57
 
57
58
  # Reads a chunk of the file.
58
59
  #
59
- # Currently always reads size_hint bytes but this may change
60
- # with BGZF support.
60
+ # Currently always reads size_hint bytes.
61
61
  #
62
62
  # @param [Integer] offset file offset to read from.
63
63
  # @param [Integer] size_hint desired size of chunk.
@@ -70,58 +70,97 @@ module Bio
70
70
  end
71
71
  end
72
72
 
73
- # Variant ChunkReader using a read-ahead thread with internal
74
- # queue for sequential parsing. Not useful for random-access
75
- # parsing.
76
- #
77
- # Only beneficial on JRuby.
78
- class ThreadedChunkReader < ChunkReader
73
+ class BGZFChunkReader
74
+ attr_reader :f, :r
79
75
 
80
- def initialize(f, chunk_size, buffer_size=64)
81
- super(f, chunk_size)
82
- @buffer = SizedQueue.new(buffer_size)
83
- @eof_reached = false
84
- start_read_ahead
76
+ def initialize(f, _chunk_size)
77
+ @f = f
78
+ @r = Bio::BGZF::Reader.new(f)
85
79
  end
86
80
 
81
+ def pos
82
+ r.tell
83
+ end
84
+
85
+ def read_chunk
86
+ r.read_block
87
+ end
88
+
89
+ def read_chunk_at(vo, _size)
90
+ r.read_block_at(vo)
91
+ end
92
+ end
93
+
94
+ class ThreadedChunkReaderWrapper
95
+
96
+ attr_reader :cr, :pos
97
+
98
+ def initialize(cr, buffer_size=64)
99
+ @cr = cr
100
+ @buffer = java.util.concurrent.LinkedBlockingQueue.new(buffer_size)
101
+ @eof_reached = false
102
+ @first_seq_read = false
103
+ end
104
+
87
105
  # Spawn a read-ahead thread. Called on demand from {#read_chunk}.
88
106
  def start_read_ahead
89
107
  @read_thread = Thread.new { read_ahead }
90
108
  end
91
109
 
110
+ def f
111
+ cr.f
112
+ end
113
+
92
114
  # Read ahead into queue.
93
115
  def read_ahead
94
116
  # n = 0
95
117
  begin
96
- f_pos = 0
97
118
  until f.eof?
98
- chunk = f.read(@chunk_size)
99
- @buffer << [f_pos, chunk]
100
- f_pos += chunk.bytesize
101
- # n += 1
102
- # if (n % 100) == 0
103
- # $stderr.puts "buffer size: #{@buffer.size}"
104
- # end
119
+ chunk = cr.read_chunk
120
+ c_pos = cr.pos
121
+ @buffer.put([c_pos, chunk])
105
122
  end
106
- @eof_reached = true
123
+ @buffer.put(:eof)
124
+ # @eof_reached = true
107
125
  rescue Exception
108
126
  @read_ahead_ex = $!
109
- $stderr.puts "read_ahead aborting: #{$!}"
127
+ LOG.error $!
128
+ @buffer.put($!)
110
129
  end
111
130
  end
112
131
 
113
- # (see ChunkReader#read_chunk)
114
132
  def read_chunk
115
- raise "readahead failed: #{@read_ahead_ex}" if @read_ahead_ex
116
- if @eof_reached && @buffer.empty?
133
+ if ! @first_seq_read
134
+ # this is the first read_chunk call to read the header
135
+ # not necessarily indicative of sequential access
136
+ @first_seq_read = true
137
+ chunk = cr.read_chunk
138
+ @pos = cr.pos
139
+ return chunk
140
+ elsif @read_ahead_ex
141
+ raise @read_ahead_ex
142
+ elsif @eof_reached
117
143
  return nil
118
144
  else
119
- c_pos, chunk = @buffer.shift()
120
- @pos = c_pos
121
- return chunk
145
+ start_read_ahead if @read_thread.nil?
146
+ e = @buffer.take
147
+ case
148
+ when e == :eof
149
+ @eof_reached = nil
150
+ return nil
151
+ when e.is_a?(Exception)
152
+ raise e
153
+ else
154
+ c_pos, chunk = e
155
+ @pos = c_pos
156
+ return chunk
157
+ end
122
158
  end
123
159
  end
124
160
 
161
+ def read_chunk_at(*args)
162
+ cr.read_chunk_at(*args)
163
+ end
125
164
  end
126
165
 
127
166
  # MAF parsing code useful for sequential and random-access parsing.
@@ -385,8 +424,7 @@ module Bio
385
424
  @f = fd
386
425
  @parser = parser
387
426
  @opts = parser.opts
388
- reader = opts[:chunk_reader] || ChunkReader
389
- @cr = reader.new(@f, chunk_size)
427
+ @cr = parser.base_reader.new(@f, chunk_size)
390
428
  @last_block_pos = -1
391
429
  end
392
430
 
@@ -413,6 +451,7 @@ module Bio
413
451
  # @return [Array<Block>]
414
452
  def fetch_blocks(offset, len, block_offsets)
415
453
  if block_given?
454
+ LOG.debug { "fetching blocks from #{offset} (length #{len}): #{block_offsets.inspect}" }
416
455
  start_chunk_read_if_needed(offset, len)
417
456
  # read chunks until we have the entire merged set of
418
457
  # blocks ready to parse
@@ -420,6 +459,13 @@ module Bio
420
459
  append_chunks_to(len)
421
460
  # parse the blocks
422
461
  block_offsets.each do |expected_offset|
462
+ # skip ahead, in case there is a gap resulting from a
463
+ # block that is not being parsed
464
+ rel_offset = expected_offset - offset
465
+ if s.pos < rel_offset
466
+ s.pos = rel_offset
467
+ end
468
+ # now actually parse the block data
423
469
  block = _parse_block
424
470
  parse_error("expected a block at offset #{expected_offset} but could not parse one!") unless block
425
471
  parse_error("got block with offset #{block.offset}, expected #{expected_offset}!") unless block.offset == expected_offset
@@ -444,7 +490,6 @@ module Bio
444
490
  end
445
491
 
446
492
  def append_chunks_to(len)
447
- # XXX: need to rethink this for BGZF; prefetching ChunkReader
448
493
  while s.string.size < len
449
494
  s.string << cr.read_chunk()
450
495
  end
@@ -463,8 +508,6 @@ module Bio
463
508
  # * `:random_chunk_size`: as above, but for random access ({#fetch_blocks})
464
509
  # * `:merge_max`: merge up to this many bytes of blocks for
465
510
  # random access
466
- # * `:chunk_reader`: use the specified class to read
467
- # chunks. (Only useful with {ThreadedChunkReader}).
468
511
  # * `:threads`: number of threads to use for parallel
469
512
  # parsing. Only useful under JRuby.
470
513
  # @api public
@@ -482,6 +525,9 @@ module Bio
482
525
  attr_reader :s
483
526
  # @return [ChunkReader] ChunkReader.
484
527
  attr_reader :cr
528
+ # @return [Class] ChunkReader class to use for random access
529
+ # @see ParseContext
530
+ attr_reader :base_reader
485
531
  # @return [Boolean] whether EOF has been reached.
486
532
  attr_reader :at_end
487
533
  # @return [Hash] parser options.
@@ -490,6 +536,8 @@ module Bio
490
536
  attr_reader :chunk_start
491
537
  # @return [Integer] offset of the last block start in this chunk.
492
538
  attr_reader :last_block_pos
539
+ # @return [Symbol] compression method used for this file, or nil
540
+ attr_reader :compression
493
541
 
494
542
  # @api private
495
543
  attr_accessor :parse_extended
@@ -515,10 +563,29 @@ module Bio
515
563
  @parse_extended = opts[:parse_extended] || false
516
564
  @parse_empty = opts[:parse_empty] || false
517
565
  @chunk_start = 0
518
- @file_spec = file_spec
519
- @f = File.open(file_spec)
520
- reader = opts[:chunk_reader] || ChunkReader
521
- @cr = reader.new(@f, chunk_size)
566
+ if file_spec.respond_to? :flush
567
+ # check :flush, not :read, to detect IO objects: Pathname also responds to :read
568
+ @f = file_spec
569
+ @file_spec = @f.path if @f.respond_to?(:path)
570
+ # TODO: gzip?
571
+ else
572
+ @file_spec = file_spec
573
+ if file_spec.to_s.end_with?(".maf.gz")
574
+ @f = IO.popen("gzip -dc #{file_spec}")
575
+ else
576
+ @f = File.open(file_spec)
577
+ end
578
+ end
579
+ if @file_spec.to_s =~ /\.bgzf?$/
580
+ @base_reader = BGZFChunkReader
581
+ @compression = :bgzf
582
+ else
583
+ @base_reader = ChunkReader
584
+ end
585
+ @cr = base_reader.new(@f, chunk_size)
586
+ if RUBY_PLATFORM == 'java'
587
+ @cr = ThreadedChunkReaderWrapper.new(@cr)
588
+ end
522
589
  @s = StringScanner.new(cr.read_chunk())
523
590
  set_last_block_pos!
524
591
  @at_end = false
@@ -536,7 +603,11 @@ module Bio
536
603
  # @api private
537
604
  def context(chunk_size)
538
605
  # IO#dup calls dup(2) internally, but seems broken on JRuby...
539
- fd = File.open(file_spec)
606
+ if file_spec
607
+ fd = File.open(file_spec)
608
+ else
609
+ fd = f.dup
610
+ end
540
611
  ParseContext.new(fd, chunk_size, self)
541
612
  end
542
613
 
@@ -679,6 +750,15 @@ module Bio
679
750
  #
680
751
  # Returns `[offset, size, [offset1, offset2, ...]]` tuples.
681
752
  def merge_fetch_list(orig_fl)
753
+ case compression
754
+ when nil
755
+ _merge_fetch_list(orig_fl)
756
+ when :bgzf
757
+ _merge_bgzf_fetch_list(orig_fl)
758
+ end
759
+ end
760
+
761
+ def _merge_fetch_list(orig_fl)
682
762
  fl = orig_fl.dup
683
763
  r = []
684
764
  until fl.empty? do
@@ -698,6 +778,22 @@ module Bio
698
778
  return r
699
779
  end
700
780
 
781
+ # Build a merged fetch list in a BGZF-aware way. This will
782
+ # group together all MAF blocks from a single BGZF block. These
783
+ # MAF blocks may not be consecutive.
784
+ def _merge_bgzf_fetch_list(orig_fl)
785
+ block_e = orig_fl.chunk { |entry|
786
+ Bio::BGZF::vo_block_offset(entry[0])
787
+ }
788
+ block_e.collect do |bgzf_block, fl|
789
+ # text size to read from disk, from the start of the first
790
+ # block to the end of the last block
791
+ text_size = fl.last[0] + fl.last[1] - fl.first[0]
792
+ offsets = fl.collect { |e| e[0] }
793
+ [fl.first[0], text_size, offsets]
794
+ end
795
+ end
796
+
701
797
  # Parse the header of the MAF file.
702
798
  def _parse_header
703
799
  parse_error("not a MAF file") unless s.scan(/##maf\s*/)