bio-maf 0.3.2 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile CHANGED
@@ -3,6 +3,9 @@ source "http://rubygems.org"
3
3
 
4
4
  gemspec
5
5
 
6
+ ## for local development
7
+ #gem "bio-bgzf", :path => "../bioruby-bgzf"
8
+
6
9
  # Add dependencies to develop your gem here.
7
10
  # Include everything needed to run rake, tests, features, etc.
8
11
  group :development do
data/README.md CHANGED
@@ -8,9 +8,9 @@ support for the
8
8
  (MAF), used in bioinformatics to store whole-genome sets of multiple
9
9
  sequence alignments.
10
10
 
11
- Ultimately it will provide indexed and sequential access to MAF data,
12
- as well as performing various manipulations on it and writing modified
13
- MAF files. So far, it only supports simple sequential parsing.
11
+ This library provides indexed and sequential access to MAF data, as
12
+ well as performing various manipulations on it and writing modified
13
+ MAF files.
14
14
 
15
15
  For more information, see the
16
16
  [project wiki](https://github.com/csw/bioruby-maf/wiki).
@@ -94,6 +94,20 @@ parser = Bio::MAF::Parser.new("test/data/mm8_chr7_tiny.maf")
94
94
  idx = Bio::MAF::KyotoIndex.build(parser, "/tmp/mm8_chr7_tiny.kct", false)
95
95
  ```
96
96
 
97
+ ### Compress and index a MAF file
98
+
99
+ This library fully supports [BGZF][]-compressed MAF files, which
100
+ combine gzip compression with blocking for efficient random
101
+ access. These can be generated with blocking optimized for MAF access
102
+ using the included
103
+ [`maf_bgzip(1)`](http://csw.github.com/bioruby-maf/man/maf_bgzip.1.html)
104
+ tool. This writes BGZF-compressed MAF files and optionally indexes
105
+ them as well:
106
+
107
+ $ maf_bgzip --dir /tmp --index --all test/data/mm8.chrM.maf
108
+
109
+ This is the easiest way to prepare MAF files for use with this library.
110
+
97
111
  ### Extract blocks from an indexed MAF file, by genomic interval
98
112
 
99
113
  Refer to [`mm8_chr7_tiny.maf`](https://github.com/csw/bioruby-maf/blob/master/test/data/mm8_chr7_tiny.maf).
@@ -352,10 +366,27 @@ access.tile(interval) do |tiler|
352
366
  end
353
367
  ```
354
368
 
369
+ ### Compression
370
+
371
+ MAF files can optionally be compressed in the [BGZF][] format defined
372
+ in the [SAM specification][]. This is best done with
373
+ [`maf_bgzip(1)`](http://csw.github.com/bioruby-maf/man/maf_bgzip.1.html),
374
+ but files compressed with the `bgzip(1)` tool from samtools will also
375
+ work, though less efficiently.
376
+
377
+ [BGZF]: http://blastedbio.blogspot.com/2011/11/bgzf-blocked-bigger-better-gzip.html
378
+ [SAM specification]: http://samtools.sourceforge.net/SAM1.pdf
379
+
380
+ MAF files compressed with plain gzip will be decompressed on the fly,
381
+ but random access to these files will not be possible. However,
382
+ gzipped MAF files are suitable as input to
383
+ [`maf_bgzip(1)`](http://csw.github.com/bioruby-maf/man/maf_bgzip.1.html).
384
+
355
385
  ### Command line tools
356
386
 
357
387
  Man pages for command line tools:
358
388
 
389
+ * [`maf_bgzip(1)`](http://csw.github.com/bioruby-maf/man/maf_bgzip.1.html)
359
390
  * [`maf_index(1)`](http://csw.github.com/bioruby-maf/man/maf_index.1.html)
360
391
  * [`maf_extract(1)`](http://csw.github.com/bioruby-maf/man/maf_extract.1.html)
361
392
  * [`maf_to_fasta(1)`](http://csw.github.com/bioruby-maf/man/maf_to_fasta.1.html)
@@ -0,0 +1,56 @@
#!/usr/bin/env ruby
#
# maf_bgzip: write BGZF-compressed copies of MAF files, optionally
# building a Kyoto Cabinet (.kct) index for each compressed copy.
#
#   Usage: maf_bgzip [options] [<maf> ...]
#
# (File name not shown in this diff hunk; presumably bin/maf_bgzip.)

require 'ostruct'
require 'optparse' # FIX: OptionParser is used below but was never required

require 'bio-maf'
require 'bio-bgzf'

# Global run options, shared with the option-parser callbacks below.
$options = OpenStruct.new
$options.dir = '.'        # output directory for .maf.bgz / .kct files
$options.ref_only = true  # index only the reference sequence by default

op = OptionParser.new do |opts|
  opts.banner = "Usage: maf_bgzip [options] [<maf> ...]"
  opts.separator ""
  opts.separator "Options:"
  opts.on("-d", "--dir DIR",
          "Directory to write compressed MAF to",
          "(default is current directory)") do |dir|
    $options.dir = dir
  end
  opts.on("-i", "--index", "Index MAF files after writing") do
    $options.index = true
  end
  opts.on("-a", "--all",
          "Index all sequences, not just reference seq",
          "(has no effect without --index)") do
    $options.ref_only = false
  end
end

op.parse!(ARGV)

until ARGV.empty?
  maf_path = ARGV.shift
  maf_base = File.basename(maf_path)
  # Strip '.maf' plus any trailing compression suffix (.maf, .maf.gz, ...)
  base = maf_base.sub(/\.maf.*/, '')
  bgz_path = "#{$options.dir}/#{base}.maf.bgz"
  p = Bio::MAF::Parser.new(maf_path,
                           :parse_extended => true,
                           :parse_empty => true)
  # BGZF output is binary gzip data; open in binary mode for portability
  # (text mode would corrupt output on platforms with newline translation).
  File.open(bgz_path, 'wb') do |out_f|
    Bio::BGZF::Writer.new(out_f) do |bgz_w|
      maf_w = Bio::MAF::Writer.new(bgz_w)
      maf_w.write_header(p.header)
      p.each_block do |block|
        maf_w.write_block(block)
      end
    end
  end
  p.close
  if $options.index
    # Re-parse the compressed copy so the index records offsets into the
    # BGZF file (virtual offsets), not the uncompressed source.
    p2 = Bio::MAF::Parser.new(bgz_path)
    idx_path = "#{$options.dir}/#{base}.kct"
    Bio::MAF::KyotoIndex.build(p2, idx_path, $options.ref_only)
  end
end
@@ -14,15 +14,9 @@ PRINTERS = {
14
14
  $options = OpenStruct.new
15
15
  $options.mode = :build
16
16
  $options.ref_only = true
17
- $options.reader = if RUBY_PLATFORM == 'java'
18
- Bio::MAF::ThreadedChunkReader
19
- else
20
- Bio::MAF::ChunkReader
21
- end
22
17
 
23
18
  def build_index(maf, index)
24
19
  parser = Bio::MAF::Parser.new(maf,
25
- :chunk_reader => $options.reader,
26
20
  :parse_extended => false)
27
21
  idx = Bio::MAF::KyotoIndex.build(parser, index, $options.ref_only)
28
22
  idx.close
@@ -2,11 +2,11 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = "bio-maf"
5
- s.version = "0.3.2"
5
+ s.version = "1.0.0"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Clayton Wheeler"]
9
- s.date = "2012-07-26"
9
+ s.date = "2012-08-02"
10
10
  s.description = "Multiple Alignment Format parser for BioRuby."
11
11
  s.email = "cswh@umich.edu"
12
12
  s.extra_rdoc_files = [
@@ -32,6 +32,7 @@ Gem::Specification.new do |s|
32
32
  end
33
33
 
34
34
  s.add_runtime_dependency('bio-alignment', ["~> 0.0.7"])
35
+ s.add_runtime_dependency('bio-bgzf', ["~> 0.2.0"])
35
36
  s.add_runtime_dependency('bio-genomic-interval', ["~> 0.1.2"])
36
37
  s.add_runtime_dependency('bio-logger', ["~> 1.0.1"])
37
38
  if RUBY_PLATFORM == 'java'
@@ -0,0 +1,62 @@
# Cucumber feature exercising the maf_bgzip(1) tool: compressing plain and
# gzipped MAF input, optional indexing (-i), and multiple input files.
# Scenarios are tagged @no_jruby and so are skipped under JRuby.
Feature: BGZF compression
  Because MAF files are large
  We need random access
  But we would also like to compress them
  Yet common compression formats don't facilitate random access
  So we use BGZF compression to support random access
  To 64 KB chunks

  @no_jruby
  Scenario: Compress a MAF file
    Given test files:
      | mm8_chr7_tiny.maf |
    When I run `maf_bgzip mm8_chr7_tiny.maf`
    Then it should pass with:
      """
      """
    And a file named "mm8_chr7_tiny.maf.bgz" should exist

  @no_jruby
  Scenario: Compress and index a MAF file
    Given test files:
      | mm8_chr7_tiny.maf |
    When I run `maf_bgzip -i mm8_chr7_tiny.maf`
    Then it should pass with:
      """
      """
    And a file named "mm8_chr7_tiny.maf.bgz" should exist
    And a file named "mm8_chr7_tiny.kct" should exist

  @no_jruby
  Scenario: Compress a gzipped MAF file
    Given test files:
      | mm8_chr7_tiny.maf.gz |
    When I run `maf_bgzip mm8_chr7_tiny.maf.gz`
    Then it should pass with:
      """
      """
    And a file named "mm8_chr7_tiny.maf.bgz" should exist

  @no_jruby
  Scenario: Compress and index a gzipped MAF file
    Given test files:
      | mm8_chr7_tiny.maf.gz |
    When I run `maf_bgzip -i mm8_chr7_tiny.maf.gz`
    Then it should pass with:
      """
      """
    And a file named "mm8_chr7_tiny.maf.bgz" should exist
    And a file named "mm8_chr7_tiny.kct" should exist

  @no_jruby
  Scenario: Compress multiple MAF files
    Given test files:
      | mm8_chr7_tiny.maf |
      | mm8.chrM.maf |
    When I run `maf_bgzip mm8_chr7_tiny.maf mm8.chrM.maf`
    Then it should pass with:
      """
      """
    And a file named "mm8_chr7_tiny.maf.bgz" should exist
    And a file named "mm8.chrM.maf.bgz" should exist
@@ -49,6 +49,16 @@ Feature: Indexed access to MAF files
49
49
  """
50
50
  And a file named "mm8_chr7_tiny.kct" should exist
51
51
 
52
+ @no_jruby
53
+ Scenario: Build MAF index on BGZF file with CLI tool
54
+ Given test files:
55
+ | mm8.chrM.maf.bgz |
56
+ When I run `maf_index mm8.chrM.maf.bgz mm8.chrM.kct`
57
+ Then it should pass with:
58
+ """
59
+ """
60
+ And a file named "mm8.chrM.kct" should exist
61
+
52
62
  @no_jruby
53
63
  Scenario: Build MAF index on all sequences with CLI tool
54
64
  Given test files:
@@ -73,3 +73,12 @@ Feature: Filter results from MAF files
73
73
  And search for blocks between positions 0 and 80100000 of mm8.chr7
74
74
  Then 3 blocks are obtained
75
75
 
76
+ @no_jruby
77
+ Scenario: Parse blocks from a BGZF-compressed file
78
+ Given test files:
79
+ | mm8.chrM.maf |
80
+ | mm8.chrM.maf.bgz |
81
+ When I run `maf_extract -m mm8.chrM.maf --interval mm8.chrM:6938-13030 -o m1.maf`
82
+ And I run `maf_extract -m mm8.chrM.maf.bgz --interval mm8.chrM:6938-13030 -o m2.maf`
83
+ And I run `diff m1.maf m2.maf`
84
+ Then the exit status should be 0
@@ -6,7 +6,7 @@ end
6
6
  Given /^MAF data:$/ do |string|
7
7
  @src_f = Tempfile.new(['rspec', '.maf'])
8
8
  @src_f.write(string)
9
- @src_f.close
9
+ @src_f.rewind
10
10
  end
11
11
 
12
12
  When /^I select FASTA output$/ do
@@ -1,3 +1,5 @@
1
+ require 'bundler/setup'
2
+
1
3
  unless ENV.has_key?('TRAVIS') || RUBY_PLATFORM == 'java'
2
4
  begin
3
5
  require 'simplecov'
@@ -1,5 +1,6 @@
1
1
  require 'kyotocabinet'
2
2
  require 'jruby/profiler' if RUBY_PLATFORM == 'java'
3
+ require 'bio-bgzf'
3
4
 
4
5
  #require 'bio-ucsc-api'
5
6
  require 'bio-genomic-interval'
@@ -189,18 +190,20 @@ module Bio
189
190
  @indices = {}
190
191
  @maf_by_chrom = {}
191
192
  if options[:dir]
192
- @dir = options[:dir]
193
- @maf_files = Dir.glob("#{@dir}/*.maf")
193
+ scan_dir(options[:dir])
194
194
  elsif options[:maf]
195
- @maf_files = [options[:maf]]
196
195
  if options[:index]
197
196
  register_index(KyotoIndex.open(options[:index]),
198
197
  options[:maf])
198
+ else
199
+ idx = find_index_file(options[:maf])
200
+ if idx
201
+ register_index(KyotoIndex.open(idx), options[:maf])
202
+ end
199
203
  end
200
204
  else
201
205
  raise "Must specify :dir or :maf!"
202
206
  end
203
- scan_indices!
204
207
  if options[:maf] && @indices.empty?
205
208
  # MAF file explicitly given but no index
206
209
  # build a temporary one
@@ -215,23 +218,27 @@ module Bio
215
218
 
216
219
  # @api private
217
220
  def find_index_file(maf)
218
- base = File.basename(maf, '.maf')
219
- index_f = "#{@dir}/#{base}.kct"
220
- File.exists?(index_f) ? index_f : nil
221
+ dir = File.dirname(maf)
222
+ base = File.basename(maf)
223
+ noext = base.gsub(/\.maf.*/, '')
224
+ idx = [base, noext].collect { |n| "#{dir}/#{n}.kct" }.find { |path| File.exist? path }
221
225
  end
222
226
 
223
227
  # @api private
224
228
  def register_index(index, maf)
229
+ unless index.maf_file == File.basename(maf)
230
+ raise "Index #{index.path} was created for #{index.maf_file}, not #{File.basename(maf)}!"
231
+ end
225
232
  @indices[index.ref_seq] = index
226
233
  @maf_by_chrom[index.ref_seq] = maf
227
234
  end
228
235
 
229
236
  # @api private
230
- def scan_indices!
231
- @maf_files.each do |maf|
232
- index_f = find_index_file(maf)
233
- if index_f
234
- index = KyotoIndex.open(index_f)
237
+ def scan_dir(dir)
238
+ Dir.glob("#{dir}/*.kct").each do |index_f|
239
+ index = KyotoIndex.open(index_f)
240
+ maf = "#{dir}/#{index.maf_file}"
241
+ if File.exist? maf
235
242
  register_index(index, maf)
236
243
  end
237
244
  end
@@ -262,9 +269,12 @@ module Bio
262
269
  class KyotoIndex
263
270
  include KVHelpers
264
271
 
265
- attr_reader :db, :species, :species_max_id, :ref_only
272
+ attr_reader :db, :species, :species_max_id, :ref_only, :path
273
+ attr_reader :maf_file
266
274
  attr_accessor :index_sequences, :ref_seq
267
275
 
276
+ COMPRESSION_KEY = 'bio-maf:compression'
277
+ FILE_KEY = 'bio-maf:file'
268
278
  FORMAT_VERSION_KEY = 'bio-maf:index-format-version'
269
279
  FORMAT_VERSION = 2
270
280
  REF_SEQ_KEY = 'bio-maf:reference-sequence'
@@ -438,6 +448,7 @@ module Bio
438
448
  raise "Could not open DB file!"
439
449
  end
440
450
  if mode == KyotoCabinet::DB::OREADER
451
+ @maf_file = db[FILE_KEY]
441
452
  self.ref_seq = db[REF_SEQ_KEY]
442
453
  load_index_sequences
443
454
  load_species
@@ -450,6 +461,7 @@ module Bio
450
461
  end
451
462
 
452
463
  def dump(stream=$stdout)
464
+ bgzf = (db[COMPRESSION_KEY] == 'bgzf')
453
465
  stream.puts "KyotoIndex dump: #{@path}"
454
466
  stream.puts
455
467
  if db.count == 0
@@ -474,6 +486,11 @@ module Bio
474
486
  offset, len, text_size, n_seq, species_vec = pair[1].unpack(VAL_FMT)
475
487
  stream.puts "#{chr} [bin #{bin}] #{s_start}:#{s_end}"
476
488
  stream.puts " offset #{offset}, length #{len}"
489
+ if bgzf
490
+ block = Bio::BGZF.vo_block_offset(offset)
491
+ data = Bio::BGZF.vo_data_offset(offset)
492
+ stream.puts " BGZF block offset #{block}, data offset #{data}"
493
+ end
477
494
  stream.puts " text size: #{text_size}"
478
495
  stream.puts " sequences in block: #{n_seq}"
479
496
  stream.printf(" species vector: %016x\n", species_vec)
@@ -660,6 +677,11 @@ module Bio
660
677
  end
661
678
 
662
679
  def build(parser, ref_only=true)
680
+ db[FILE_KEY] = File.basename(parser.file_spec)
681
+ @maf_file = db[FILE_KEY]
682
+ if parser.compression
683
+ db[COMPRESSION_KEY] = parser.compression.to_s
684
+ end
663
685
  first_block = parser.parse_block
664
686
  self.ref_seq = first_block.sequences.first.source
665
687
  @ref_only = ref_only
@@ -103,8 +103,9 @@ module Bio
103
103
 
104
104
  GAP = /-+/
105
105
 
106
- # Remove gaps present in all sequences. These would generally
106
+ # Find gaps present in all sequences. These would generally
107
107
  # occur when some sequences have been filtered out.
108
+ #
108
109
  # @see #remove_gaps!
109
110
  # @see Parser#sequence_filter
110
111
  def find_gaps
@@ -126,6 +127,7 @@ module Bio
126
127
 
127
128
  # Remove gaps present in all sequences. These would generally
128
129
  # occur when some sequences have been filtered out.
130
+ #
129
131
  # @see #find_gaps
130
132
  # @see Parser#sequence_filter
131
133
  def remove_gaps!
@@ -1,5 +1,6 @@
1
1
  require 'strscan'
2
2
  require 'java' if RUBY_PLATFORM == 'java'
3
+ require 'bio-bgzf'
3
4
 
4
5
  # @api public
5
6
  module Bio
@@ -56,8 +57,7 @@ module Bio
56
57
 
57
58
  # Reads a chunk of the file.
58
59
  #
59
- # Currently always reads size_hint bytes but this may change
60
- # with BGZF support.
60
+ # Currently always reads size_hint bytes.
61
61
  #
62
62
  # @param [Integer] offset file offset to read from.
63
63
  # @param [Integer] size_hint desired size of chunk.
@@ -70,58 +70,97 @@ module Bio
70
70
  end
71
71
  end
72
72
 
73
- # Variant ChunkReader using a read-ahead thread with internal
74
- # queue for sequential parsing. Not useful for random-access
75
- # parsing.
76
- #
77
- # Only beneficial on JRuby.
78
- class ThreadedChunkReader < ChunkReader
73
+ class BGZFChunkReader
74
+ attr_reader :f, :r
79
75
 
80
- def initialize(f, chunk_size, buffer_size=64)
81
- super(f, chunk_size)
82
- @buffer = SizedQueue.new(buffer_size)
83
- @eof_reached = false
84
- start_read_ahead
76
+ def initialize(f, _chunk_size)
77
+ @f = f
78
+ @r = Bio::BGZF::Reader.new(f)
85
79
  end
86
80
 
81
+ def pos
82
+ r.tell
83
+ end
84
+
85
+ def read_chunk
86
+ r.read_block
87
+ end
88
+
89
+ def read_chunk_at(vo, _size)
90
+ r.read_block_at(vo)
91
+ end
92
+ end
93
+
94
+ class ThreadedChunkReaderWrapper
95
+
96
+ attr_reader :cr, :pos
97
+
98
+ def initialize(cr, buffer_size=64)
99
+ @cr = cr
100
+ @buffer = java.util.concurrent.LinkedBlockingQueue.new(buffer_size)
101
+ @eof_reached = false
102
+ @first_seq_read = false
103
+ end
104
+
87
105
  # Spawn a read-ahead thread. Called from {#initialize}.
88
106
  def start_read_ahead
89
107
  @read_thread = Thread.new { read_ahead }
90
108
  end
91
109
 
110
+ def f
111
+ cr.f
112
+ end
113
+
92
114
  # Read ahead into queue.
93
115
  def read_ahead
94
116
  # n = 0
95
117
  begin
96
- f_pos = 0
97
118
  until f.eof?
98
- chunk = f.read(@chunk_size)
99
- @buffer << [f_pos, chunk]
100
- f_pos += chunk.bytesize
101
- # n += 1
102
- # if (n % 100) == 0
103
- # $stderr.puts "buffer size: #{@buffer.size}"
104
- # end
119
+ chunk = cr.read_chunk
120
+ c_pos = cr.pos
121
+ @buffer.put([c_pos, chunk])
105
122
  end
106
- @eof_reached = true
123
+ @buffer.put(:eof)
124
+ # @eof_reached = true
107
125
  rescue Exception
108
126
  @read_ahead_ex = $!
109
- $stderr.puts "read_ahead aborting: #{$!}"
127
+ LOG.error $!
128
+ @buffer.put($!)
110
129
  end
111
130
  end
112
131
 
113
- # (see ChunkReader#read_chunk)
114
132
  def read_chunk
115
- raise "readahead failed: #{@read_ahead_ex}" if @read_ahead_ex
116
- if @eof_reached && @buffer.empty?
133
+ if ! @first_seq_read
134
+ # this is the first read_chunk call to read the header
135
+ # not necessarily indicative of sequential access
136
+ @first_seq_read = true
137
+ chunk = cr.read_chunk
138
+ @pos = cr.pos
139
+ return chunk
140
+ elsif @read_ahead_ex
141
+ raise @read_ahead_ex
142
+ elsif @eof_reached
117
143
  return nil
118
144
  else
119
- c_pos, chunk = @buffer.shift()
120
- @pos = c_pos
121
- return chunk
145
+ start_read_ahead if @read_thread.nil?
146
+ e = @buffer.take
147
+ case
148
+ when e == :eof
149
+ @eof_reached = nil
150
+ return nil
151
+ when e.is_a?(Exception)
152
+ raise e
153
+ else
154
+ c_pos, chunk = e
155
+ @pos = c_pos
156
+ return chunk
157
+ end
122
158
  end
123
159
  end
124
160
 
161
+ def read_chunk_at(*args)
162
+ cr.read_chunk_at(*args)
163
+ end
125
164
  end
126
165
 
127
166
  # MAF parsing code useful for sequential and random-access parsing.
@@ -385,8 +424,7 @@ module Bio
385
424
  @f = fd
386
425
  @parser = parser
387
426
  @opts = parser.opts
388
- reader = opts[:chunk_reader] || ChunkReader
389
- @cr = reader.new(@f, chunk_size)
427
+ @cr = parser.base_reader.new(@f, chunk_size)
390
428
  @last_block_pos = -1
391
429
  end
392
430
 
@@ -413,6 +451,7 @@ module Bio
413
451
  # @return [Array<Block>]
414
452
  def fetch_blocks(offset, len, block_offsets)
415
453
  if block_given?
454
+ LOG.debug { "fetching blocks from #{offset} (length #{len}): #{block_offsets.inspect}" }
416
455
  start_chunk_read_if_needed(offset, len)
417
456
  # read chunks until we have the entire merged set of
418
457
  # blocks ready to parse
@@ -420,6 +459,13 @@ module Bio
420
459
  append_chunks_to(len)
421
460
  # parse the blocks
422
461
  block_offsets.each do |expected_offset|
462
+ # skip ahead, in case there is a gap resulting from a
463
+ # block that is not being parsed
464
+ rel_offset = expected_offset - offset
465
+ if s.pos < rel_offset
466
+ s.pos = rel_offset
467
+ end
468
+ # now actually parse the block data
423
469
  block = _parse_block
424
470
  parse_error("expected a block at offset #{expected_offset} but could not parse one!") unless block
425
471
  parse_error("got block with offset #{block.offset}, expected #{expected_offset}!") unless block.offset == expected_offset
@@ -444,7 +490,6 @@ module Bio
444
490
  end
445
491
 
446
492
  def append_chunks_to(len)
447
- # XXX: need to rethink this for BGZF; prefetching ChunkReader
448
493
  while s.string.size < len
449
494
  s.string << cr.read_chunk()
450
495
  end
@@ -463,8 +508,6 @@ module Bio
463
508
  # * `:random_chunk_size`: as above, but for random access ({#fetch_blocks})
464
509
  # * `:merge_max`: merge up to this many bytes of blocks for
465
510
  # random access
466
- # * `:chunk_reader`: use the specified class to read
467
- # chunks. (Only useful with {ThreadedChunkReader}).
468
511
  # * `:threads`: number of threads to use for parallel
469
512
  # parsing. Only useful under JRuby.
470
513
  # @api public
@@ -482,6 +525,9 @@ module Bio
482
525
  attr_reader :s
483
526
  # @return [ChunkReader] ChunkReader.
484
527
  attr_reader :cr
528
+ # @return [Class] ChunkReader class to use for random access
529
+ # @see ParseContext
530
+ attr_reader :base_reader
485
531
  # @return [Boolean] whether EOF has been reached.
486
532
  attr_reader :at_end
487
533
  # @return [Hash] parser options.
@@ -490,6 +536,8 @@ module Bio
490
536
  attr_reader :chunk_start
491
537
  # @return [Integer] offset of the last block start in this chunk.
492
538
  attr_reader :last_block_pos
539
+ # @return [Symbol] compression method used for this file, or nil
540
+ attr_reader :compression
493
541
 
494
542
  # @api private
495
543
  attr_accessor :parse_extended
@@ -515,10 +563,29 @@ module Bio
515
563
  @parse_extended = opts[:parse_extended] || false
516
564
  @parse_empty = opts[:parse_empty] || false
517
565
  @chunk_start = 0
518
- @file_spec = file_spec
519
- @f = File.open(file_spec)
520
- reader = opts[:chunk_reader] || ChunkReader
521
- @cr = reader.new(@f, chunk_size)
566
+ if file_spec.respond_to? :flush
567
+ # guess what, Pathnames respond to :read...
568
+ @f = file_spec
569
+ @file_spec = @f.path if @f.respond_to?(:path)
570
+ # TODO: gzip?
571
+ else
572
+ @file_spec = file_spec
573
+ if file_spec.to_s.end_with?(".maf.gz")
574
+ @f = IO.popen("gzip -dc #{file_spec}")
575
+ else
576
+ @f = File.open(file_spec)
577
+ end
578
+ end
579
+ if @file_spec.to_s =~ /\.bgzf?$/
580
+ @base_reader = BGZFChunkReader
581
+ @compression = :bgzf
582
+ else
583
+ @base_reader = ChunkReader
584
+ end
585
+ @cr = base_reader.new(@f, chunk_size)
586
+ if RUBY_PLATFORM == 'java'
587
+ @cr = ThreadedChunkReaderWrapper.new(@cr)
588
+ end
522
589
  @s = StringScanner.new(cr.read_chunk())
523
590
  set_last_block_pos!
524
591
  @at_end = false
@@ -536,7 +603,11 @@ module Bio
536
603
  # @api private
537
604
  def context(chunk_size)
538
605
  # IO#dup calls dup(2) internally, but seems broken on JRuby...
539
- fd = File.open(file_spec)
606
+ if file_spec
607
+ fd = File.open(file_spec)
608
+ else
609
+ fd = f.dup
610
+ end
540
611
  ParseContext.new(fd, chunk_size, self)
541
612
  end
542
613
 
@@ -679,6 +750,15 @@ module Bio
679
750
  #
680
751
  # Returns `[offset, size, [offset1, offset2, ...]]` tuples.
681
752
  def merge_fetch_list(orig_fl)
753
+ case compression
754
+ when nil
755
+ _merge_fetch_list(orig_fl)
756
+ when :bgzf
757
+ _merge_bgzf_fetch_list(orig_fl)
758
+ end
759
+ end
760
+
761
+ def _merge_fetch_list(orig_fl)
682
762
  fl = orig_fl.dup
683
763
  r = []
684
764
  until fl.empty? do
@@ -698,6 +778,22 @@ module Bio
698
778
  return r
699
779
  end
700
780
 
781
+ # Build a merged fetch list in a BGZF-aware way. This will
782
+ # group together all MAF blocks from a single BGZF block. These
783
+ # MAF blocks may not be consecutive.
784
+ def _merge_bgzf_fetch_list(orig_fl)
785
+ block_e = orig_fl.chunk { |entry|
786
+ Bio::BGZF::vo_block_offset(entry[0])
787
+ }
788
+ block_e.collect do |bgzf_block, fl|
789
+ # text size to read from disk, from the start of the first
790
+ # block to the end of the last block
791
+ text_size = fl.last[0] + fl.last[1] - fl.first[0]
792
+ offsets = fl.collect { |e| e[0] }
793
+ [fl.first[0], text_size, offsets]
794
+ end
795
+ end
796
+
701
797
  # Parse the header of the MAF file.
702
798
  def _parse_header
703
799
  parse_error("not a MAF file") unless s.scan(/##maf\s*/)