bio-maf 0.3.2 → 1.0.0
- data/Gemfile +3 -0
- data/README.md +34 -3
- data/bin/maf_bgzip +56 -0
- data/bin/maf_index +0 -6
- data/bio-maf.gemspec +3 -2
- data/features/bgzf.feature +62 -0
- data/features/maf-indexing.feature +10 -0
- data/features/maf-querying.feature +9 -0
- data/features/step_definitions/convert_steps.rb +1 -1
- data/features/support/env.rb +2 -0
- data/lib/bio/maf/index.rb +35 -13
- data/lib/bio/maf/maf.rb +3 -1
- data/lib/bio/maf/parser.rb +135 -39
- data/lib/bio/maf/writer.rb +4 -4
- data/man/maf_bgzip.1 +101 -0
- data/man/maf_bgzip.1.ronn +85 -0
- data/man/maf_extract.1 +5 -2
- data/man/maf_extract.1.ronn +4 -1
- data/man/maf_index.1 +3 -3
- data/man/maf_index.1.ronn +3 -2
- data/man/maf_tile.1 +5 -2
- data/man/maf_tile.1.ronn +4 -1
- data/man/maf_to_fasta.1 +1 -1
- data/spec/bio/maf/index_spec.rb +7 -0
- data/spec/bio/maf/parser_spec.rb +13 -0
- data/spec/spec_helper.rb +2 -0
- data/test/data/gap-1.kct +0 -0
- data/test/data/mm8.chrM.maf +2421 -0
- data/test/data/mm8.chrM.maf.bgz +0 -0
- data/test/data/mm8_chr7_tiny.kct +0 -0
- data/test/data/mm8_chr7_tiny.maf.gz +0 -0
- data/test/data/mm8_chrM_tiny.kct +0 -0
- metadata +31 -3
data/Gemfile
CHANGED
data/README.md
CHANGED
````diff
@@ -8,9 +8,9 @@ support for the
 (MAF), used in bioinformatics to store whole-genome sets of multiple
 sequence alignments.
 
-…
-…
-… MAF files.
+This library provides indexed and sequential access to MAF data, as
+well as performing various manipulations on it and writing modified
+MAF files.
 
 For more information, see the
 [project wiki](https://github.com/csw/bioruby-maf/wiki).
@@ -94,6 +94,20 @@ parser = Bio::MAF::Parser.new("test/data/mm8_chr7_tiny.maf")
 idx = Bio::MAF::KyotoIndex.build(parser, "/tmp/mm8_chr7_tiny.kct", false)
 ```
 
+### Compress and index a MAF file
+
+This library fully supports [BGZF][]-compressed MAF files, which
+combine gzip compression with blocking for efficient random
+access. These can be generated with blocking optimized for MAF access
+using the included
+[`maf_bgzip(1)`](http://csw.github.com/bioruby-maf/man/maf_bgzip.1.html)
+tool. This writes BGZF-compressed MAF files and optionally indexes
+them as well:
+
+    $ maf_bgzip --dir /tmp --index --all test/data/mm8.chrM.maf
+
+This is the easiest way to prepare MAF files for use with this library.
+
 ### Extract blocks from an indexed MAF file, by genomic interval
 
 Refer to [`mm8_chr7_tiny.maf`](https://github.com/csw/bioruby-maf/blob/master/test/data/mm8_chr7_tiny.maf).
@@ -352,10 +366,27 @@ access.tile(interval) do |tiler|
 end
 ```
 
+### Compression
+
+MAF files can optionally be compressed in the [BGZF][] format defined
+in the [SAM specification][]. This is best done with
+[`maf_bgzip(1)`](http://csw.github.com/bioruby-maf/man/maf_bgzip.1.html),
+but files compressed with the `bgzip(1)` tool from samtools will also
+work, though less efficiently.
+
+[BGZF]: http://blastedbio.blogspot.com/2011/11/bgzf-blocked-bigger-better-gzip.html
+[SAM specification]: http://samtools.sourceforge.net/SAM1.pdf
+
+MAF files compressed with plain gzip will be decompressed on the fly,
+but random access to these files will not be possible. However,
+gzipped MAF files are suitable as input to
+[`maf_bgzip(1)`](http://csw.github.com/bioruby-maf/man/maf_bgzip.1.html).
+
 ### Command line tools
 
 Man pages for command line tools:
 
+* [`maf_bgzip(1)`](http://csw.github.com/bioruby-maf/man/maf_bgzip.1.html)
 * [`maf_index(1)`](http://csw.github.com/bioruby-maf/man/maf_index.1.html)
 * [`maf_extract(1)`](http://csw.github.com/bioruby-maf/man/maf_extract.1.html)
 * [`maf_to_fasta(1)`](http://csw.github.com/bioruby-maf/man/maf_to_fasta.1.html)
````
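The new README sections above cover the full BGZF workflow. As a sketch of the same flow from Ruby (output paths assume the `maf_bgzip --dir /tmp --index` invocation shown above; method names are taken from this diff and the README, so treat it as illustrative rather than canonical):

```ruby
require 'bio-maf'

# Sequential access: the parser picks the BGZF reader based on the
# .maf.bgz extension (see the data/lib/bio/maf/parser.rb changes below).
parser = Bio::MAF::Parser.new("/tmp/mm8.chrM.maf.bgz")
parser.each_block do |block|
  # each block is one parsed MAF alignment block
  puts block.sequences.first.source
end

# Random access: reopen the index that maf_bgzip --index built.
# Its reference sequence and metadata are loaded from the index itself.
idx = Bio::MAF::KyotoIndex.open("/tmp/mm8.chrM.kct")
```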
data/bin/maf_bgzip
ADDED
@@ -0,0 +1,56 @@
```ruby
#!/usr/bin/env ruby

require 'optparse' # OptionParser is used below; this require is missing from the shipped script
require 'ostruct'

require 'bio-maf'
require 'bio-bgzf'

$options = OpenStruct.new
$options.dir = '.'
$options.ref_only = true

op = OptionParser.new do |opts|
  opts.banner = "Usage: maf_bgzip [options] [<maf> ...]"
  opts.separator ""
  opts.separator "Options:"
  opts.on("-d", "--dir DIR",
          "Directory to write compressed MAF to",
          "(default is current directory)") do |dir|
    $options.dir = dir
  end
  opts.on("-i", "--index", "Index MAF files after writing") do
    $options.index = true
  end
  opts.on("-a", "--all",
          "Index all sequences, not just reference seq",
          "(has no effect without --index)") do
    $options.ref_only = false
  end
end

op.parse!(ARGV)

until ARGV.empty?
  maf_path = ARGV.shift
  maf_base = File.basename(maf_path)
  base = maf_base.gsub(/\.maf.*/, '')
  bgz_path = "#{$options.dir}/#{base}.maf.bgz"
  p = Bio::MAF::Parser.new(maf_path,
                           :parse_extended => true,
                           :parse_empty => true)
  File.open(bgz_path, 'w') do |out_f|
    Bio::BGZF::Writer.new(out_f) do |bgz_w|
      maf_w = Bio::MAF::Writer.new(bgz_w)
      maf_w.write_header(p.header)
      p.each_block do |block|
        maf_w.write_block(block)
      end
    end
  end
  p.close
  if $options.index
    p2 = Bio::MAF::Parser.new(bgz_path)
    idx_path = "#{$options.dir}/#{base}.kct"
    Bio::MAF::KyotoIndex.build(p2, idx_path, $options.ref_only)
  end
end
```
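The script leans on `Bio::BGZF::Writer`'s block form to frame the compressed output. To spot-check what it wrote, the companion `Bio::BGZF::Reader` used elsewhere in this release can walk the file block by block; a minimal sketch, assuming only the three reader methods that appear in this diff (`tell`, `read_block`, `read_block_at`):

```ruby
require 'bio-bgzf'

File.open("mm8_chr7_tiny.maf.bgz") do |f|
  r = Bio::BGZF::Reader.new(f)
  vo = r.tell          # virtual offset of the next block
  data = r.read_block  # decompressed payload of one BGZF block
  puts "block at virtual offset #{vo}: #{data.bytesize} bytes"
end
```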
data/bin/maf_index
CHANGED
```diff
@@ -14,15 +14,9 @@ PRINTERS = {
 $options = OpenStruct.new
 $options.mode = :build
 $options.ref_only = true
-$options.reader = if RUBY_PLATFORM == 'java'
-                    Bio::MAF::ThreadedChunkReader
-                  else
-                    Bio::MAF::ChunkReader
-                  end
 
 def build_index(maf, index)
   parser = Bio::MAF::Parser.new(maf,
-                                :chunk_reader => $options.reader,
                                 :parse_extended => false)
   idx = Bio::MAF::KyotoIndex.build(parser, index, $options.ref_only)
   idx.close
```
data/bio-maf.gemspec
CHANGED
```diff
@@ -2,11 +2,11 @@
 
 Gem::Specification.new do |s|
   s.name = "bio-maf"
-  s.version = "0.3.2"
+  s.version = "1.0.0"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Clayton Wheeler"]
-  s.date = "2012-…"
+  s.date = "2012-08-02"
   s.description = "Multiple Alignment Format parser for BioRuby."
   s.email = "cswh@umich.edu"
   s.extra_rdoc_files = [
@@ -32,6 +32,7 @@ Gem::Specification.new do |s|
   end
 
   s.add_runtime_dependency('bio-alignment', ["~> 0.0.7"])
+  s.add_runtime_dependency('bio-bgzf', ["~> 0.2.0"])
   s.add_runtime_dependency('bio-genomic-interval', ["~> 0.1.2"])
   s.add_runtime_dependency('bio-logger', ["~> 1.0.1"])
   if RUBY_PLATFORM == 'java'
```
data/features/bgzf.feature
ADDED
@@ -0,0 +1,62 @@
```gherkin
Feature: BGZF compression
  Because MAF files are large
  We need random access
  But we would also like to compress them
  Yet common compression formats don't facilitate random access
  So we use BGZF compression to support random access
  To 64 KB chunks

  @no_jruby
  Scenario: Compress a MAF file
    Given test files:
      | mm8_chr7_tiny.maf |
    When I run `maf_bgzip mm8_chr7_tiny.maf`
    Then it should pass with:
      """
      """
    And a file named "mm8_chr7_tiny.maf.bgz" should exist

  @no_jruby
  Scenario: Compress and index a MAF file
    Given test files:
      | mm8_chr7_tiny.maf |
    When I run `maf_bgzip -i mm8_chr7_tiny.maf`
    Then it should pass with:
      """
      """
    And a file named "mm8_chr7_tiny.maf.bgz" should exist
    And a file named "mm8_chr7_tiny.kct" should exist

  @no_jruby
  Scenario: Compress a gzipped MAF file
    Given test files:
      | mm8_chr7_tiny.maf.gz |
    When I run `maf_bgzip mm8_chr7_tiny.maf.gz`
    Then it should pass with:
      """
      """
    And a file named "mm8_chr7_tiny.maf.bgz" should exist

  @no_jruby
  Scenario: Compress and index a gzipped MAF file
    Given test files:
      | mm8_chr7_tiny.maf.gz |
    When I run `maf_bgzip -i mm8_chr7_tiny.maf.gz`
    Then it should pass with:
      """
      """
    And a file named "mm8_chr7_tiny.maf.bgz" should exist
    And a file named "mm8_chr7_tiny.kct" should exist

  @no_jruby
  Scenario: Compress multiple MAF files
    Given test files:
      | mm8_chr7_tiny.maf |
      | mm8.chrM.maf |
    When I run `maf_bgzip mm8_chr7_tiny.maf mm8.chrM.maf`
    Then it should pass with:
      """
      """
    And a file named "mm8_chr7_tiny.maf.bgz" should exist
    And a file named "mm8.chrM.maf.bgz" should exist
```
data/features/maf-indexing.feature
CHANGED
```diff
@@ -49,6 +49,16 @@ Feature: Indexed access to MAF files
       """
     And a file named "mm8_chr7_tiny.kct" should exist
 
+  @no_jruby
+  Scenario: Build MAF index on BGZF file with CLI tool
+    Given test files:
+      | mm8.chrM.maf.bgz |
+    When I run `maf_index mm8.chrM.maf.bgz mm8.chrM.kct`
+    Then it should pass with:
+      """
+      """
+    And a file named "mm8.chrM.kct" should exist
+
   @no_jruby
   Scenario: Build MAF index on all sequences with CLI tool
     Given test files:
```
data/features/maf-querying.feature
CHANGED
```diff
@@ -73,3 +73,12 @@ Feature: Filter results from MAF files
     And search for blocks between positions 0 and 80100000 of mm8.chr7
     Then 3 blocks are obtained
 
+  @no_jruby
+  Scenario: Parse blocks from a BGZF-compressed file
+    Given test files:
+      | mm8.chrM.maf |
+      | mm8.chrM.maf.bgz |
+    When I run `maf_extract -m mm8.chrM.maf --interval mm8.chrM:6938-13030 -o m1.maf`
+    And I run `maf_extract -m mm8.chrM.maf.bgz --interval mm8.chrM:6938-13030 -o m2.maf`
+    And I run `diff m1.maf m2.maf`
+    Then the exit status should be 0
```
data/features/support/env.rb
CHANGED
data/lib/bio/maf/index.rb
CHANGED
```diff
@@ -1,5 +1,6 @@
 require 'kyotocabinet'
 require 'jruby/profiler' if RUBY_PLATFORM == 'java'
+require 'bio-bgzf'
 
 #require 'bio-ucsc-api'
 require 'bio-genomic-interval'
@@ -189,18 +190,20 @@ module Bio
       @indices = {}
       @maf_by_chrom = {}
       if options[:dir]
-        …
-        @maf_files = Dir.glob("#{@dir}/*.maf")
+        scan_dir(options[:dir])
       elsif options[:maf]
-        @maf_files = [options[:maf]]
         if options[:index]
           register_index(KyotoIndex.open(options[:index]),
                          options[:maf])
+        else
+          idx = find_index_file(options[:maf])
+          if idx
+            register_index(KyotoIndex.open(idx), options[:maf])
+          end
         end
       else
         raise "Must specify :dir or :maf!"
       end
-      scan_indices!
       if options[:maf] && @indices.empty?
         # MAF file explicitly given but no index
         # build a temporary one
@@ -215,23 +218,27 @@ module Bio
 
     # @api private
     def find_index_file(maf)
-      …
-      …
-      …
+      dir = File.dirname(maf)
+      base = File.basename(maf)
+      noext = base.gsub(/\.maf.*/, '')
+      idx = [base, noext].collect { |n| "#{dir}/#{n}.kct" }.find { |path| File.exist? path }
     end
 
     # @api private
     def register_index(index, maf)
+      unless index.maf_file == File.basename(maf)
+        raise "Index #{index.path} was created for #{index.maf_file}, not #{File.basename(maf)}!"
+      end
       @indices[index.ref_seq] = index
       @maf_by_chrom[index.ref_seq] = maf
     end
 
     # @api private
-    def …
-      …
-      …
-      …
-      …
+    def scan_dir(dir)
+      Dir.glob("#{dir}/*.kct").each do |index_f|
+        index = KyotoIndex.open(index_f)
+        maf = "#{dir}/#{index.maf_file}"
+        if File.exist? maf
           register_index(index, maf)
         end
       end
```
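The new `find_index_file` encodes a simple naming convention: for `foo.maf.bgz` it tries `foo.maf.bgz.kct` first, then `foo.kct`. A standalone sketch of that lookup (paths hypothetical, logic copied from the hunk above):

```ruby
def candidate_indexes(maf)
  dir = File.dirname(maf)
  base = File.basename(maf)         # e.g. "mm8.chrM.maf.bgz"
  noext = base.gsub(/\.maf.*/, '')  # e.g. "mm8.chrM"
  # Checked in order; the first existing path wins.
  [base, noext].collect { |n| "#{dir}/#{n}.kct" }
end

candidate_indexes("/data/mm8.chrM.maf.bgz")
# => ["/data/mm8.chrM.maf.bgz.kct", "/data/mm8.chrM.kct"]
```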
```diff
@@ -262,9 +269,12 @@ module Bio
     class KyotoIndex
       include KVHelpers
 
-      attr_reader :db, :species, :species_max_id, :ref_only
+      attr_reader :db, :species, :species_max_id, :ref_only, :path
+      attr_reader :maf_file
       attr_accessor :index_sequences, :ref_seq
 
+      COMPRESSION_KEY = 'bio-maf:compression'
+      FILE_KEY = 'bio-maf:file'
       FORMAT_VERSION_KEY = 'bio-maf:index-format-version'
       FORMAT_VERSION = 2
       REF_SEQ_KEY = 'bio-maf:reference-sequence'
@@ -438,6 +448,7 @@ module Bio
           raise "Could not open DB file!"
         end
         if mode == KyotoCabinet::DB::OREADER
+          @maf_file = db[FILE_KEY]
           self.ref_seq = db[REF_SEQ_KEY]
           load_index_sequences
           load_species
@@ -450,6 +461,7 @@ module Bio
       end
 
       def dump(stream=$stdout)
+        bgzf = (db[COMPRESSION_KEY] == 'bgzf')
         stream.puts "KyotoIndex dump: #{@path}"
         stream.puts
         if db.count == 0
@@ -474,6 +486,11 @@ module Bio
           offset, len, text_size, n_seq, species_vec = pair[1].unpack(VAL_FMT)
           stream.puts "#{chr} [bin #{bin}] #{s_start}:#{s_end}"
           stream.puts "  offset #{offset}, length #{len}"
+          if bgzf
+            block = Bio::BGZF.vo_block_offset(offset)
+            data = Bio::BGZF.vo_data_offset(offset)
+            stream.puts "  BGZF block offset #{block}, data offset #{data}"
+          end
           stream.puts "  text size: #{text_size}"
           stream.puts "  sequences in block: #{n_seq}"
           stream.printf("  species vector: %016x\n", species_vec)
@@ -660,6 +677,11 @@ module Bio
       end
 
       def build(parser, ref_only=true)
+        db[FILE_KEY] = File.basename(parser.file_spec)
+        @maf_file = db[FILE_KEY]
+        if parser.compression
+          db[COMPRESSION_KEY] = parser.compression.to_s
+        end
         first_block = parser.parse_block
         self.ref_seq = first_block.sequences.first.source
         @ref_only = ref_only
```
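With `build` now recording the source file name and compression method under the `bio-maf:file` and `bio-maf:compression` keys, an index can be inspected without going through the library; a hedged sketch using the kyotocabinet gem's own API (the same calls `KyotoIndex` makes above):

```ruby
require 'kyotocabinet'

db = KyotoCabinet::DB.new
db.open("mm8.chrM.kct", KyotoCabinet::DB::OREADER)
puts db['bio-maf:file']         # e.g. "mm8.chrM.maf.bgz"
puts db['bio-maf:compression']  # "bgzf", or nil for uncompressed MAF
db.close
```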
data/lib/bio/maf/maf.rb
CHANGED
```diff
@@ -103,8 +103,9 @@ module Bio
 
       GAP = /-+/
 
-      # …
+      # Find gaps present in all sequences. These would generally
       # occur when some sequences have been filtered out.
+      #
       # @see #remove_gaps!
       # @see Parser#sequence_filter
       def find_gaps
@@ -126,6 +127,7 @@ module Bio
 
       # Remove gaps present in all sequences. These would generally
       # occur when some sequences have been filtered out.
+      #
       # @see #find_gaps
       # @see Parser#sequence_filter
       def remove_gaps!
```
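The clarified comments explain when all-gap columns arise: a sequence filter can remove the only sequences occupying some columns. A usage sketch tying the methods to the filter (`remove_gaps!` is from this diff; the `:only_species` filter syntax follows the project README and should be checked against it):

```ruby
require 'bio-maf'

parser = Bio::MAF::Parser.new("test/data/mm8_chr7_tiny.maf")
parser.sequence_filter = { :only_species => %w(mm8 rn4) }
parser.each_block do |block|
  # drop columns that are now gaps in every remaining sequence
  block.remove_gaps!
end
```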
data/lib/bio/maf/parser.rb
CHANGED
```diff
@@ -1,5 +1,6 @@
 require 'strscan'
 require 'java' if RUBY_PLATFORM == 'java'
+require 'bio-bgzf'
 
 # @api public
 module Bio
@@ -56,8 +57,7 @@ module Bio
 
       # Reads a chunk of the file.
       #
-      # Currently always reads size_hint bytes
-      # with BGZF support.
+      # Currently always reads size_hint bytes.
       #
       # @param [Integer] offset file offset to read from.
       # @param [Integer] size_hint desired size of chunk.
```
```diff
@@ -70,58 +70,97 @@ module Bio
       end
     end
 
-    # …
-    # …
-    # parsing.
-    #
-    # Only beneficial on JRuby.
-    class ThreadedChunkReader < ChunkReader
+    class BGZFChunkReader
+      attr_reader :f, :r
 
-      def initialize(f, …)
-        …
-        @…
-        @eof_reached = false
-        start_read_ahead
+      def initialize(f, _chunk_size)
+        @f = f
+        @r = Bio::BGZF::Reader.new(f)
       end
 
+      def pos
+        r.tell
+      end
+
+      def read_chunk
+        r.read_block
+      end
+
+      def read_chunk_at(vo, _size)
+        r.read_block_at(vo)
+      end
+    end
+
+    class ThreadedChunkReaderWrapper
+
+      attr_reader :cr, :pos
+
+      def initialize(cr, buffer_size=64)
+        @cr = cr
+        @buffer = java.util.concurrent.LinkedBlockingQueue.new(buffer_size)
+        @eof_reached = false
+        @first_seq_read = false
+      end
+
       # Spawn a read-ahead thread. Called from {#initialize}.
       def start_read_ahead
         @read_thread = Thread.new { read_ahead }
       end
 
+      def f
+        cr.f
+      end
+
       # Read ahead into queue.
       def read_ahead
         # n = 0
         begin
-          f_pos = 0
           until f.eof?
-            chunk = …
-            …
-            …
-            # n += 1
-            # if (n % 100) == 0
-            #   $stderr.puts "buffer size: #{@buffer.size}"
-            # end
+            chunk = cr.read_chunk
+            c_pos = cr.pos
+            @buffer.put([c_pos, chunk])
           end
-          @…
+          @buffer.put(:eof)
+          # @eof_reached = true
         rescue Exception
           @read_ahead_ex = $!
-          …
+          LOG.error $!
+          @buffer.put($!)
         end
       end
 
-      # (see ChunkReader#read_chunk)
       def read_chunk
-        …
-        …
+        if ! @first_seq_read
+          # this is the first read_chunk call to read the header
+          # not necessarily indicative of sequential access
+          @first_seq_read = true
+          chunk = cr.read_chunk
+          @pos = cr.pos
+          return chunk
+        elsif @read_ahead_ex
+          raise @read_ahead_ex
+        elsif @eof_reached
           return nil
         else
-          …
-          …
-          …
+          start_read_ahead if @read_thread.nil?
+          e = @buffer.take
+          case
+          when e == :eof
+            @eof_reached = nil
+            return nil
+          when e.is_a?(Exception)
+            raise e
+          else
+            c_pos, chunk = e
+            @pos = c_pos
+            return chunk
+          end
         end
       end
 
+      def read_chunk_at(*args)
+        cr.read_chunk_at(*args)
+      end
     end
 
     # MAF parsing code useful for sequential and random-access parsing.
```
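`BGZFChunkReader#read_chunk_at` takes a BGZF *virtual offset*, not a raw file offset. Per the BGZF definition in the SAM spec, a virtual offset packs the compressed block's file offset into the high 48 bits and the position within the decompressed block into the low 16; a self-contained sketch (the real code uses bio-bgzf's `vo_block_offset`/`vo_data_offset` helpers instead):

```ruby
# Pack a (block offset, in-block offset) pair into one 64-bit value.
def make_vo(block_offset, data_offset)
  (block_offset << 16) | data_offset
end

# Recover the two components of a virtual offset.
def split_vo(vo)
  [vo >> 16, vo & 0xFFFF]
end

vo = make_vo(6938, 130)
split_vo(vo)  # => [6938, 130]
```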
```diff
@@ -385,8 +424,7 @@ module Bio
         @f = fd
         @parser = parser
         @opts = parser.opts
-        …
-        @cr = reader.new(@f, chunk_size)
+        @cr = parser.base_reader.new(@f, chunk_size)
         @last_block_pos = -1
       end
 
@@ -413,6 +451,7 @@ module Bio
       # @return [Array<Block>]
       def fetch_blocks(offset, len, block_offsets)
         if block_given?
+          LOG.debug { "fetching blocks from #{offset} (length #{len}): #{block_offsets.inspect}" }
           start_chunk_read_if_needed(offset, len)
           # read chunks until we have the entire merged set of
           # blocks ready to parse
@@ -420,6 +459,13 @@ module Bio
           append_chunks_to(len)
           # parse the blocks
           block_offsets.each do |expected_offset|
+            # skip ahead, in case there is a gap resulting from a
+            # block that is not being parsed
+            rel_offset = expected_offset - offset
+            if s.pos < rel_offset
+              s.pos = rel_offset
+            end
+            # now actually parse the block data
             block = _parse_block
             parse_error("expected a block at offset #{expected_offset} but could not parse one!") unless block
             parse_error("got block with offset #{block.offset}, expected #{expected_offset}!") unless block.offset == expected_offset
```
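The skip-ahead added to `fetch_blocks` is plain offset arithmetic within the merged read; a worked example with made-up numbers:

```ruby
offset          = 1000  # file offset where the merged read began
expected_offset = 1450  # next block we actually want to parse
rel_offset = expected_offset - offset   # => 450, position within the buffer

s_pos = 300             # scanner stopped before a 150-byte unwanted block
s_pos = rel_offset if s_pos < rel_offset
s_pos                   # => 450; parsing resumes at the wanted block
```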
```diff
@@ -444,7 +490,6 @@ module Bio
       end
 
       def append_chunks_to(len)
-        # XXX: need to rethink this for BGZF; prefetching ChunkReader
         while s.string.size < len
           s.string << cr.read_chunk()
         end
@@ -463,8 +508,6 @@ module Bio
     # * `:random_chunk_size`: as above, but for random access ({#fetch_blocks})
     # * `:merge_max`: merge up to this many bytes of blocks for
     #   random access
-    # * `:chunk_reader`: use the specified class to read
-    #   chunks. (Only useful with {ThreadedChunkReader}).
     # * `:threads`: number of threads to use for parallel
     #   parsing. Only useful under JRuby.
     # @api public
@@ -482,6 +525,9 @@ module Bio
       attr_reader :s
       # @return [ChunkReader] ChunkReader.
       attr_reader :cr
+      # @return [Class] ChunkReader class to use for random access
+      # @see ParseContext
+      attr_reader :base_reader
       # @return [Boolean] whether EOF has been reached.
       attr_reader :at_end
       # @return [Hash] parser options.
@@ -490,6 +536,8 @@ module Bio
       attr_reader :chunk_start
       # @return [Integer] offset of the last block start in this chunk.
       attr_reader :last_block_pos
+      # @return [Symbol] compression method used for this file, or nil
+      attr_reader :compression
 
       # @api private
       attr_accessor :parse_extended
```
```diff
@@ -515,10 +563,29 @@ module Bio
         @parse_extended = opts[:parse_extended] || false
         @parse_empty = opts[:parse_empty] || false
         @chunk_start = 0
-        …
-        …
-        …
-        …
+        if file_spec.respond_to? :flush
+          # guess what, Pathnames respond to :read...
+          @f = file_spec
+          @file_spec = @f.path if @f.respond_to?(:path)
+          # TODO: gzip?
+        else
+          @file_spec = file_spec
+          if file_spec.to_s.end_with?(".maf.gz")
+            @f = IO.popen("gzip -dc #{file_spec}")
+          else
+            @f = File.open(file_spec)
+          end
+        end
+        if @file_spec.to_s =~ /\.bgzf?$/
+          @base_reader = BGZFChunkReader
+          @compression = :bgzf
+        else
+          @base_reader = ChunkReader
+        end
+        @cr = base_reader.new(@f, chunk_size)
+        if RUBY_PLATFORM == 'java'
+          @cr = ThreadedChunkReaderWrapper.new(@cr)
+        end
         @s = StringScanner.new(cr.read_chunk())
         set_last_block_pos!
         @at_end = false
```
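The rewritten constructor now dispatches on the file name alone: `.maf.gz` is piped through `gzip -dc` (sequential access only), a `.bgz`/`.bgzf` suffix selects `BGZFChunkReader` and sets `compression` to `:bgzf`, and anything else gets the plain `ChunkReader`, with `ThreadedChunkReaderWrapper` layered on under JRuby. A usage sketch (file names hypothetical):

```ruby
require 'bio-maf'

plain = Bio::MAF::Parser.new("chr7.maf")      # plain ChunkReader
gz    = Bio::MAF::Parser.new("chr7.maf.gz")   # gzip -dc pipe; no random access
bgz   = Bio::MAF::Parser.new("chr7.maf.bgz")  # BGZFChunkReader

bgz.compression  # => :bgzf
gz.compression   # => nil
```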
```diff
@@ -536,7 +603,11 @@ module Bio
       # @api private
       def context(chunk_size)
         # IO#dup calls dup(2) internally, but seems broken on JRuby...
-        …
+        if file_spec
+          fd = File.open(file_spec)
+        else
+          fd = f.dup
+        end
         ParseContext.new(fd, chunk_size, self)
       end
 
@@ -679,6 +750,15 @@ module Bio
       #
       # Returns `[offset, size, [offset1, offset2, ...]]` tuples.
       def merge_fetch_list(orig_fl)
+        case compression
+        when nil
+          _merge_fetch_list(orig_fl)
+        when :bgzf
+          _merge_bgzf_fetch_list(orig_fl)
+        end
+      end
+
+      def _merge_fetch_list(orig_fl)
         fl = orig_fl.dup
         r = []
         until fl.empty? do
@@ -698,6 +778,22 @@ module Bio
         return r
       end
 
+      # Build a merged fetch list in a BGZF-aware way. This will
+      # group together all MAF blocks from a single BGZF block. These
+      # MAF blocks may not be consecutive.
+      def _merge_bgzf_fetch_list(orig_fl)
+        block_e = orig_fl.chunk { |entry|
+          Bio::BGZF::vo_block_offset(entry[0])
+        }
+        block_e.collect do |bgzf_block, fl|
+          # text size to read from disk, from the start of the first
+          # block to the end of the last block
+          text_size = fl.last[0] + fl.last[1] - fl.first[0]
+          offsets = fl.collect { |e| e[0] }
+          [fl.first[0], text_size, offsets]
+        end
+      end
+
       # Parse the header of the MAF file.
       def _parse_header
         parse_error("not a MAF file") unless s.scan(/##maf\s*/)
```
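`_merge_bgzf_fetch_list` relies on `Enumerable#chunk`, which groups *consecutive* elements sharing a key, so a fetch list ordered by virtual offset collapses to one read per BGZF block. A toy run with hand-packed virtual offsets (`<< 16` mirrors the packing sketched earlier):

```ruby
# [virtual_offset, length] pairs: two MAF blocks in BGZF block 1,
# one in BGZF block 5.
fl = [[(1 << 16) | 0,   100],
      [(1 << 16) | 200,  80],
      [(5 << 16) | 0,   120]]

merged = fl.chunk { |off, _| off >> 16 }.collect do |bgzf_block, group|
  text_size = group.last[0] + group.last[1] - group.first[0]
  [group.first[0], text_size, group.collect(&:first)]
end
merged
# => [[65536, 280, [65536, 65736]], [327680, 120, [327680]]]
```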