bio-maf 0.3.2-java → 1.0.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +3 -0
- data/README.md +34 -3
- data/bin/maf_bgzip +56 -0
- data/bin/maf_index +0 -6
- data/bio-maf.gemspec +3 -2
- data/features/bgzf.feature +62 -0
- data/features/maf-indexing.feature +10 -0
- data/features/maf-querying.feature +9 -0
- data/features/step_definitions/convert_steps.rb +1 -1
- data/features/support/env.rb +2 -0
- data/lib/bio/maf/index.rb +35 -13
- data/lib/bio/maf/maf.rb +3 -1
- data/lib/bio/maf/parser.rb +135 -39
- data/lib/bio/maf/writer.rb +4 -4
- data/man/maf_bgzip.1 +101 -0
- data/man/maf_bgzip.1.ronn +85 -0
- data/man/maf_extract.1 +5 -2
- data/man/maf_extract.1.ronn +4 -1
- data/man/maf_index.1 +3 -3
- data/man/maf_index.1.ronn +3 -2
- data/man/maf_tile.1 +5 -2
- data/man/maf_tile.1.ronn +4 -1
- data/man/maf_to_fasta.1 +1 -1
- data/spec/bio/maf/index_spec.rb +7 -0
- data/spec/bio/maf/parser_spec.rb +13 -0
- data/spec/spec_helper.rb +2 -0
- data/test/data/gap-1.kct +0 -0
- data/test/data/mm8.chrM.maf +2421 -0
- data/test/data/mm8.chrM.maf.bgz +0 -0
- data/test/data/mm8_chr7_tiny.kct +0 -0
- data/test/data/mm8_chr7_tiny.maf.gz +0 -0
- data/test/data/mm8_chrM_tiny.kct +0 -0
- metadata +30 -2
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -8,9 +8,9 @@ support for the
|
|
8
8
|
(MAF), used in bioinformatics to store whole-genome sets of multiple
|
9
9
|
sequence alignments.
|
10
10
|
|
11
|
-
|
12
|
-
|
13
|
-
MAF files.
|
11
|
+
This library provides indexed and sequential access to MAF data, as
|
12
|
+
well as performing various manipulations on it and writing modified
|
13
|
+
MAF files.
|
14
14
|
|
15
15
|
For more information, see the
|
16
16
|
[project wiki](https://github.com/csw/bioruby-maf/wiki).
|
@@ -94,6 +94,20 @@ parser = Bio::MAF::Parser.new("test/data/mm8_chr7_tiny.maf")
|
|
94
94
|
idx = Bio::MAF::KyotoIndex.build(parser, "/tmp/mm8_chr7_tiny.kct", false)
|
95
95
|
```
|
96
96
|
|
97
|
+
### Compress and index a MAF file
|
98
|
+
|
99
|
+
This library fully supports [BGZF][]-compressed MAF files, which
|
100
|
+
combine gzip compression with blocking for efficient random
|
101
|
+
access. These can be generated with blocking optimized for MAF access
|
102
|
+
using the included
|
103
|
+
[`maf_bgzip(1)`](http://csw.github.com/bioruby-maf/man/maf_bgzip.1.html)
|
104
|
+
tool. This writes BGZF-compressed MAF files and optionally indexes
|
105
|
+
them as well:
|
106
|
+
|
107
|
+
$ maf_bgzip --dir /tmp --index --all test/data/mm8.chrM.maf
|
108
|
+
|
109
|
+
This is the easiest way to prepare MAF files for use with this library.
|
110
|
+
|
97
111
|
### Extract blocks from an indexed MAF file, by genomic interval
|
98
112
|
|
99
113
|
Refer to [`mm8_chr7_tiny.maf`](https://github.com/csw/bioruby-maf/blob/master/test/data/mm8_chr7_tiny.maf).
|
@@ -352,10 +366,27 @@ access.tile(interval) do |tiler|
|
|
352
366
|
end
|
353
367
|
```
|
354
368
|
|
369
|
+
### Compression
|
370
|
+
|
371
|
+
MAF files can optionally be compressed in the [BGZF][] format defined
|
372
|
+
in the [SAM specification][]. This is best done with
|
373
|
+
[`maf_bgzip(1)`](http://csw.github.com/bioruby-maf/man/maf_bgzip.1.html),
|
374
|
+
but files compressed with the `bgzip(1)` tool from samtools will also
|
375
|
+
work, though less efficiently.
|
376
|
+
|
377
|
+
[BGZF]: http://blastedbio.blogspot.com/2011/11/bgzf-blocked-bigger-better-gzip.html
|
378
|
+
[SAM specification]: http://samtools.sourceforge.net/SAM1.pdf
|
379
|
+
|
380
|
+
MAF files compressed with plain gzip will be decompressed on the fly,
|
381
|
+
but random access to these files will not be possible. However,
|
382
|
+
gzipped MAF files are suitable as input to
|
383
|
+
[`maf_bgzip(1)`](http://csw.github.com/bioruby-maf/man/maf_bgzip.1.html).
|
384
|
+
|
355
385
|
### Command line tools
|
356
386
|
|
357
387
|
Man pages for command line tools:
|
358
388
|
|
389
|
+
* [`maf_bgzip(1)`](http://csw.github.com/bioruby-maf/man/maf_bgzip.1.html)
|
359
390
|
* [`maf_index(1)`](http://csw.github.com/bioruby-maf/man/maf_index.1.html)
|
360
391
|
* [`maf_extract(1)`](http://csw.github.com/bioruby-maf/man/maf_extract.1.html)
|
361
392
|
* [`maf_to_fasta(1)`](http://csw.github.com/bioruby-maf/man/maf_to_fasta.1.html)
|
data/bin/maf_bgzip
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
#!/usr/bin/env ruby

# maf_bgzip: rewrite each MAF file named on the command line as a
# BGZF-compressed file (<name>.maf.bgz) in the output directory and,
# with --index, build a Kyoto Cabinet index (<name>.kct) next to it.

# OptionParser is used below, so load it explicitly instead of relying
# on another library having required it.
require 'optparse'
require 'ostruct'

require 'bio-maf'
require 'bio-bgzf'

$options = OpenStruct.new
$options.dir = '.'
$options.ref_only = true

op = OptionParser.new do |opts|
  opts.banner = "Usage: maf_bgzip [options] [<maf> ...]"
  opts.separator ""
  opts.separator "Options:"
  opts.on("-d", "--dir DIR",
          "Directory to write compressed MAF to",
          "(default is current directory)") do |dir|
    $options.dir = dir
  end
  opts.on("-i", "--index", "Index MAF files after writing") do
    $options.index = true
  end
  opts.on("-a", "--all",
          "Index all sequences, not just reference seq",
          "(has no effect without --index)") do
    $options.ref_only = false
  end
end

op.parse!(ARGV)

until ARGV.empty?
  maf_path = ARGV.shift
  # Strip ".maf" and everything after it (e.g. ".maf.gz") to get the
  # stem shared by the output and index file names.
  base = File.basename(maf_path).gsub(/\.maf.*/, '')
  bgz_path = File.join($options.dir, "#{base}.maf.bgz")

  parser = Bio::MAF::Parser.new(maf_path,
                                :parse_extended => true,
                                :parse_empty => true)
  begin
    # BGZF output is binary data: open in binary mode so no newline or
    # encoding translation is applied on any platform.
    File.open(bgz_path, 'wb') do |out_f|
      Bio::BGZF::Writer.new(out_f) do |bgz_w|
        maf_w = Bio::MAF::Writer.new(bgz_w)
        maf_w.write_header(parser.header)
        parser.each_block do |block|
          maf_w.write_block(block)
        end
      end
    end
  ensure
    # Release the input file even if compression fails part-way.
    parser.close
  end

  next unless $options.index

  # Index the freshly written BGZF file (not the original input), so
  # that the index refers to the compressed file.
  bgz_parser = Bio::MAF::Parser.new(bgz_path)
  begin
    idx_path = File.join($options.dir, "#{base}.kct")
    idx = Bio::MAF::KyotoIndex.build(bgz_parser, idx_path, $options.ref_only)
    # Close the finished index rather than leaving the handle open
    # until the process exits.
    idx.close
  ensure
    bgz_parser.close
  end
end
|
data/bin/maf_index
CHANGED
@@ -14,15 +14,9 @@ PRINTERS = {
|
|
14
14
|
$options = OpenStruct.new
|
15
15
|
$options.mode = :build
|
16
16
|
$options.ref_only = true
|
17
|
-
$options.reader = if RUBY_PLATFORM == 'java'
|
18
|
-
Bio::MAF::ThreadedChunkReader
|
19
|
-
else
|
20
|
-
Bio::MAF::ChunkReader
|
21
|
-
end
|
22
17
|
|
23
18
|
def build_index(maf, index)
|
24
19
|
parser = Bio::MAF::Parser.new(maf,
|
25
|
-
:chunk_reader => $options.reader,
|
26
20
|
:parse_extended => false)
|
27
21
|
idx = Bio::MAF::KyotoIndex.build(parser, index, $options.ref_only)
|
28
22
|
idx.close
|
data/bio-maf.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = "bio-maf"
|
5
|
-
s.version = "0.
|
5
|
+
s.version = "1.0.0"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Clayton Wheeler"]
|
9
|
-
s.date = "2012-
|
9
|
+
s.date = "2012-08-02"
|
10
10
|
s.description = "Multiple Alignment Format parser for BioRuby."
|
11
11
|
s.email = "cswh@umich.edu"
|
12
12
|
s.extra_rdoc_files = [
|
@@ -32,6 +32,7 @@ Gem::Specification.new do |s|
|
|
32
32
|
end
|
33
33
|
|
34
34
|
s.add_runtime_dependency('bio-alignment', ["~> 0.0.7"])
|
35
|
+
s.add_runtime_dependency('bio-bgzf', ["~> 0.2.0"])
|
35
36
|
s.add_runtime_dependency('bio-genomic-interval', ["~> 0.1.2"])
|
36
37
|
s.add_runtime_dependency('bio-logger', ["~> 1.0.1"])
|
37
38
|
if RUBY_PLATFORM == 'java'
|
@@ -0,0 +1,62 @@
|
|
1
|
+
Feature: BGZF compression
|
2
|
+
Because MAF files are large
|
3
|
+
We need random access
|
4
|
+
But we would also like to compress them
|
5
|
+
Yet common compression formats don't facilitate random access
|
6
|
+
So we use BGZF compression to support random access
|
7
|
+
To 64 KB chunks
|
8
|
+
|
9
|
+
@no_jruby
|
10
|
+
Scenario: Compress a MAF file
|
11
|
+
Given test files:
|
12
|
+
| mm8_chr7_tiny.maf |
|
13
|
+
When I run `maf_bgzip mm8_chr7_tiny.maf`
|
14
|
+
Then it should pass with:
|
15
|
+
"""
|
16
|
+
"""
|
17
|
+
And a file named "mm8_chr7_tiny.maf.bgz" should exist
|
18
|
+
|
19
|
+
@no_jruby
|
20
|
+
Scenario: Compress and index a MAF file
|
21
|
+
Given test files:
|
22
|
+
| mm8_chr7_tiny.maf |
|
23
|
+
When I run `maf_bgzip -i mm8_chr7_tiny.maf`
|
24
|
+
Then it should pass with:
|
25
|
+
"""
|
26
|
+
"""
|
27
|
+
And a file named "mm8_chr7_tiny.maf.bgz" should exist
|
28
|
+
And a file named "mm8_chr7_tiny.kct" should exist
|
29
|
+
|
30
|
+
@no_jruby
|
31
|
+
Scenario: Compress a gzipped MAF file
|
32
|
+
Given test files:
|
33
|
+
| mm8_chr7_tiny.maf.gz |
|
34
|
+
When I run `maf_bgzip mm8_chr7_tiny.maf.gz`
|
35
|
+
Then it should pass with:
|
36
|
+
"""
|
37
|
+
"""
|
38
|
+
And a file named "mm8_chr7_tiny.maf.bgz" should exist
|
39
|
+
|
40
|
+
@no_jruby
|
41
|
+
Scenario: Compress and index a gzipped MAF file
|
42
|
+
Given test files:
|
43
|
+
| mm8_chr7_tiny.maf.gz |
|
44
|
+
When I run `maf_bgzip -i mm8_chr7_tiny.maf.gz`
|
45
|
+
Then it should pass with:
|
46
|
+
"""
|
47
|
+
"""
|
48
|
+
And a file named "mm8_chr7_tiny.maf.bgz" should exist
|
49
|
+
And a file named "mm8_chr7_tiny.kct" should exist
|
50
|
+
|
51
|
+
@no_jruby
|
52
|
+
Scenario: Compress multiple MAF files
|
53
|
+
Given test files:
|
54
|
+
| mm8_chr7_tiny.maf |
|
55
|
+
| mm8.chrM.maf |
|
56
|
+
When I run `maf_bgzip mm8_chr7_tiny.maf mm8.chrM.maf`
|
57
|
+
Then it should pass with:
|
58
|
+
"""
|
59
|
+
"""
|
60
|
+
And a file named "mm8_chr7_tiny.maf.bgz" should exist
|
61
|
+
And a file named "mm8.chrM.maf.bgz" should exist
|
62
|
+
|
@@ -49,6 +49,16 @@ Feature: Indexed access to MAF files
|
|
49
49
|
"""
|
50
50
|
And a file named "mm8_chr7_tiny.kct" should exist
|
51
51
|
|
52
|
+
@no_jruby
|
53
|
+
Scenario: Build MAF index on BGZF file with CLI tool
|
54
|
+
Given test files:
|
55
|
+
| mm8.chrM.maf.bgz |
|
56
|
+
When I run `maf_index mm8.chrM.maf.bgz mm8.chrM.kct`
|
57
|
+
Then it should pass with:
|
58
|
+
"""
|
59
|
+
"""
|
60
|
+
And a file named "mm8.chrM.kct" should exist
|
61
|
+
|
52
62
|
@no_jruby
|
53
63
|
Scenario: Build MAF index on all sequences with CLI tool
|
54
64
|
Given test files:
|
@@ -73,3 +73,12 @@ Feature: Filter results from MAF files
|
|
73
73
|
And search for blocks between positions 0 and 80100000 of mm8.chr7
|
74
74
|
Then 3 blocks are obtained
|
75
75
|
|
76
|
+
@no_jruby
|
77
|
+
Scenario: Parse blocks from a BGZF-compressed file
|
78
|
+
Given test files:
|
79
|
+
| mm8.chrM.maf |
|
80
|
+
| mm8.chrM.maf.bgz |
|
81
|
+
When I run `maf_extract -m mm8.chrM.maf --interval mm8.chrM:6938-13030 -o m1.maf`
|
82
|
+
And I run `maf_extract -m mm8.chrM.maf.bgz --interval mm8.chrM:6938-13030 -o m2.maf`
|
83
|
+
And I run `diff m1.maf m2.maf`
|
84
|
+
Then the exit status should be 0
|
data/features/support/env.rb
CHANGED
data/lib/bio/maf/index.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'kyotocabinet'
|
2
2
|
require 'jruby/profiler' if RUBY_PLATFORM == 'java'
|
3
|
+
require 'bio-bgzf'
|
3
4
|
|
4
5
|
#require 'bio-ucsc-api'
|
5
6
|
require 'bio-genomic-interval'
|
@@ -189,18 +190,20 @@ module Bio
|
|
189
190
|
@indices = {}
|
190
191
|
@maf_by_chrom = {}
|
191
192
|
if options[:dir]
|
192
|
-
|
193
|
-
@maf_files = Dir.glob("#{@dir}/*.maf")
|
193
|
+
scan_dir(options[:dir])
|
194
194
|
elsif options[:maf]
|
195
|
-
@maf_files = [options[:maf]]
|
196
195
|
if options[:index]
|
197
196
|
register_index(KyotoIndex.open(options[:index]),
|
198
197
|
options[:maf])
|
198
|
+
else
|
199
|
+
idx = find_index_file(options[:maf])
|
200
|
+
if idx
|
201
|
+
register_index(KyotoIndex.open(idx), options[:maf])
|
202
|
+
end
|
199
203
|
end
|
200
204
|
else
|
201
205
|
raise "Must specify :dir or :maf!"
|
202
206
|
end
|
203
|
-
scan_indices!
|
204
207
|
if options[:maf] && @indices.empty?
|
205
208
|
# MAF file explicitly given but no index
|
206
209
|
# build a temporary one
|
@@ -215,23 +218,27 @@ module Bio
|
|
215
218
|
|
216
219
|
# @api private
|
217
220
|
# Look for an existing index file alongside the given MAF file.
#
# Two candidate names are tried in the MAF file's directory, in
# order: the full MAF file name with ".kct" appended, then the name
# with ".maf" and everything after it stripped, plus ".kct".
#
# @param [String] maf path to a MAF file
# @return [String, nil] path of the first candidate that exists,
#   or nil if neither does
def find_index_file(maf)
  dir = File.dirname(maf)
  full_name = File.basename(maf)
  stem = full_name.gsub(/\.maf.*/, '')
  candidates = [full_name, stem].map { |name| "#{dir}/#{name}.kct" }
  candidates.find { |path| File.exist?(path) }
end
|
222
226
|
|
223
227
|
# @api private
|
224
228
|
# Record an index and its MAF file under the index's reference
# sequence.
#
# The index must have been built for a file with the same base name
# as +maf+; otherwise a RuntimeError is raised and nothing is
# registered.
#
# @param index index object responding to #maf_file, #path and #ref_seq
# @param [String] maf path to the MAF file the index describes
def register_index(index, maf)
  maf_name = File.basename(maf)
  if index.maf_file != maf_name
    raise "Index #{index.path} was created for #{index.maf_file}, not #{maf_name}!"
  end
  seq = index.ref_seq
  @indices[seq] = index
  @maf_by_chrom[seq] = maf
end
|
228
235
|
|
229
236
|
# @api private
|
230
|
-
def
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
237
|
+
def scan_dir(dir)
|
238
|
+
Dir.glob("#{dir}/*.kct").each do |index_f|
|
239
|
+
index = KyotoIndex.open(index_f)
|
240
|
+
maf = "#{dir}/#{index.maf_file}"
|
241
|
+
if File.exist? maf
|
235
242
|
register_index(index, maf)
|
236
243
|
end
|
237
244
|
end
|
@@ -262,9 +269,12 @@ module Bio
|
|
262
269
|
class KyotoIndex
|
263
270
|
include KVHelpers
|
264
271
|
|
265
|
-
attr_reader :db, :species, :species_max_id, :ref_only
|
272
|
+
attr_reader :db, :species, :species_max_id, :ref_only, :path
|
273
|
+
attr_reader :maf_file
|
266
274
|
attr_accessor :index_sequences, :ref_seq
|
267
275
|
|
276
|
+
COMPRESSION_KEY = 'bio-maf:compression'
|
277
|
+
FILE_KEY = 'bio-maf:file'
|
268
278
|
FORMAT_VERSION_KEY = 'bio-maf:index-format-version'
|
269
279
|
FORMAT_VERSION = 2
|
270
280
|
REF_SEQ_KEY = 'bio-maf:reference-sequence'
|
@@ -438,6 +448,7 @@ module Bio
|
|
438
448
|
raise "Could not open DB file!"
|
439
449
|
end
|
440
450
|
if mode == KyotoCabinet::DB::OREADER
|
451
|
+
@maf_file = db[FILE_KEY]
|
441
452
|
self.ref_seq = db[REF_SEQ_KEY]
|
442
453
|
load_index_sequences
|
443
454
|
load_species
|
@@ -450,6 +461,7 @@ module Bio
|
|
450
461
|
end
|
451
462
|
|
452
463
|
def dump(stream=$stdout)
|
464
|
+
bgzf = (db[COMPRESSION_KEY] == 'bgzf')
|
453
465
|
stream.puts "KyotoIndex dump: #{@path}"
|
454
466
|
stream.puts
|
455
467
|
if db.count == 0
|
@@ -474,6 +486,11 @@ module Bio
|
|
474
486
|
offset, len, text_size, n_seq, species_vec = pair[1].unpack(VAL_FMT)
|
475
487
|
stream.puts "#{chr} [bin #{bin}] #{s_start}:#{s_end}"
|
476
488
|
stream.puts " offset #{offset}, length #{len}"
|
489
|
+
if bgzf
|
490
|
+
block = Bio::BGZF.vo_block_offset(offset)
|
491
|
+
data = Bio::BGZF.vo_data_offset(offset)
|
492
|
+
stream.puts " BGZF block offset #{block}, data offset #{data}"
|
493
|
+
end
|
477
494
|
stream.puts " text size: #{text_size}"
|
478
495
|
stream.puts " sequences in block: #{n_seq}"
|
479
496
|
stream.printf(" species vector: %016x\n", species_vec)
|
@@ -660,6 +677,11 @@ module Bio
|
|
660
677
|
end
|
661
678
|
|
662
679
|
def build(parser, ref_only=true)
|
680
|
+
db[FILE_KEY] = File.basename(parser.file_spec)
|
681
|
+
@maf_file = db[FILE_KEY]
|
682
|
+
if parser.compression
|
683
|
+
db[COMPRESSION_KEY] = parser.compression.to_s
|
684
|
+
end
|
663
685
|
first_block = parser.parse_block
|
664
686
|
self.ref_seq = first_block.sequences.first.source
|
665
687
|
@ref_only = ref_only
|
data/lib/bio/maf/maf.rb
CHANGED
@@ -103,8 +103,9 @@ module Bio
|
|
103
103
|
|
104
104
|
GAP = /-+/
|
105
105
|
|
106
|
-
#
|
106
|
+
# Find gaps present in all sequences. These would generally
|
107
107
|
# occur when some sequences have been filtered out.
|
108
|
+
#
|
108
109
|
# @see #remove_gaps!
|
109
110
|
# @see Parser#sequence_filter
|
110
111
|
def find_gaps
|
@@ -126,6 +127,7 @@ module Bio
|
|
126
127
|
|
127
128
|
# Remove gaps present in all sequences. These would generally
|
128
129
|
# occur when some sequences have been filtered out.
|
130
|
+
#
|
129
131
|
# @see #find_gaps
|
130
132
|
# @see Parser#sequence_filter
|
131
133
|
def remove_gaps!
|
data/lib/bio/maf/parser.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'strscan'
|
2
2
|
require 'java' if RUBY_PLATFORM == 'java'
|
3
|
+
require 'bio-bgzf'
|
3
4
|
|
4
5
|
# @api public
|
5
6
|
module Bio
|
@@ -56,8 +57,7 @@ module Bio
|
|
56
57
|
|
57
58
|
# Reads a chunk of the file.
|
58
59
|
#
|
59
|
-
# Currently always reads size_hint bytes
|
60
|
-
# with BGZF support.
|
60
|
+
# Currently always reads size_hint bytes.
|
61
61
|
#
|
62
62
|
# @param [Integer] offset file offset to read from.
|
63
63
|
# @param [Integer] size_hint desired size of chunk.
|
@@ -70,58 +70,97 @@ module Bio
|
|
70
70
|
end
|
71
71
|
end
|
72
72
|
|
73
|
-
|
74
|
-
|
75
|
-
# parsing.
|
76
|
-
#
|
77
|
-
# Only beneficial on JRuby.
|
78
|
-
class ThreadedChunkReader < ChunkReader
|
73
|
+
# Chunk reader for BGZF-compressed files. Offers the same
# pos / read_chunk / read_chunk_at calls as the plain chunk reader,
# but delegates all reading to a Bio::BGZF::Reader.
class BGZFChunkReader
  # The underlying file handle passed to the constructor.
  attr_reader :f
  # The Bio::BGZF::Reader wrapping {#f}.
  attr_reader :r

  # @param f file handle to read from
  # @param _chunk_size accepted so the constructor matches the other
  #   chunk reader; unused here.
  def initialize(f, _chunk_size)
    @f = f
    @r = Bio::BGZF::Reader.new(f)
  end

  # Current position as reported by the BGZF reader's #tell.
  def pos
    @r.tell
  end

  # Read the next block from the BGZF reader.
  def read_chunk
    @r.read_block
  end

  # Read the block at the given offset.
  #
  # @param vo offset handed to the BGZF reader's #read_block_at
  #   (presumably a BGZF virtual offset -- confirm against bio-bgzf)
  # @param _size ignored.
  def read_chunk_at(vo, _size)
    @r.read_block_at(vo)
  end
end
|
93
|
+
|
94
|
+
# Wraps another chunk reader and serves sequential reads from a
# bounded queue that a background thread fills by calling the wrapped
# reader. Random-access reads ({#read_chunk_at}) go straight to the
# wrapped reader.
#
# The queue is a java.util.concurrent.LinkedBlockingQueue, so this
# class can only be instantiated on JRuby.
class ThreadedChunkReaderWrapper

  # cr:  the wrapped chunk reader.
  # pos: the wrapped reader's position, sampled right after the chunk
  #      most recently returned by {#read_chunk} was read.
  attr_reader :cr, :pos

  # @param cr chunk reader to wrap; must respond to #f, #pos,
  #   #read_chunk and #read_chunk_at
  # @param [Integer] buffer_size maximum number of chunks queued ahead
  def initialize(cr, buffer_size=64)
    @cr = cr
    @buffer = java.util.concurrent.LinkedBlockingQueue.new(buffer_size)
    @eof_reached = false
    @first_seq_read = false
    @read_thread = nil
    @read_ahead_ex = nil
  end

  # Spawn the read-ahead thread. Called lazily from {#read_chunk} on
  # the second sequential read, not from {#initialize}.
  def start_read_ahead
    @read_thread = Thread.new { read_ahead }
  end

  # File handle of the wrapped reader.
  def f
    cr.f
  end

  # Body of the read-ahead thread: push [position, chunk] pairs onto
  # the queue until the file is exhausted, then push :eof. Any
  # exception is logged and pushed onto the queue so that the consumer
  # is woken up and can re-raise it.
  def read_ahead
    begin
      until f.eof?
        chunk = cr.read_chunk
        c_pos = cr.pos
        @buffer.put([c_pos, chunk])
      end
      @buffer.put(:eof)
    rescue Exception
      # Deliberately broad: whatever stopped the reader thread must
      # reach the consumer, otherwise it would block on the queue.
      @read_ahead_ex = $!
      LOG.error $!
      @buffer.put($!)
    end
  end

  # Return the next sequential chunk, or nil once the end of the file
  # has been reached. Re-raises any exception hit by the read-ahead
  # thread.
  def read_chunk
    unless @first_seq_read
      # this is the first read_chunk call to read the header
      # not necessarily indicative of sequential access, so read
      # directly instead of starting the read-ahead thread
      @first_seq_read = true
      chunk = cr.read_chunk
      @pos = cr.pos
      return chunk
    end
    raise @read_ahead_ex if @read_ahead_ex
    return nil if @eof_reached

    start_read_ahead if @read_thread.nil?
    e = @buffer.take
    if e == :eof
      # Remember EOF: the reader thread has finished and will never
      # enqueue anything else, so another take would block forever.
      @eof_reached = true
      nil
    elsif e.is_a?(Exception)
      raise e
    else
      @pos, chunk = e
      chunk
    end
  end

  # Random-access read; bypasses the queue and delegates to the
  # wrapped reader with the same arguments.
  def read_chunk_at(*args)
    cr.read_chunk_at(*args)
  end
end
|
126
165
|
|
127
166
|
# MAF parsing code useful for sequential and random-access parsing.
|
@@ -385,8 +424,7 @@ module Bio
|
|
385
424
|
@f = fd
|
386
425
|
@parser = parser
|
387
426
|
@opts = parser.opts
|
388
|
-
|
389
|
-
@cr = reader.new(@f, chunk_size)
|
427
|
+
@cr = parser.base_reader.new(@f, chunk_size)
|
390
428
|
@last_block_pos = -1
|
391
429
|
end
|
392
430
|
|
@@ -413,6 +451,7 @@ module Bio
|
|
413
451
|
# @return [Array<Block>]
|
414
452
|
def fetch_blocks(offset, len, block_offsets)
|
415
453
|
if block_given?
|
454
|
+
LOG.debug { "fetching blocks from #{offset} (length #{len}): #{block_offsets.inspect}" }
|
416
455
|
start_chunk_read_if_needed(offset, len)
|
417
456
|
# read chunks until we have the entire merged set of
|
418
457
|
# blocks ready to parse
|
@@ -420,6 +459,13 @@ module Bio
|
|
420
459
|
append_chunks_to(len)
|
421
460
|
# parse the blocks
|
422
461
|
block_offsets.each do |expected_offset|
|
462
|
+
# skip ahead, in case there is a gap resulting from a
|
463
|
+
# block that is not being parsed
|
464
|
+
rel_offset = expected_offset - offset
|
465
|
+
if s.pos < rel_offset
|
466
|
+
s.pos = rel_offset
|
467
|
+
end
|
468
|
+
# now actually parse the block data
|
423
469
|
block = _parse_block
|
424
470
|
parse_error("expected a block at offset #{expected_offset} but could not parse one!") unless block
|
425
471
|
parse_error("got block with offset #{block.offset}, expected #{expected_offset}!") unless block.offset == expected_offset
|
@@ -444,7 +490,6 @@ module Bio
|
|
444
490
|
end
|
445
491
|
|
446
492
|
def append_chunks_to(len)
|
447
|
-
# XXX: need to rethink this for BGZF; prefetching ChunkReader
|
448
493
|
while s.string.size < len
|
449
494
|
s.string << cr.read_chunk()
|
450
495
|
end
|
@@ -463,8 +508,6 @@ module Bio
|
|
463
508
|
# * `:random_chunk_size`: as above, but for random access ({#fetch_blocks})
|
464
509
|
# * `:merge_max`: merge up to this many bytes of blocks for
|
465
510
|
# random access
|
466
|
-
# * `:chunk_reader`: use the specified class to read
|
467
|
-
# chunks. (Only useful with {ThreadedChunkReader}).
|
468
511
|
# * `:threads`: number of threads to use for parallel
|
469
512
|
# parsing. Only useful under JRuby.
|
470
513
|
# @api public
|
@@ -482,6 +525,9 @@ module Bio
|
|
482
525
|
attr_reader :s
|
483
526
|
# @return [ChunkReader] ChunkReader.
|
484
527
|
attr_reader :cr
|
528
|
+
# @return [Class] ChunkReader class to use for random access
|
529
|
+
# @see ParseContext
|
530
|
+
attr_reader :base_reader
|
485
531
|
# @return [Boolean] whether EOF has been reached.
|
486
532
|
attr_reader :at_end
|
487
533
|
# @return [Hash] parser options.
|
@@ -490,6 +536,8 @@ module Bio
|
|
490
536
|
attr_reader :chunk_start
|
491
537
|
# @return [Integer] offset of the last block start in this chunk.
|
492
538
|
attr_reader :last_block_pos
|
539
|
+
# @return [Symbol] compression method used for this file, or nil
|
540
|
+
attr_reader :compression
|
493
541
|
|
494
542
|
# @api private
|
495
543
|
attr_accessor :parse_extended
|
@@ -515,10 +563,29 @@ module Bio
|
|
515
563
|
@parse_extended = opts[:parse_extended] || false
|
516
564
|
@parse_empty = opts[:parse_empty] || false
|
517
565
|
@chunk_start = 0
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
566
|
+
if file_spec.respond_to? :flush
|
567
|
+
# guess what, Pathnames respond to :read...
|
568
|
+
@f = file_spec
|
569
|
+
@file_spec = @f.path if @f.respond_to?(:path)
|
570
|
+
# TODO: gzip?
|
571
|
+
else
|
572
|
+
@file_spec = file_spec
|
573
|
+
if file_spec.to_s.end_with?(".maf.gz")
|
574
|
+
@f = IO.popen("gzip -dc #{file_spec}")
|
575
|
+
else
|
576
|
+
@f = File.open(file_spec)
|
577
|
+
end
|
578
|
+
end
|
579
|
+
if @file_spec.to_s =~ /\.bgzf?$/
|
580
|
+
@base_reader = BGZFChunkReader
|
581
|
+
@compression = :bgzf
|
582
|
+
else
|
583
|
+
@base_reader = ChunkReader
|
584
|
+
end
|
585
|
+
@cr = base_reader.new(@f, chunk_size)
|
586
|
+
if RUBY_PLATFORM == 'java'
|
587
|
+
@cr = ThreadedChunkReaderWrapper.new(@cr)
|
588
|
+
end
|
522
589
|
@s = StringScanner.new(cr.read_chunk())
|
523
590
|
set_last_block_pos!
|
524
591
|
@at_end = false
|
@@ -536,7 +603,11 @@ module Bio
|
|
536
603
|
# @api private
|
537
604
|
# Build a ParseContext with its own file handle for this parser.
#
# @param [Integer] chunk_size chunk size passed to the ParseContext
# @return [ParseContext]
def context(chunk_size)
  # IO#dup calls dup(2) internally, but seems broken on JRuby...
  # so when a path is known, open it afresh; only fall back to
  # duplicating the existing handle when there is no path.
  fd = file_spec ? File.open(file_spec) : f.dup
  ParseContext.new(fd, chunk_size, self)
end
|
542
613
|
|
@@ -679,6 +750,15 @@ module Bio
|
|
679
750
|
#
|
680
751
|
# Returns `[offset, size, [offset1, offset2, ...]]` tuples.
|
681
752
|
def merge_fetch_list(orig_fl)
  # Pick the merge strategy that matches how the file is stored.
  case compression
  when nil
    # Uncompressed file: use the plain merge strategy.
    _merge_fetch_list(orig_fl)
  when :bgzf
    # BGZF file: use the BGZF-aware merge strategy.
    _merge_bgzf_fetch_list(orig_fl)
  else
    # Fail loudly instead of returning nil, which would only surface
    # later as a confusing error in the caller.
    raise "Unsupported compression method: #{compression.inspect}"
  end
end
|
760
|
+
|
761
|
+
def _merge_fetch_list(orig_fl)
|
682
762
|
fl = orig_fl.dup
|
683
763
|
r = []
|
684
764
|
until fl.empty? do
|
@@ -698,6 +778,22 @@ module Bio
|
|
698
778
|
return r
|
699
779
|
end
|
700
780
|
|
781
|
+
# Build a merged fetch list in a BGZF-aware way. Consecutive fetch
# list entries whose offsets fall in the same BGZF block are grouped
# into a single read. The MAF blocks in a group need not be adjacent
# to one another.
#
# @param [Array] orig_fl fetch list entries; element 0 of each entry
#   is its offset and element 1 its length
# @return [Array] one `[offset, size, [offset1, offset2, ...]]` tuple
#   per group
def _merge_bgzf_fetch_list(orig_fl)
  grouped = orig_fl.chunk { |entry| Bio::BGZF::vo_block_offset(entry[0]) }
  grouped.collect do |_bgzf_block, members|
    first_entry = members.first
    last_entry = members.last
    start = first_entry[0]
    # span to read: from the start of the first MAF block in the
    # group to the end of the last one
    span = last_entry[0] + last_entry[1] - start
    [start, span, members.collect { |m| m[0] }]
  end
end
|
796
|
+
|
701
797
|
# Parse the header of the MAF file.
|
702
798
|
def _parse_header
|
703
799
|
parse_error("not a MAF file") unless s.scan(/##maf\s*/)
|