bio-maf 0.1.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.simplecov +1 -0
- data/.travis.yml +16 -0
- data/.yardopts +3 -0
- data/DEVELOPMENT.md +40 -0
- data/Gemfile +23 -0
- data/LICENSE.txt +20 -0
- data/README.md +209 -0
- data/Rakefile +76 -0
- data/VERSION +1 -0
- data/benchmarks/dispatch_bench +53 -0
- data/benchmarks/iter_bench +44 -0
- data/benchmarks/read_bench +40 -0
- data/benchmarks/sort_bench +33 -0
- data/benchmarks/split_bench +33 -0
- data/bin/maf_count +82 -0
- data/bin/maf_dump_blocks +27 -0
- data/bin/maf_extract_ranges_count +44 -0
- data/bin/maf_index +88 -0
- data/bin/maf_parse_bench +94 -0
- data/bin/maf_to_fasta +68 -0
- data/bin/maf_write +84 -0
- data/bin/random_ranges +35 -0
- data/features/maf-indexing.feature +31 -0
- data/features/maf-output.feature +29 -0
- data/features/maf-parsing.feature +44 -0
- data/features/maf-querying.feature +75 -0
- data/features/maf-to-fasta.feature +50 -0
- data/features/step_definitions/convert_steps.rb +45 -0
- data/features/step_definitions/index_steps.rb +20 -0
- data/features/step_definitions/output_steps.rb +27 -0
- data/features/step_definitions/parse_steps.rb +63 -0
- data/features/step_definitions/query_steps.rb +31 -0
- data/features/step_definitions/ucsc_bin_steps.rb +14 -0
- data/features/support/env.rb +16 -0
- data/features/ucsc-bins.feature +24 -0
- data/lib/bio-maf.rb +12 -0
- data/lib/bio-maf/maf.rb +3 -0
- data/lib/bio/maf.rb +4 -0
- data/lib/bio/maf/index.rb +620 -0
- data/lib/bio/maf/parser.rb +888 -0
- data/lib/bio/maf/struct.rb +63 -0
- data/lib/bio/maf/writer.rb +63 -0
- data/lib/bio/ucsc.rb +2 -0
- data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
- data/lib/bio/ucsc/ucsc_bin.rb +117 -0
- data/man/.gitignore +1 -0
- data/man/maf_index.1 +105 -0
- data/man/maf_index.1.markdown +97 -0
- data/man/maf_index.1.ronn +83 -0
- data/man/maf_to_fasta.1 +53 -0
- data/man/maf_to_fasta.1.ronn +51 -0
- data/spec/bio/maf/index_spec.rb +363 -0
- data/spec/bio/maf/parser_spec.rb +354 -0
- data/spec/bio/maf/struct_spec.rb +75 -0
- data/spec/spec_helper.rb +14 -0
- data/test/data/big-block.maf +15999 -0
- data/test/data/chr22_ieq.maf +11 -0
- data/test/data/chrY-1block.maf +6 -0
- data/test/data/empty +0 -0
- data/test/data/empty.db +0 -0
- data/test/data/mm8_chr7_tiny.kct +0 -0
- data/test/data/mm8_chr7_tiny.maf +76 -0
- data/test/data/mm8_mod_a.maf +7 -0
- data/test/data/mm8_single.maf +13 -0
- data/test/data/mm8_subset_a.maf +23 -0
- data/test/data/t1-bad1.maf +15 -0
- data/test/data/t1.fasta +12 -0
- data/test/data/t1.maf +15 -0
- data/test/data/t1a.maf +17 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-maf.rb +7 -0
- data/travis-ci/install_kc +13 -0
- data/travis-ci/install_kc_java +13 -0
- data/travis-ci/report_errors +4 -0
- metadata +182 -0
@@ -0,0 +1,888 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
require 'java' if RUBY_PLATFORM == 'java'
|
3
|
+
|
4
|
+
# @api public
|
5
|
+
module Bio
|
6
|
+
# @api public
|
7
|
+
module MAF
|
8
|
+
|
9
|
+
# @api public
|
10
|
+
class ParseError < Exception; end
|
11
|
+
|
12
|
+
# A MAF header, containing the variable-value pairs from the first
|
13
|
+
# line of the file as well as the alignment parameters.
|
14
|
+
# @api public
|
15
|
+
# A MAF header, holding the variable/value pairs from the "##maf" line
# of the file together with the alignment parameters.
# @api public
class Header
  # Variable-value pairs from the ##maf line.
  # @return [Hash]
  attr_accessor :vars
  # Alignment parameters from the MAF header.
  # @return [Hash]
  attr_accessor :alignment_params

  # @param vars [Hash] variables parsed from the ##maf line.
  # @param params [String, nil] accumulated alignment parameter text.
  def initialize(vars, params)
    @vars = vars
    @alignment_params = params
  end

  # The required version parameter.
  # @return [String]
  def version
    @vars[:version]
  end

  # The optional scoring parameter, if present.
  # @return [String]
  def scoring
    @vars[:scoring]
  end
end
|
41
|
+
|
42
|
+
# A MAF alignment block.
|
43
|
+
# @api public
|
44
|
+
# A MAF alignment block: the parameters from its 'a' line plus the parsed
# sequence records, together with the block's position in the source file.
# @api public
class Block
  # Parameters from the 'a' line starting the alignment block.
  attr_reader :vars
  # Sequences, one per 's' or 'e' line.
  # @return [Array<Sequence>]
  attr_reader :sequences
  # Offset of the alignment block within the MAF file, in bytes.
  # @return [Integer]
  attr_reader :offset
  # Size of the alignment block within the MAF file, in bytes.
  # @return [Integer]
  attr_reader :size

  def initialize(*args)
    @vars, @sequences, @offset, @size = args
  end

  # Fetch the sequence at index +i+ (raises IndexError when out of range).
  def raw_seq(i)
    @sequences.fetch(i)
  end

  # Yield each sequence record in order.
  def each_raw_seq(&blk)
    @sequences.each(&blk)
  end

  # Text size of the alignment block. This is the number of text
  # characters in each line of sequence data, including dashes and
  # other gaps in the sequence.
  def text_size
    @sequences.first.text.size
  end
end
|
77
|
+
|
78
|
+
# A sequence within an alignment block.
|
79
|
+
# @api public
|
80
|
+
# A sequence within an alignment block, parsed from an 's' line.
# @api public
class Sequence
  # @return [String] Source sequence name.
  attr_reader :source
  # @return [Integer] Zero-based start position.
  attr_reader :start
  # @return [Integer] Size of aligning region in source sequence.
  attr_reader :size
  # :+ or :-, indicating which strand the alignment is to.
  # @return [Symbol]
  attr_reader :strand
  # Size of the entire source sequence, not just the aligning region.
  # @return [Integer]
  attr_reader :src_size
  # Sequence data for the alignment, including insertions.
  # @return [String]
  attr_reader :text
  # Array of raw synteny information from 'i' line.
  # @return [Array<String>]
  attr_accessor :i_data
  # Quality string from 'q' line.
  # @return [String]
  attr_accessor :quality
  alias_method :source_size, :src_size

  def initialize(*args)
    @source, @start, @size, @strand, @src_size, @text = args
  end

  # Whether this sequence is empty. Only true for {EmptySequence}
  # instances from 'e' lines.
  def empty?
    false
  end

  # Emit this sequence through +writer+ with a "source:start-end" label.
  def write_fasta(writer)
    label = "#{source}:#{start}-#{start + size}"
    writer.write(label, text)
  end
end
|
120
|
+
|
121
|
+
# An empty sequence record from an 'e' line.
|
122
|
+
#
|
123
|
+
# This indicates that "there isn't aligning DNA for a species but
|
124
|
+
# that the current block is bridged by a chain that connects
|
125
|
+
# blocks before and after this block" (MAF spec).
|
126
|
+
# @api public
|
127
|
+
# An empty sequence record from an 'e' line.
#
# This indicates that "there isn't aligning DNA for a species but
# that the current block is bridged by a chain that connects
# blocks before and after this block" (MAF spec).
# @api public
class EmptySequence < Sequence
  # Status character from the 'e' line.
  attr_reader :status

  def initialize(*args)
    # First five positional arguments are the standard sequence fields;
    # the sixth is the 'e' line status code.
    super(*args.first(5))
    @status = args[5]
  end

  # An empty sequence has no alignment text.
  def text
    ''
  end

  def empty?
    true
  end

  def write_fasta(writer)
    raise "empty sequence output not implemented!"
  end
end
|
147
|
+
|
148
|
+
# Reads MAF files in chunks.
|
149
|
+
# @api private
|
150
|
+
# Reads MAF files in chunks.
# @api private
class ChunkReader
  # Size, in bytes, of the chunks to read. Must be a power of 2.
  # @return [Integer]
  attr_accessor :chunk_size
  # Current position in the file.
  # @return [Integer]
  attr_accessor :pos
  # {File} from which chunks are read.
  # @return [File]
  attr_reader :f

  def initialize(f, chunk_size)
    @f = f
    self.chunk_size = chunk_size
    @pos = 0
  end

  # Validates and sets the chunk size.
  def chunk_size=(size)
    check_chunk_size(size)
    @chunk_size = size
  end

  # Raise unless +size+ is a positive power of two.
  # @raise [RuntimeError] on an invalid size.
  def check_chunk_size(size)
    if size < 1
      raise "Invalid chunk size: #{size}"
    end
    ## test whether it is a power of 2: a power of two has exactly one
    ## bit set, so size & (size - 1) is zero. cf. http://bit.ly/JExNc4
    if size & (size - 1) != 0
      # BUG FIX: the original message had a stray trailing '}'.
      raise "Invalid chunk size (not a power of 2): #{size}"
    end
  end

  # Reads the next chunk of the file.
  # @return [String, nil] Next {#chunk_size} bytes of MAF data, or nil at EOF.
  def read_chunk
    chunk = f.read(@chunk_size)
    @pos += chunk.bytesize if chunk
    return chunk
  end

  # Reads a chunk of the file at a given offset.
  #
  # Currently always reads size_hint bytes but this may change
  # with BGZF support.
  #
  # @param [Integer] offset file offset to read from.
  # @param [Integer] size_hint desired size of chunk.
  # @return [String] Chunk of MAF data.
  def read_chunk_at(offset, size_hint=@chunk_size)
    f.seek(offset)
    chunk = f.read(size_hint)
    @pos = offset + chunk.bytesize
    return chunk
  end
end
|
207
|
+
|
208
|
+
# Variant ChunkReader using a read-ahead thread with internal
|
209
|
+
# queue for sequential parsing. Not useful for random-access
|
210
|
+
# parsing.
|
211
|
+
#
|
212
|
+
# Only beneficial on JRuby.
|
213
|
+
# Variant ChunkReader using a read-ahead thread with internal
# queue for sequential parsing. Not useful for random-access
# parsing.
#
# Only beneficial on JRuby.
class ThreadedChunkReader < ChunkReader

  def initialize(f, chunk_size, buffer_size=64)
    super(f, chunk_size)
    @buffer = SizedQueue.new(buffer_size)
    @eof_reached = false
    start_read_ahead
  end

  # Spawn a read-ahead thread. Called from {#initialize}.
  def start_read_ahead
    @read_thread = Thread.new { read_ahead }
  end

  # Read ahead into the queue until EOF, recording each chunk together
  # with its starting file offset. Any failure is captured so the
  # consumer thread can re-raise it from {#read_chunk}.
  def read_ahead
    offset = 0
    until f.eof?
      data = f.read(@chunk_size)
      @buffer << [offset, data]
      offset += data.bytesize
    end
    @eof_reached = true
  rescue Exception
    @read_ahead_ex = $!
    $stderr.puts "read_ahead aborting: #{$!}"
  end

  # (see ChunkReader#read_chunk)
  def read_chunk
    raise "readahead failed: #{@read_ahead_ex}" if @read_ahead_ex
    return nil if @eof_reached && @buffer.empty?
    chunk_offset, chunk = @buffer.shift
    @pos = chunk_offset
    chunk
  end

end
|
261
|
+
|
262
|
+
# MAF parsing code useful for sequential and random-access parsing.
|
263
|
+
# MAF parsing code useful for sequential and random-access parsing.
module MAFParsing

  BLOCK_START = /^(?=a)/
  BLOCK_START_OR_EOS = /(?:^(?=a))|\z/
  EOL_OR_EOF = /\n|\z/

  # Record the position of the last block start in the current chunk.
  def set_last_block_pos!
    @last_block_pos = s.string.rindex(BLOCK_START)
  end

  ## On finding the start of a block:
  ## See whether we are at the last block in the chunk.
  ## If at the last block:
  ##   If at EOF: last block.
  ##   If not:
  ##     Read the next chunk
  ##     Find the start of the next block in that chunk
  ##     Concatenate the two block fragments
  ##     Parse the resulting block
  ##     Promote the next scanner, positioned

  # Parse the block at the current position, joining fragments
  # across chunk boundaries if necessary.
  #
  # @return [Block] alignment block
  # @api public
  def parse_block
    return nil if at_end
    if s.pos != last_block_pos
      # in non-trailing block
      parse_block_data
    else
      # in trailing block fragment
      parse_trailing_fragment
    end
  end

  ## Read chunks and accumulate a leading fragment until we
  ## encounter a block start or EOF.
  def gather_leading_fragment
    leading_frag = ''
    while true
      next_chunk_start = cr.pos
      next_chunk = cr.read_chunk
      if next_chunk
        next_scanner = StringScanner.new(next_chunk)
        # If this trailing fragment ends with a newline, then an
        # 'a' at the beginning of the leading fragment is the
        # start of the next alignment block.
        if trailing_nl?(leading_frag) || trailing_nl?(s.string)
          pat = BLOCK_START
        else
          pat = /(?:\n(?=a))/
        end
        frag = next_scanner.scan_until(pat)
        if frag
          # got block start
          leading_frag << frag
          break
        else
          # no block start in this chunk; accumulate it all
          leading_frag << next_chunk
        end
      else
        # EOF
        @at_end = true
        break
      end
    end
    return leading_frag, next_scanner, next_chunk_start
  end

  # Join the trailing fragment of the current chunk with the
  # leading fragment of the next chunk and parse the resulting
  # block.
  #
  # @return [Block] the alignment block.
  def parse_trailing_fragment
    leading_frag, next_scanner, next_chunk_start = gather_leading_fragment
    # join fragments and parse
    trailing_frag = s.rest
    joined_block = trailing_frag + leading_frag
    @chunk_start = chunk_start + s.pos
    @s = StringScanner.new(joined_block)
    begin
      block = parse_block_data
    rescue ParseError => pe
      parse_error "Could not parse joined fragments: #{pe}\nTRAILING: #{trailing_frag}\nLEADING: #{leading_frag}"
    end
    # Set up to parse the next block
    @s = next_scanner
    @chunk_start = next_chunk_start
    unless @at_end
      set_last_block_pos!
    end
    return block
  end

  # Raise a {ParseError}, indicating position within the MAF file
  # and the chunk as well as the text surrounding the current
  # scanner position.
  #
  # @param [String] msg the error message
  def parse_error(msg)
    s_start = [s.pos - 10, 0].max
    s_end = [s.pos + 10, s.string.length].min
    # BUG FIX: the original returned '' whenever s_start == 0, losing
    # the left context for any error in the first ten bytes of a chunk.
    left = s.string[s_start...s.pos] || ''
    right = s.string[s.pos..s_end]
    extra = "pos #{s.pos} [#{chunk_start + s.pos}], last #{last_block_pos}"

    raise ParseError, "#{msg} at: '#{left}>><<#{right}' (#{extra})"
  end

  # First bytes of the line types we dispatch on.
  S = 's'.getbyte(0)
  I = 'i'.getbyte(0)
  E = 'e'.getbyte(0)
  Q = 'q'.getbyte(0)
  COMMENT = '#'.getbyte(0)

  # Parse a {Block} from the current position. Requires that {#s}
  # and {#chunk_start} be set correctly.
  #
  # @return [Block] the alignment block.
  def parse_block_data
    block_start_pos = s.pos
    block_offset = chunk_start + block_start_pos
    s.scan(/^a\s*/) || parse_error("bad a line")
    block_vars = parse_maf_vars()
    seqs = []
    payload = s.scan_until(/^(?=a)/)
    unless payload
      payload = s.rest
      s.pos = s.string.size # jump to EOS
    end
    lines = payload.split("\n")
    until lines.empty?
      line = lines.shift
      first = line.getbyte(0)
      if first == S
        seq = parse_seq_line(line, sequence_filter)
        seqs << seq if seq
      elsif first == E && parse_empty
        e_seq = parse_empty_line(line, sequence_filter)
        seqs << e_seq if e_seq
      elsif first == I && parse_extended
        parts = line.split
        parse_error("wrong i source #{parts[1]}!") unless seqs.last.source == parts[1]
        seqs.last.i_data = parts.slice(2..6)
      elsif first == Q && parse_extended
        _, src, quality = line.split
        parse_error("wrong q source #{src}!") unless seqs.last.source == src
        seqs.last.quality = quality
      elsif [I, E, Q, COMMENT, nil].include? first
        # ignored line type (or blank line yielding nil first byte)
        next
      else
        parse_error "unexpected line: '#{line}'"
      end
    end
    return Block.new(block_vars,
                     seqs,
                     block_offset,
                     s.pos - block_start_pos)
  end

  # Parse an 's' line.
  # @return [Sequence, nil] nil when rejected by the sequence filter.
  def parse_seq_line(line, filter)
    _, src, start, size, strand, src_size, text = line.split
    return nil if filter && ! seq_filter_ok?(src, filter)
    begin
      Sequence.new(src,
                   start.to_i,
                   size.to_i,
                   STRAND_SYM.fetch(strand),
                   src_size.to_i,
                   text)
    rescue KeyError
      parse_error "invalid sequence line: #{line}"
    end
  end

  # Parse an 'e' line.
  # @return [EmptySequence, nil] nil when rejected by the sequence filter.
  def parse_empty_line(line, filter)
    _, src, start, size, strand, src_size, status = line.split
    return nil if filter && ! seq_filter_ok?(src, filter)
    begin
      EmptySequence.new(src,
                        start.to_i,
                        size.to_i,
                        STRAND_SYM.fetch(strand),
                        src_size.to_i,
                        status)
    rescue KeyError
      parse_error "invalid empty sequence line: #{line}"
    end
  end

  # Indicates whether the given sequence source should be parsed,
  # given the current sequence filters.
  def seq_filter_ok?(src, filter)
    if filter[:only_species]
      src_sp = src.split('.', 2)[0]
      m = filter[:only_species].find { |sp| src_sp == sp }
      return m
    else
      return true
    end
  end

  # Parse key-value pairs from the MAF header or an 'a' line.
  # @return [Hash]
  def parse_maf_vars
    vars = {}
    while s.scan(/(\w+)=(\S*)\s+/) do
      vars[s[1].to_sym] = s[2]
    end
    vars
  end

  # Does `string` have a trailing newline?
  def trailing_nl?(string)
    if string.empty?
      false
    else
      # BUG FIX: the original tested s.string (the scanner's buffer)
      # instead of the argument, making the result independent of
      # the string actually passed in.
      string[string.size - 1] == "\n"
    end
  end

  STRAND_SYM = {
    '+' => :+,
    '-' => :-
  }
end
|
502
|
+
|
503
|
+
# A MAF parsing context, used for random-access parsing.
|
504
|
+
# A MAF parsing context, used for random-access parsing.
class ParseContext
  include MAFParsing
  attr_accessor :f, :s, :cr, :parser
  attr_accessor :chunk_start, :last_block_pos, :at_end

  # @param fd [File] independent file handle for this context.
  # @param chunk_size [Integer] chunk size for the reader.
  # @param parser [Parser] owning parser (supplies filters/options).
  # @param opts [Hash] parser options (only :chunk_reader is used here).
  def initialize(fd, chunk_size, parser, opts)
    @f = fd
    @parser = parser
    reader = opts[:chunk_reader] || ChunkReader
    @cr = reader.new(@f, chunk_size)
    @last_block_pos = -1
  end

  # Delegate the sequence filter to the owning parser.
  def sequence_filter
    parser.sequence_filter
  end

  # Delegate the 'e' line option to the owning parser.
  def parse_empty
    parser.parse_empty
  end

  # Delegate the 'i'/'q' line option to the owning parser.
  def parse_extended
    parser.parse_extended
  end

  def set_last_block_pos!
    @last_block_pos = s.string.rindex(BLOCK_START)
  end

  # Fetch and parse blocks at given `offset` and `len`
  # @param [Integer] offset Offset to start parsing at.
  # @param [Integer] len Number of bytes to read.
  # @param [Array] block_offsets Offsets of blocks to parse.
  # @return [Enumerator<Block>]
  def fetch_blocks(offset, len, block_offsets)
    start_chunk_read_if_needed(offset, len)
    # read chunks until we have the entire merged set of
    # blocks ready to parse, to avoid fragment joining
    append_chunks_to(len)
    # parse the blocks
    Enumerator.new do |y|
      block_offsets.each do |expected_offset|
        block = parse_block
        # BUG FIX: the original called `ctx.parse_error`, but no `ctx`
        # local exists in this scope (we ARE the context), which raised
        # NameError instead of the intended ParseError.
        parse_error("expected a block at offset #{expected_offset} but could not parse one!") unless block
        parse_error("got block with offset #{block.offset}, expected #{expected_offset}!") unless block.offset == expected_offset
        y << block
      end
    end
  end

  # Position the scanner for a read at `offset`, reusing the current
  # chunk when it already covers that offset.
  def start_chunk_read_if_needed(offset, len)
    if chunk_start \
      && (chunk_start <= offset) \
      && (offset < (chunk_start + s.string.size))
      ## the selected offset is in the current chunk
      s.pos = offset - chunk_start
    else
      chunk = cr.read_chunk_at(offset, len)
      @chunk_start = offset
      @s = StringScanner.new(chunk)
    end
  end

  # Append chunks to the scanner's buffer until it holds at least
  # `len` bytes.
  def append_chunks_to(len)
    # XXX: need to rethink this for BGZF; prefetching ChunkReader
    while s.string.size < len
      chunk = cr.read_chunk()
      # BUG FIX: a nil chunk (premature EOF) previously crashed with an
      # opaque TypeError from String#<<; fail with a clear message.
      raise "unexpected EOF while assembling #{len} bytes (have #{s.string.size})" if chunk.nil?
      s.string << chunk
    end
  end

end
|
576
|
+
|
577
|
+
# MAF parser, used for sequential and random-access parsing.
|
578
|
+
#
|
579
|
+
# Options:
|
580
|
+
#
|
581
|
+
# * `:parse_extended`: whether to parse 'i' and 'q' lines
|
582
|
+
# * `:parse_empty`: whether to parse 'e' lines
|
583
|
+
# * `:chunk_size`: read MAF file in chunks of this many bytes
|
584
|
+
# * `:random_chunk_size`: as above, but for random access ({#fetch_blocks})
|
585
|
+
# * `:merge_max`: merge up to this many bytes of blocks for
|
586
|
+
# random access
|
587
|
+
# * `:chunk_reader`: use the specified class to read
|
588
|
+
# chunks. (Only useful with {ThreadedChunkReader}).
|
589
|
+
# * `:threads`: number of threads to use for parallel
|
590
|
+
# parsing. Only useful under JRuby.
|
591
|
+
# @api public
|
592
|
+
|
593
|
+
# MAF parser, used for sequential and random-access parsing.
#
# Options:
#
# * `:parse_extended`: whether to parse 'i' and 'q' lines
# * `:parse_empty`: whether to parse 'e' lines
# * `:chunk_size`: read MAF file in chunks of this many bytes
# * `:random_chunk_size`: as above, but for random access ({#fetch_blocks})
# * `:merge_max`: merge up to this many bytes of blocks for
#   random access
# * `:chunk_reader`: use the specified class to read
#   chunks. (Only useful with {ThreadedChunkReader}).
# * `:threads`: number of threads to use for parallel
#   parsing. Only useful under JRuby.
# @api public
class Parser
  include MAFParsing

  # @return [Header] header of the MAF file being parsed.
  attr_reader :header
  # @return [String] path of MAF file being parsed.
  attr_reader :file_spec
  # @return [File] file handle for MAF file.
  attr_reader :f
  # @return [StringScanner] scanner for parsing.
  attr_reader :s
  # @return [ChunkReader] ChunkReader.
  attr_reader :cr
  # @return [Boolean] whether EOF has been reached.
  attr_reader :at_end
  # @return [Hash] parser options.
  attr_reader :opts
  # @return [Integer] starting offset of the current chunk.
  attr_reader :chunk_start
  # @return [Integer] offset of the last block start in this chunk.
  attr_reader :last_block_pos
  # Sequence filter to apply.
  # @api public
  attr_accessor :sequence_filter

  # @api private
  attr_accessor :parse_extended
  attr_accessor :parse_empty

  SEQ_CHUNK_SIZE = 131072
  RANDOM_CHUNK_SIZE = 4096
  MERGE_MAX = SEQ_CHUNK_SIZE

  # Create a new parser instance.
  #
  # @param [String] file_spec path of file to parse.
  # @param [Hash] opts parser options.
  # @api public
  def initialize(file_spec, opts={})
    @opts = opts
    chunk_size = opts[:chunk_size] || SEQ_CHUNK_SIZE
    @random_access_chunk_size = opts[:random_chunk_size] || RANDOM_CHUNK_SIZE
    @merge_max = opts[:merge_max] || MERGE_MAX
    @parse_extended = opts[:parse_extended] || false
    @parse_empty = opts[:parse_empty] || false
    @chunk_start = 0
    @file_spec = file_spec
    @f = File.open(file_spec)
    reader = opts[:chunk_reader] || ChunkReader
    @cr = reader.new(@f, chunk_size)
    @s = StringScanner.new(cr.read_chunk())
    set_last_block_pos!
    @at_end = false
    _parse_header()
  end

  # Create a {ParseContext} for random access, using the given
  # chunk size.
  #
  # @return [ParseContext]
  # @api private
  def context(chunk_size)
    # IO#dup calls dup(2) internally, but seems broken on JRuby...
    fd = File.open(file_spec)
    ParseContext.new(fd, chunk_size, self, @opts)
  end

  # Execute the given block with a {ParseContext} using the given
  # `chunk_size` as an argument. The context's file handle is closed
  # when the block returns.
  #
  # @see #context
  # @api private
  def with_context(chunk_size)
    ctx = context(chunk_size)
    begin
      yield ctx
    ensure
      ctx.f.close
    end
  end

  # Fetch and parse blocks given by `fetch_list`.
  #
  # `fetch_list` should be an array of `[offset, length]` tuples.
  #
  # @param [Array] fetch_list the fetch list
  # @return [Array<Block>] the requested alignment blocks
  def fetch_blocks(fetch_list)
    merged = merge_fetch_list(fetch_list)
    if RUBY_PLATFORM == 'java' && @opts.fetch(:threads, 1) > 1
      fetch_blocks_merged_parallel(merged)
    else
      fetch_blocks_merged(merged)
    end
  end

  # Fetch and parse the blocks given by the merged fetch list.
  #
  # @param [Array] fetch_list merged fetch list from {#merge_fetch_list}.
  # @return [Array<Block>] the requested alignment blocks
  def fetch_blocks_merged(fetch_list)
    Enumerator.new do |y|
      start = Time.now
      total_size = fetch_list.collect { |e| e[1] }.reduce(:+)
      with_context(@random_access_chunk_size) do |ctx|
        fetch_list.each do |e|
          ctx.fetch_blocks(*e).each do |block|
            y << block
          end
        end
      end
      elapsed = Time.now - start
      rate = (total_size / 1048576.0) / elapsed
      $stderr.printf("Fetched blocks in %.3fs, %.1f MB/s.\n",
                     elapsed, rate)
    end
  end

  # Fetch and parse the blocks given by the merged fetch list, in
  # parallel. Uses the number of threads specified by the
  # `:threads` parser option.
  #
  # @param [Array] fetch_list merged fetch list from {#merge_fetch_list}.
  # @return [Array<Block>] the requested alignment blocks
  def fetch_blocks_merged_parallel(fetch_list)
    Enumerator.new do |y|
      total_size = fetch_list.collect { |e| e[1] }.reduce(:+)
      start = Time.now
      n_threads = @opts.fetch(:threads, 1)
      # TODO: break entries up into longer runs for more
      # sequential I/O
      jobs = java.util.concurrent.ConcurrentLinkedQueue.new(fetch_list)
      completed = java.util.concurrent.LinkedBlockingQueue.new(128)
      threads = []
      n_threads.times { threads << make_worker(jobs, completed) }

      n_completed = 0
      while (n_completed < fetch_list.size)
        c = completed.poll(5, java.util.concurrent.TimeUnit::SECONDS)
        if c.nil?
          if threads.find { |t| t.alive? }
            next
          else
            raise "No threads alive, completed #{n_completed}/#{fetch_list.size} jobs!"
          end
        end
        raise "worker failed: #{c}" if c.is_a? Exception
        c.each do |block|
          y << block
        end
        n_completed += 1
      end
      threads.each { |t| t.join }
      elapsed = Time.now - start
      $stderr.printf("Fetched blocks from %d threads in %.1fs.\n",
                     n_threads,
                     elapsed)
      mb = total_size / 1048576.0
      $stderr.printf("%.3f MB processed (%.1f MB/s).\n",
                     mb,
                     mb / elapsed)
    end
  end

  # Create a worker thread for parallel parsing.
  #
  # @see #fetch_blocks_merged_parallel
  def make_worker(jobs, completed)
    Thread.new do
      with_context(@random_access_chunk_size) do |ctx|
        while true
          req = jobs.poll
          break unless req
          begin
            n_blocks = req[2].size
            blocks = ctx.fetch_blocks(*req).to_a
            if blocks.size != n_blocks
              # BUG FIX: the original interpolated `e` here, which is nil
              # at this point (it is only bound by the rescue below);
              # report the failing request instead.
              raise "expected #{n_blocks}, got #{blocks.size}: #{req.inspect}"
            end
            completed.put(blocks)
          rescue Exception => e
            # Deliberately broad: any worker failure must be handed back
            # to the consumer so it does not wait forever.
            completed.put(e)
            $stderr.puts "Worker failing: #{e.class}: #{e}"
            $stderr.puts e.backtrace.join("\n")
            raise e
          end
        end
      end
    end
  end

  # Merge contiguous blocks in the given fetch list, up to
  # `:merge_max` bytes.
  #
  # Returns `[offset, size, [offset1, offset2, ...]]` tuples.
  def merge_fetch_list(orig_fl)
    fl = orig_fl.dup
    r = []
    until fl.empty? do
      cur = fl.shift
      if r.last \
        && (r.last[0] + r.last[1]) == cur[0] \
        && (r.last[1] + cur[1]) <= @merge_max
        # contiguous with the previous one
        # add to length and record the offset
        r.last[1] += cur[1]
        r.last[2] << cur[0]
      else
        cur << [cur[0]]
        r << cur
      end
    end
    return r
  end

  # Parse the header of the MAF file.
  def _parse_header
    parse_error("not a MAF file") unless s.scan(/##maf\s*/)
    vars = parse_maf_vars()
    align_params = nil
    while s.scan(/^#\s*(.+?)\n/)
      if align_params == nil
        align_params = s[1]
      else
        align_params << ' ' << s[1]
      end
    end
    @header = Header.new(vars, align_params)
    # BUG FIX: the original wrote `s.skip_until BLOCK_START || parse_error(...)`,
    # which parses as `s.skip_until(BLOCK_START || ...)`; BLOCK_START is always
    # truthy, so the error branch could never fire.
    s.skip_until(BLOCK_START) || parse_error("Cannot find block start!")
  end

  # Parse all alignment blocks until EOF.
  #
  # Delegates to {#parse_blocks_parallel} if `:threads` is set
  # under JRuby.
  #
  # @return [Enumerator<Block>] enumerator of alignment blocks.
  # @api public
  def parse_blocks
    if RUBY_PLATFORM == 'java' && @opts.has_key?(:threads)
      parse_blocks_parallel
    else
      Enumerator.new do |y|
        until at_end
          y << parse_block()
        end
      end
    end
  end

  # Parse alignment blocks with a worker thread.
  #
  # @return [Enumerator<Block>] enumerator of alignment blocks.
  # @api private
  def parse_blocks_parallel
    queue = java.util.concurrent.LinkedBlockingQueue.new(128)
    worker = Thread.new do
      begin
        until at_end
          queue.put(parse_block())
        end
        queue.put(:eof)
      rescue
        $stderr.puts "worker exiting: #{$!.class}: #{$!}"
        $stderr.puts $!.backtrace.join("\n")
      end
    end
    Enumerator.new do |y|
      saw_eof = false
      while worker.alive?
        block = queue.poll(1, java.util.concurrent.TimeUnit::SECONDS)
        if block == :eof
          saw_eof = true
          break
        elsif block
          y << block
        end
      end
      unless saw_eof
        raise "worker exited unexpectedly!"
      end
    end
  end

  # Yield each alignment block until EOF.
  def each_block
    until at_end
      yield parse_block()
    end
  end

end
|
885
|
+
|
886
|
+
end
|
887
|
+
|
888
|
+
end
|