bio-maf 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76)
  1. data/.document +5 -0
  2. data/.simplecov +1 -0
  3. data/.travis.yml +16 -0
  4. data/.yardopts +3 -0
  5. data/DEVELOPMENT.md +40 -0
  6. data/Gemfile +23 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +209 -0
  9. data/Rakefile +76 -0
  10. data/VERSION +1 -0
  11. data/benchmarks/dispatch_bench +53 -0
  12. data/benchmarks/iter_bench +44 -0
  13. data/benchmarks/read_bench +40 -0
  14. data/benchmarks/sort_bench +33 -0
  15. data/benchmarks/split_bench +33 -0
  16. data/bin/maf_count +82 -0
  17. data/bin/maf_dump_blocks +27 -0
  18. data/bin/maf_extract_ranges_count +44 -0
  19. data/bin/maf_index +88 -0
  20. data/bin/maf_parse_bench +94 -0
  21. data/bin/maf_to_fasta +68 -0
  22. data/bin/maf_write +84 -0
  23. data/bin/random_ranges +35 -0
  24. data/features/maf-indexing.feature +31 -0
  25. data/features/maf-output.feature +29 -0
  26. data/features/maf-parsing.feature +44 -0
  27. data/features/maf-querying.feature +75 -0
  28. data/features/maf-to-fasta.feature +50 -0
  29. data/features/step_definitions/convert_steps.rb +45 -0
  30. data/features/step_definitions/index_steps.rb +20 -0
  31. data/features/step_definitions/output_steps.rb +27 -0
  32. data/features/step_definitions/parse_steps.rb +63 -0
  33. data/features/step_definitions/query_steps.rb +31 -0
  34. data/features/step_definitions/ucsc_bin_steps.rb +14 -0
  35. data/features/support/env.rb +16 -0
  36. data/features/ucsc-bins.feature +24 -0
  37. data/lib/bio/maf/index.rb +620 -0
  38. data/lib/bio/maf/parser.rb +888 -0
  39. data/lib/bio/maf/struct.rb +63 -0
  40. data/lib/bio/maf/writer.rb +63 -0
  41. data/lib/bio/maf.rb +4 -0
  42. data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
  43. data/lib/bio/ucsc/ucsc_bin.rb +117 -0
  44. data/lib/bio/ucsc.rb +2 -0
  45. data/lib/bio-maf/maf.rb +3 -0
  46. data/lib/bio-maf.rb +12 -0
  47. data/man/.gitignore +1 -0
  48. data/man/maf_index.1 +105 -0
  49. data/man/maf_index.1.markdown +97 -0
  50. data/man/maf_index.1.ronn +83 -0
  51. data/man/maf_to_fasta.1 +53 -0
  52. data/man/maf_to_fasta.1.ronn +51 -0
  53. data/spec/bio/maf/index_spec.rb +363 -0
  54. data/spec/bio/maf/parser_spec.rb +354 -0
  55. data/spec/bio/maf/struct_spec.rb +75 -0
  56. data/spec/spec_helper.rb +14 -0
  57. data/test/data/big-block.maf +15999 -0
  58. data/test/data/chr22_ieq.maf +11 -0
  59. data/test/data/chrY-1block.maf +6 -0
  60. data/test/data/empty +0 -0
  61. data/test/data/empty.db +0 -0
  62. data/test/data/mm8_chr7_tiny.kct +0 -0
  63. data/test/data/mm8_chr7_tiny.maf +76 -0
  64. data/test/data/mm8_mod_a.maf +7 -0
  65. data/test/data/mm8_single.maf +13 -0
  66. data/test/data/mm8_subset_a.maf +23 -0
  67. data/test/data/t1-bad1.maf +15 -0
  68. data/test/data/t1.fasta +12 -0
  69. data/test/data/t1.maf +15 -0
  70. data/test/data/t1a.maf +17 -0
  71. data/test/helper.rb +18 -0
  72. data/test/test_bio-maf.rb +7 -0
  73. data/travis-ci/install_kc +13 -0
  74. data/travis-ci/install_kc_java +13 -0
  75. data/travis-ci/report_errors +4 -0
  76. metadata +181 -0
@@ -0,0 +1,888 @@
1
+ require 'strscan'
2
+ require 'java' if RUBY_PLATFORM == 'java'
3
+
4
+ # @api public
5
+ module Bio
6
+ # @api public
7
+ module MAF
8
+
9
# Error raised when MAF input cannot be parsed.
#
# Fixed: previously subclassed Exception directly, which a bare
# `rescue` will not catch and which groups application errors with
# interpreter-level failures (SignalException, SystemExit).
# Application-level errors should derive from StandardError.
# @api public
class ParseError < StandardError; end
11
+
12
# A MAF header, holding the variable-value pairs from the `##maf`
# line of the file together with the alignment parameters.
# @api public
class Header
  # Variable-value pairs from the ##maf line
  # @return [Hash]
  attr_accessor :vars
  # Alignment parameters from the MAF header.
  # @return [Hash]
  attr_accessor :alignment_params

  def initialize(vars, params)
    @vars = vars
    @alignment_params = params
  end

  # Simple readers that just look up the corresponding key in
  # {#vars}: the required +version+ parameter and the optional
  # +scoring+ parameter.
  # @return [String]
  [:version, :scoring].each do |key|
    define_method(key) { vars[key] }
  end
end
41
+
42
# A MAF alignment block.
# @api public
class Block
  # Parameters from the 'a' line starting the alignment block.
  attr_reader :vars
  # Sequences, one per 's' or 'e' line.
  # @return [Array<Sequence>]
  attr_reader :sequences
  # Offset of the alignment block within the MAF file, in bytes.
  # @return [Integer]
  attr_reader :offset
  # Size of the alignment block within the MAF file, in bytes.
  # @return [Integer]
  attr_reader :size

  def initialize(*args)
    @vars, @sequences, @offset, @size = args
  end

  # The sequence at index `i`.
  # @return [Sequence]
  def raw_seq(i)
    sequences.fetch(i)
  end

  # Yield each sequence in turn.
  def each_raw_seq
    sequences.each { |seq| yield seq }
  end

  # Text size of the alignment block: the number of text characters
  # in each line of sequence data, including dashes and other gaps
  # in the sequence.
  def text_size
    sequences.first.text.size
  end
end
77
+
78
# A sequence within an alignment block.
# @api public
class Sequence
  # @return [String] Source sequence name.
  attr_reader :source
  # @return [Integer] Zero-based start position.
  attr_reader :start
  # @return [Integer] Size of aligning region in source sequence.
  attr_reader :size
  # :+ or :-, indicating which strand the alignment is to.
  # @return [Symbol]
  attr_reader :strand
  # Size of the entire source sequence, not just the aligning region.
  # @return [Integer]
  attr_reader :src_size
  # Sequence data for the alignment, including insertions.
  # @return [String]
  attr_reader :text
  # Array of raw synteny information from 'i' line.
  # @return [Array<String>]
  attr_accessor :i_data
  # Quality string from 'q' line.
  # @return [String]
  attr_accessor :quality
  alias_method :source_size, :src_size

  def initialize(*args)
    @source, @start, @size, @strand, @src_size, @text = args
  end

  # Whether this sequence is empty. Only true for {EmptySequence}
  # instances from 'e' lines.
  def empty?
    false
  end

  # Write this sequence to `writer` as a FASTA record labelled
  # "source:start-end".
  def write_fasta(writer)
    label = "#{source}:#{start}-#{start + size}"
    writer.write(label, text)
  end
end
120
+
121
# An empty sequence record from an 'e' line.
#
# This indicates that "there isn't aligning DNA for a species but
# that the current block is bridged by a chain that connects
# blocks before and after this block" (MAF spec).
# @api public
class EmptySequence < Sequence
  # Status character from the 'e' line.
  attr_reader :status

  def initialize(*args)
    # First five positional arguments are the standard sequence
    # fields; the sixth is the status flag.
    super(*args.first(5))
    @status = args[5]
  end

  # Empty sequences carry no alignment text.
  def text
    ''
  end

  def empty?
    true
  end

  def write_fasta(writer)
    raise "empty sequence output not implemented!"
  end
end
147
+
148
# Reads MAF files in chunks.
# @api private
class ChunkReader
  # Size, in bytes, of the chunks to read. Must be a power of 2.
  # @return [Integer]
  attr_accessor :chunk_size
  # Current position in the file, updated after every read.
  # @return [Integer]
  attr_accessor :pos
  # {File} from which chunks are read.
  # @return [File]
  attr_reader :f

  # @param [File] f file (or IO-like object) to read.
  # @param [Integer] chunk_size chunk size in bytes; must be a power of 2.
  def initialize(f, chunk_size)
    @f = f
    self.chunk_size = chunk_size
    @pos = 0
  end

  # Set the chunk size, validating it first.
  def chunk_size=(size)
    check_chunk_size(size)
    @chunk_size = size
    # power of 2 so don't worry about rounding
    # @chunk_shift = Math.log2(size).to_i
  end

  # Validate that `size` is a positive power of 2.
  # @raise [RuntimeError] if `size` is not a positive power of 2.
  def check_chunk_size(size)
    if size < 1
      raise "Invalid chunk size: #{size}"
    end
    ## test whether it is a power of 2
    ## cf. http://bit.ly/JExNc4
    if size & (size - 1) != 0
      # Fixed: message previously ended with a stray '}' ("#{size}}").
      raise "Invalid chunk size (not a power of 2): #{size}"
    end
  end

  # Reads the next chunk of the file.
  # @return [String] Next {#chunk_size} bytes of MAF data, or nil at EOF.
  def read_chunk
    chunk = f.read(@chunk_size)
    @pos += chunk.bytesize if chunk
    return chunk
  end

  # Reads a chunk of the file.
  #
  # Currently always reads size_hint bytes but this may change
  # with BGZF support.
  #
  # @param [Integer] offset file offset to read from.
  # @param [Integer] size_hint desired size of chunk.
  # @return [String] Chunk of MAF data.
  def read_chunk_at(offset, size_hint=@chunk_size)
    f.seek(offset)
    chunk = f.read(size_hint)
    @pos = offset + chunk.bytesize
    return chunk
  end
end
207
+
208
# Variant ChunkReader using a read-ahead thread with internal
# queue for sequential parsing. Not useful for random-access
# parsing.
#
# Only beneficial on JRuby.
class ThreadedChunkReader < ChunkReader

  # @param [File] f file to read from.
  # @param [Integer] chunk_size chunk size in bytes (power of 2).
  # @param [Integer] buffer_size maximum number of chunks queued
  #   ahead of the consumer; the producer blocks when full.
  def initialize(f, chunk_size, buffer_size=64)
    super(f, chunk_size)
    # SizedQueue provides the blocking hand-off between the
    # read-ahead thread and the consumer.
    @buffer = SizedQueue.new(buffer_size)
    @eof_reached = false
    start_read_ahead
  end

  # Spawn a read-ahead thread. Called from {#initialize}.
  def start_read_ahead
    @read_thread = Thread.new { read_ahead }
  end

  # Read ahead into queue: pushes [file_position, chunk] pairs
  # until EOF. On any error, records the exception in
  # @read_ahead_ex so {#read_chunk} can re-raise it to the consumer.
  def read_ahead
    # n = 0
    begin
      f_pos = 0
      until f.eof?
        chunk = f.read(@chunk_size)
        @buffer << [f_pos, chunk]
        f_pos += chunk.bytesize
        # n += 1
        # if (n % 100) == 0
        #   $stderr.puts "buffer size: #{@buffer.size}"
        # end
      end
      @eof_reached = true
    rescue Exception
      # NOTE(review): rescuing Exception surfaces any producer
      # failure to the consumer via @read_ahead_ex, but it also
      # swallows SystemExit/SignalException — consider StandardError.
      @read_ahead_ex = $!
      $stderr.puts "read_ahead aborting: #{$!}"
    end
  end

  # (see ChunkReader#read_chunk)
  #
  # NOTE(review): unlike ChunkReader#read_chunk, this sets @pos to
  # the *start* of the returned chunk (the producer's position when
  # it was read), not the end — confirm callers expect this.
  def read_chunk
    raise "readahead failed: #{@read_ahead_ex}" if @read_ahead_ex
    if @eof_reached && @buffer.empty?
      return nil
    else
      # Blocks until the read-ahead thread supplies the next chunk.
      c_pos, chunk = @buffer.shift()
      @pos = c_pos
      return chunk
    end
  end

end
261
+
262
# MAF parsing code useful for sequential and random-access parsing.
#
# Includers must provide `s` (a StringScanner over the current
# chunk), `cr` (a ChunkReader), `chunk_start`, `last_block_pos`,
# `at_end`, and the option readers `sequence_filter`,
# `parse_empty`, and `parse_extended`.
module MAFParsing

  # Matches (without consuming) the start of an alignment block ('a' line).
  BLOCK_START = /^(?=a)/
  BLOCK_START_OR_EOS = /(?:^(?=a))|\z/
  EOL_OR_EOF = /\n|\z/

  # Record the offset of the last block start in the current chunk.
  def set_last_block_pos!
    @last_block_pos = s.string.rindex(BLOCK_START)
  end

  ## On finding the start of a block:
  ## See whether we are at the last block in the chunk.
  ## If at the last block:
  ##   If at EOF: last block.
  ##   If not:
  ##     Read the next chunk
  ##     Find the start of the next block in that chunk
  ##     Concatenate the two block fragments
  ##     Parse the resulting block
  ##     Promote the next scanner, positioned

  # Parse the block at the current position, joining fragments
  # across chunk boundaries if necessary.
  #
  # @return [Block] alignment block, or nil at EOF.
  # @api public
  def parse_block
    return nil if at_end
    if s.pos != last_block_pos
      # in non-trailing block
      parse_block_data
    else
      # in trailing block fragment
      parse_trailing_fragment
    end
  end

  ## Read chunks and accumulate a leading fragment until we
  ## encounter a block start or EOF.
  def gather_leading_fragment
    leading_frag = ''
    while true
      next_chunk_start = cr.pos
      next_chunk = cr.read_chunk
      if next_chunk
        next_scanner = StringScanner.new(next_chunk)
        # If this trailing fragment ends with a newline, then an
        # 'a' at the beginning of the leading fragment is the
        # start of the next alignment block.
        if trailing_nl?(leading_frag) || trailing_nl?(s.string)
          pat = BLOCK_START
        else
          pat = /(?:\n(?=a))/
        end
        frag = next_scanner.scan_until(pat)
        if frag
          # got block start
          leading_frag << frag
          break
        else
          # no block start in this chunk; accumulate all of it
          leading_frag << next_chunk
        end
      else
        # EOF
        @at_end = true
        break
      end
    end
    return leading_frag, next_scanner, next_chunk_start
  end

  # Join the trailing fragment of the current chunk with the
  # leading fragment of the next chunk and parse the resulting
  # block.
  #
  # @return [Block] the alignment block.
  def parse_trailing_fragment
    leading_frag, next_scanner, next_chunk_start = gather_leading_fragment
    # join fragments and parse
    trailing_frag = s.rest
    joined_block = trailing_frag + leading_frag
    @chunk_start = chunk_start + s.pos
    @s = StringScanner.new(joined_block)
    begin
      block = parse_block_data
    rescue ParseError => pe
      parse_error "Could not parse joined fragments: #{pe}\nTRAILING: #{trailing_frag}\nLEADING: #{leading_frag}"
    end
    # Set up to parse the next block
    @s = next_scanner
    @chunk_start = next_chunk_start
    unless @at_end
      set_last_block_pos!
    end
    return block
  end

  # Raise a {ParseError}, indicating position within the MAF file
  # and the chunk as well as the text surrounding the current
  # scanner position.
  #
  # @param [String] msg the error message
  def parse_error(msg)
    s_start = [s.pos - 10, 0].max
    s_end = [s.pos + 10, s.string.length].min
    if s_start > 0
      left = s.string[s_start..(s.pos - 1)]
    else
      left = ''
    end
    right = s.string[s.pos..s_end]
    extra = "pos #{s.pos} [#{chunk_start + s.pos}], last #{last_block_pos}"

    raise ParseError, "#{msg} at: '#{left}>><<#{right}' (#{extra})"
  end

  # First-byte values used to dispatch lines in {#parse_block_data}.
  S = 's'.getbyte(0)
  I = 'i'.getbyte(0)
  E = 'e'.getbyte(0)
  Q = 'q'.getbyte(0)
  COMMENT = '#'.getbyte(0)

  # Parse a {Block} from the current position. Requires that {#s}
  # and {#chunk_start} be set correctly.
  #
  # @return [Block] the alignment block.
  def parse_block_data
    block_start_pos = s.pos
    block_offset = chunk_start + block_start_pos
    s.scan(/^a\s*/) || parse_error("bad a line")
    block_vars = parse_maf_vars()
    seqs = []
    payload = s.scan_until(/^(?=a)/)
    unless payload
      payload = s.rest
      s.pos = s.string.size # jump to EOS
    end
    lines = payload.split("\n")
    until lines.empty?
      line = lines.shift
      first = line.getbyte(0)
      if first == S
        seq = parse_seq_line(line, sequence_filter)
        seqs << seq if seq
      elsif first == E && parse_empty
        e_seq = parse_empty_line(line, sequence_filter)
        seqs << e_seq if e_seq
      elsif first == I && parse_extended
        parts = line.split
        parse_error("wrong i source #{parts[1]}!") unless seqs.last.source == parts[1]
        seqs.last.i_data = parts.slice(2..6)
      elsif first == Q && parse_extended
        _, src, quality = line.split
        parse_error("wrong q source #{src}!") unless seqs.last.source == src
        seqs.last.quality = quality
      elsif [I, E, Q, COMMENT, nil].include? first
        # skipped line type, comment, or blank line
        next
      else
        parse_error "unexpected line: '#{line}'"
      end
    end
    return Block.new(block_vars,
                     seqs,
                     block_offset,
                     s.pos - block_start_pos)
  end

  # Parse an 's' line.
  # @return [Sequence] or nil if filtered out.
  def parse_seq_line(line, filter)
    _, src, start, size, strand, src_size, text = line.split
    return nil if filter && ! seq_filter_ok?(src, filter)
    begin
      Sequence.new(src,
                   start.to_i,
                   size.to_i,
                   STRAND_SYM.fetch(strand),
                   src_size.to_i,
                   text)
    rescue KeyError
      parse_error "invalid sequence line: #{line}"
    end
  end

  # Parse an 'e' line.
  # @return [EmptySequence] or nil if filtered out.
  def parse_empty_line(line, filter)
    _, src, start, size, strand, src_size, status = line.split
    return nil if filter && ! seq_filter_ok?(src, filter)
    begin
      EmptySequence.new(src,
                        start.to_i,
                        size.to_i,
                        STRAND_SYM.fetch(strand),
                        src_size.to_i,
                        status)
    rescue KeyError
      parse_error "invalid empty sequence line: #{line}"
    end
  end

  # Indicates whether the given sequence source should be parsed,
  # given the current sequence filters.
  def seq_filter_ok?(src, filter)
    if filter[:only_species]
      # Species is the portion of the source before the first '.'
      src_sp = src.split('.', 2)[0]
      m = filter[:only_species].find { |sp| src_sp == sp }
      return m
    else
      return true
    end
  end

  # Parse key-value pairs from the MAF header or an 'a' line.
  # @return [Hash]
  def parse_maf_vars
    vars = {}
    while s.scan(/(\w+)=(\S*)\s+/) do
      vars[s[1].to_sym] = s[2]
    end
    vars
  end

  # Does `string` have a trailing newline?
  #
  # Fixed: this previously examined `s.string` (the scanner's whole
  # buffer) instead of the `string` argument, so both calls in
  # {#gather_leading_fragment} tested the same string and the
  # leading-fragment check documented there never worked.
  def trailing_nl?(string)
    if string.empty?
      false
    else
      string[string.size - 1] == "\n"
    end
  end

  # Mapping from strand characters to their symbol form.
  STRAND_SYM = {
    '+' => :+,
    '-' => :-
  }
end
502
+
503
# A MAF parsing context, used for random-access parsing.
class ParseContext
  include MAFParsing
  attr_accessor :f, :s, :cr, :parser
  attr_accessor :chunk_start, :last_block_pos, :at_end

  # @param [File] fd open file handle to read from.
  # @param [Integer] chunk_size chunk size for the reader.
  # @param [Parser] parser owning parser; supplies filters and options.
  # @param [Hash] opts options; honors `:chunk_reader`.
  def initialize(fd, chunk_size, parser, opts)
    @f = fd
    @parser = parser
    reader = opts[:chunk_reader] || ChunkReader
    @cr = reader.new(@f, chunk_size)
    @last_block_pos = -1
  end

  # Filters and options are delegated to the owning parser so the
  # context parses exactly as the parser would.
  def sequence_filter
    parser.sequence_filter
  end

  def parse_empty
    parser.parse_empty
  end

  def parse_extended
    parser.parse_extended
  end

  def set_last_block_pos!
    @last_block_pos = s.string.rindex(BLOCK_START)
  end

  # Fetch and parse blocks at given `offset` and `len`
  # @param [Integer] offset Offset to start parsing at.
  # @param [Integer] len Number of bytes to read.
  # @param [Array] block_offsets Offsets of blocks to parse.
  # @return [Enumerator<Block>]
  def fetch_blocks(offset, len, block_offsets)
    start_chunk_read_if_needed(offset, len)
    # read chunks until we have the entire merged set of
    # blocks ready to parse, to avoid fragment joining
    append_chunks_to(len)
    # parse the blocks
    Enumerator.new do |y|
      block_offsets.each do |expected_offset|
        block = parse_block
        # Fixed: these previously called `ctx.parse_error`, but no
        # local `ctx` exists in this method (this object *is* the
        # context), so the error path raised NameError instead.
        parse_error("expected a block at offset #{expected_offset} but could not parse one!") unless block
        parse_error("got block with offset #{block.offset}, expected #{expected_offset}!") unless block.offset == expected_offset
        y << block
      end
    end
  end

  # Position the scanner at `offset`, reading a fresh chunk unless
  # the current chunk already covers it.
  def start_chunk_read_if_needed(offset, len)
    if chunk_start \
      && (chunk_start <= offset) \
      && (offset < (chunk_start + s.string.size))
      ## the selected offset is in the current chunk
      s.pos = offset - chunk_start
    else
      chunk = cr.read_chunk_at(offset, len)
      @chunk_start = offset
      @s = StringScanner.new(chunk)
    end
  end

  # Append chunks until the scanner's buffer holds at least `len` bytes.
  def append_chunks_to(len)
    # XXX: need to rethink this for BGZF; prefetching ChunkReader
    while s.string.size < len
      s.string << cr.read_chunk()
    end
  end

end
576
+
577
# MAF parser, used for sequential and random-access parsing.
#
# Options:
#
# * `:parse_extended`: whether to parse 'i' and 'q' lines
# * `:parse_empty`: whether to parse 'e' lines
# * `:chunk_size`: read MAF file in chunks of this many bytes
# * `:random_chunk_size`: as above, but for random access ({#fetch_blocks})
# * `:merge_max`: merge up to this many bytes of blocks for
#   random access
# * `:chunk_reader`: use the specified class to read
#   chunks. (Only useful with {ThreadedChunkReader}).
# * `:threads`: number of threads to use for parallel
#   parsing. Only useful under JRuby.
# @api public
class Parser
  include MAFParsing

  # @return [Header] header of the MAF file being parsed.
  attr_reader :header
  # @return [String] path of MAF file being parsed.
  attr_reader :file_spec
  # @return [File] file handle for MAF file.
  attr_reader :f
  # @return [StringScanner] scanner for parsing.
  attr_reader :s
  # @return [ChunkReader] ChunkReader.
  attr_reader :cr
  # @return [Boolean] whether EOF has been reached.
  attr_reader :at_end
  # @return [Hash] parser options.
  attr_reader :opts
  # @return [Integer] starting offset of the current chunk.
  attr_reader :chunk_start
  # @return [Integer] offset of the last block start in this chunk.
  attr_reader :last_block_pos
  # Sequence filter to apply.
  # @api public
  attr_accessor :sequence_filter

  # @api private
  attr_accessor :parse_extended
  attr_accessor :parse_empty

  # Default chunk size for sequential parsing.
  SEQ_CHUNK_SIZE = 131072
  # Default chunk size for random access.
  RANDOM_CHUNK_SIZE = 4096
  # Default maximum number of bytes to merge for random access.
  MERGE_MAX = SEQ_CHUNK_SIZE

  # Create a new parser instance.
  #
  # @param [String] file_spec path of file to parse.
  # @param [Hash] opts parser options.
  # @api public
  def initialize(file_spec, opts={})
    @opts = opts
    chunk_size = opts[:chunk_size] || SEQ_CHUNK_SIZE
    @random_access_chunk_size = opts[:random_chunk_size] || RANDOM_CHUNK_SIZE
    @merge_max = opts[:merge_max] || MERGE_MAX
    @parse_extended = opts[:parse_extended] || false
    @parse_empty = opts[:parse_empty] || false
    @chunk_start = 0
    @file_spec = file_spec
    @f = File.open(file_spec)
    reader = opts[:chunk_reader] || ChunkReader
    @cr = reader.new(@f, chunk_size)
    @s = StringScanner.new(cr.read_chunk())
    set_last_block_pos!
    @at_end = false
    _parse_header()
  end

  # Create a {ParseContext} for random access, using the given
  # chunk size.
  #
  # @return [ParseContext]
  # @api private
  def context(chunk_size)
    # IO#dup calls dup(2) internally, but seems broken on JRuby...
    fd = File.open(file_spec)
    ParseContext.new(fd, chunk_size, self, @opts)
  end

  # Execute the given block with a {ParseContext} using the given
  # `chunk_size` as an argument.
  #
  # @see #context
  # @api private
  def with_context(chunk_size)
    ctx = context(chunk_size)
    begin
      yield ctx
    ensure
      # always release the per-context file handle
      ctx.f.close
    end
  end

  # Fetch and parse blocks given by `fetch_list`.
  #
  # `fetch_list` should be an array of `[offset, length]` tuples.
  #
  # @param [Array] fetch_list the fetch list
  # @return [Enumerator<Block>] the requested alignment blocks
  def fetch_blocks(fetch_list)
    merged = merge_fetch_list(fetch_list)
    if RUBY_PLATFORM == 'java' && @opts.fetch(:threads, 1) > 1
      fetch_blocks_merged_parallel(merged)
    else
      fetch_blocks_merged(merged)
    end
  end

  # Fetch and parse the blocks given by the merged fetch list.
  #
  # @param [Array] fetch_list merged fetch list from {#merge_fetch_list}.
  # @return [Enumerator<Block>] the requested alignment blocks
  def fetch_blocks_merged(fetch_list)
    Enumerator.new do |y|
      start = Time.now
      total_size = fetch_list.collect { |e| e[1] }.reduce(:+)
      with_context(@random_access_chunk_size) do |ctx|
        fetch_list.each do |e|
          ctx.fetch_blocks(*e).each do |block|
            y << block
          end
        end
      end
      elapsed = Time.now - start
      rate = (total_size / 1048576.0) / elapsed
      $stderr.printf("Fetched blocks in %.3fs, %.1f MB/s.\n",
                     elapsed, rate)
    end
  end

  # Fetch and parse the blocks given by the merged fetch list, in
  # parallel. Uses the number of threads specified by the
  # `:threads` parser option.
  #
  # @param [Array] fetch_list merged fetch list from {#merge_fetch_list}.
  # @return [Enumerator<Block>] the requested alignment blocks
  def fetch_blocks_merged_parallel(fetch_list)
    Enumerator.new do |y|
      total_size = fetch_list.collect { |e| e[1] }.reduce(:+)
      start = Time.now
      n_threads = @opts.fetch(:threads, 1)
      # TODO: break entries up into longer runs for more
      # sequential I/O
      jobs = java.util.concurrent.ConcurrentLinkedQueue.new(fetch_list)
      completed = java.util.concurrent.LinkedBlockingQueue.new(128)
      threads = []
      n_threads.times { threads << make_worker(jobs, completed) }

      n_completed = 0
      while (n_completed < fetch_list.size)
        c = completed.poll(5, java.util.concurrent.TimeUnit::SECONDS)
        if c.nil?
          # nothing arrived within the timeout: keep waiting while
          # any worker is alive, otherwise give up.
          if threads.find { |t| t.alive? }
            next
          else
            raise "No threads alive, completed #{n_completed}/#{fetch_list.size} jobs!"
          end
        end
        raise "worker failed: #{c}" if c.is_a? Exception
        c.each do |block|
          y << block
        end
        n_completed += 1
      end
      threads.each { |t| t.join }
      elapsed = Time.now - start
      $stderr.printf("Fetched blocks from %d threads in %.1fs.\n",
                     n_threads,
                     elapsed)
      mb = total_size / 1048576.0
      $stderr.printf("%.3f MB processed (%.1f MB/s).\n",
                     mb,
                     mb / elapsed)
    end
  end

  # Create a worker thread for parallel parsing.
  #
  # @see #fetch_blocks_merged_parallel
  def make_worker(jobs, completed)
    Thread.new do
      with_context(@random_access_chunk_size) do |ctx|
        while true
          req = jobs.poll
          break unless req
          begin
            n_blocks = req[2].size
            blocks = ctx.fetch_blocks(*req).to_a
            if blocks.size != n_blocks
              # Fixed: this message interpolated `e`, which is not
              # defined until the rescue clause below, so the raise
              # itself failed with NameError. Report the request.
              raise "expected #{n_blocks}, got #{blocks.size}: #{req.inspect}"
            end
            completed.put(blocks)
          rescue Exception => e
            # Hand the failure to the consumer before re-raising.
            completed.put(e)
            $stderr.puts "Worker failing: #{e.class}: #{e}"
            $stderr.puts e.backtrace.join("\n")
            raise e
          end
        end
      end
    end
  end

  # Merge contiguous blocks in the given fetch list, up to
  # `:merge_max` bytes.
  #
  # Returns `[offset, size, [offset1, offset2, ...]]` tuples.
  def merge_fetch_list(orig_fl)
    fl = orig_fl.dup
    r = []
    until fl.empty? do
      cur = fl.shift
      if r.last \
        && (r.last[0] + r.last[1]) == cur[0] \
        && (r.last[1] + cur[1]) <= @merge_max
        # contiguous with the previous one
        # add to length and increment count
        r.last[1] += cur[1]
        r.last[2] << cur[0]
      else
        cur << [cur[0]]
        r << cur
      end
    end
    return r
  end

  # Parse the header of the MAF file.
  # @raise [ParseError] if the file is not MAF or has no block start.
  def _parse_header
    parse_error("not a MAF file") unless s.scan(/##maf\s*/)
    vars = parse_maf_vars()
    align_params = nil
    while s.scan(/^#\s*(.+?)\n/)
      if align_params == nil
        align_params = s[1]
      else
        align_params << ' ' << s[1]
      end
    end
    @header = Header.new(vars, align_params)
    # Fixed: was `s.skip_until BLOCK_START || parse_error(...)`, which
    # parses as skip_until(BLOCK_START || ...); BLOCK_START is always
    # truthy, so the error branch could never fire and a missing
    # block start went undetected.
    s.skip_until(BLOCK_START) || parse_error("Cannot find block start!")
  end

  # Parse all alignment blocks until EOF.
  #
  # Delegates to {#parse_blocks_parallel} if `:threads` is set
  # under JRuby.
  #
  # @return [Enumerator<Block>] enumerator of alignment blocks.
  # @api public
  def parse_blocks
    if RUBY_PLATFORM == 'java' && @opts.has_key?(:threads)
      parse_blocks_parallel
    else
      Enumerator.new do |y|
        until at_end
          y << parse_block()
        end
      end
    end
  end

  # Parse alignment blocks with a worker thread.
  #
  # @return [Enumerator<Block>] enumerator of alignment blocks.
  # @api private
  def parse_blocks_parallel
    queue = java.util.concurrent.LinkedBlockingQueue.new(128)
    worker = Thread.new do
      begin
        until at_end
          queue.put(parse_block())
        end
        # :eof sentinel tells the consumer the worker finished cleanly.
        queue.put(:eof)
      rescue
        $stderr.puts "worker exiting: #{$!.class}: #{$!}"
        $stderr.puts $!.backtrace.join("\n")
      end
    end
    Enumerator.new do |y|
      saw_eof = false
      while worker.alive?
        block = queue.poll(1, java.util.concurrent.TimeUnit::SECONDS)
        if block == :eof
          saw_eof = true
          break
        elsif block
          y << block
        end
      end
      unless saw_eof
        raise "worker exited unexpectedly!"
      end
    end
  end

  # Yield each alignment block in turn until EOF.
  def each_block
    until at_end
      yield parse_block()
    end
  end

end
885
+
886
+ end
887
+
888
+ end