bio-maf 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. data/.document +5 -0
  2. data/.simplecov +1 -0
  3. data/.travis.yml +16 -0
  4. data/.yardopts +3 -0
  5. data/DEVELOPMENT.md +40 -0
  6. data/Gemfile +23 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +209 -0
  9. data/Rakefile +76 -0
  10. data/VERSION +1 -0
  11. data/benchmarks/dispatch_bench +53 -0
  12. data/benchmarks/iter_bench +44 -0
  13. data/benchmarks/read_bench +40 -0
  14. data/benchmarks/sort_bench +33 -0
  15. data/benchmarks/split_bench +33 -0
  16. data/bin/maf_count +82 -0
  17. data/bin/maf_dump_blocks +27 -0
  18. data/bin/maf_extract_ranges_count +44 -0
  19. data/bin/maf_index +88 -0
  20. data/bin/maf_parse_bench +94 -0
  21. data/bin/maf_to_fasta +68 -0
  22. data/bin/maf_write +84 -0
  23. data/bin/random_ranges +35 -0
  24. data/features/maf-indexing.feature +31 -0
  25. data/features/maf-output.feature +29 -0
  26. data/features/maf-parsing.feature +44 -0
  27. data/features/maf-querying.feature +75 -0
  28. data/features/maf-to-fasta.feature +50 -0
  29. data/features/step_definitions/convert_steps.rb +45 -0
  30. data/features/step_definitions/index_steps.rb +20 -0
  31. data/features/step_definitions/output_steps.rb +27 -0
  32. data/features/step_definitions/parse_steps.rb +63 -0
  33. data/features/step_definitions/query_steps.rb +31 -0
  34. data/features/step_definitions/ucsc_bin_steps.rb +14 -0
  35. data/features/support/env.rb +16 -0
  36. data/features/ucsc-bins.feature +24 -0
  37. data/lib/bio-maf.rb +12 -0
  38. data/lib/bio-maf/maf.rb +3 -0
  39. data/lib/bio/maf.rb +4 -0
  40. data/lib/bio/maf/index.rb +620 -0
  41. data/lib/bio/maf/parser.rb +888 -0
  42. data/lib/bio/maf/struct.rb +63 -0
  43. data/lib/bio/maf/writer.rb +63 -0
  44. data/lib/bio/ucsc.rb +2 -0
  45. data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
  46. data/lib/bio/ucsc/ucsc_bin.rb +117 -0
  47. data/man/.gitignore +1 -0
  48. data/man/maf_index.1 +105 -0
  49. data/man/maf_index.1.markdown +97 -0
  50. data/man/maf_index.1.ronn +83 -0
  51. data/man/maf_to_fasta.1 +53 -0
  52. data/man/maf_to_fasta.1.ronn +51 -0
  53. data/spec/bio/maf/index_spec.rb +363 -0
  54. data/spec/bio/maf/parser_spec.rb +354 -0
  55. data/spec/bio/maf/struct_spec.rb +75 -0
  56. data/spec/spec_helper.rb +14 -0
  57. data/test/data/big-block.maf +15999 -0
  58. data/test/data/chr22_ieq.maf +11 -0
  59. data/test/data/chrY-1block.maf +6 -0
  60. data/test/data/empty +0 -0
  61. data/test/data/empty.db +0 -0
  62. data/test/data/mm8_chr7_tiny.kct +0 -0
  63. data/test/data/mm8_chr7_tiny.maf +76 -0
  64. data/test/data/mm8_mod_a.maf +7 -0
  65. data/test/data/mm8_single.maf +13 -0
  66. data/test/data/mm8_subset_a.maf +23 -0
  67. data/test/data/t1-bad1.maf +15 -0
  68. data/test/data/t1.fasta +12 -0
  69. data/test/data/t1.maf +15 -0
  70. data/test/data/t1a.maf +17 -0
  71. data/test/helper.rb +18 -0
  72. data/test/test_bio-maf.rb +7 -0
  73. data/travis-ci/install_kc +13 -0
  74. data/travis-ci/install_kc_java +13 -0
  75. data/travis-ci/report_errors +4 -0
  76. metadata +182 -0
@@ -0,0 +1,888 @@
1
+ require 'strscan'
2
+ require 'java' if RUBY_PLATFORM == 'java'
3
+
4
+ # @api public
5
+ module Bio
6
+ # @api public
7
+ module MAF
8
+
9
# Error raised on malformed MAF input.
#
# Subclasses StandardError rather than Exception so that an
# unqualified `rescue` catches it; rescuing bare Exception also traps
# signals, exit, and memory errors, which callers never intend here.
# Existing `rescue ParseError` sites are unaffected.
# @api public
class ParseError < StandardError; end
11
+
12
# A MAF header, carrying the variable/value pairs from the "##maf"
# first line of the file together with the alignment parameters
# gathered from subsequent comment lines.
# @api public
class Header
  # Variable-value pairs from the ##maf line
  # @return [Hash]
  attr_accessor :vars
  # Alignment parameters from the MAF header.
  # @return [Hash]
  attr_accessor :alignment_params

  def initialize(vars, params)
    @vars, @alignment_params = vars, params
  end

  # The required version parameter.
  # @return [String]
  def version
    @vars[:version]
  end

  # The optional scoring parameter, if present.
  # @return [String]
  def scoring
    @vars[:scoring]
  end
end
41
+
42
# A MAF alignment block.
# @api public
class Block
  # Parameters from the 'a' line starting the alignment block.
  attr_reader :vars
  # Sequences, one per 's' or 'e' line.
  # @return [Array<Sequence>]
  attr_reader :sequences
  # Offset of the alignment block within the MAF file, in bytes.
  # @return [Integer]
  attr_reader :offset
  # Size of the alignment block within the MAF file, in bytes.
  # @return [Integer]
  attr_reader :size

  def initialize(*args)
    @vars, @sequences, @offset, @size = *args
  end

  # The sequence at index +i+.
  # @raise [IndexError] when +i+ is out of range.
  def raw_seq(i)
    @sequences.fetch(i)
  end

  # Yield each sequence of the block in order.
  def each_raw_seq
    @sequences.each do |seq|
      yield seq
    end
  end

  # Text size of the alignment block. This is the number of text
  # characters in each line of sequence data, including dashes and
  # other gaps in the sequence.
  def text_size
    @sequences.first.text.size
  end
end
77
+
78
# A sequence within an alignment block.
# @api public
class Sequence
  # @return [String] Source sequence name.
  attr_reader :source
  # @return [Integer] Zero-based start position.
  attr_reader :start
  # @return [Integer] Size of aligning region in source sequence.
  attr_reader :size
  # :+ or :-, indicating which strand the alignment is to.
  # @return [Symbol]
  attr_reader :strand
  # Size of the entire source sequence, not just the aligning
  # region.
  # @return [Integer]
  attr_reader :src_size
  # Sequence data for the alignment, including insertions.
  # @return [String]
  attr_reader :text
  # Array of raw synteny information from 'i' line.
  # @return [Array<String>]
  attr_accessor :i_data
  # Quality string from 'q' line.
  # @return [String]
  attr_accessor :quality
  alias_method :source_size, :src_size

  def initialize(*args)
    @source, @start, @size, @strand, @src_size, @text = *args
  end

  # Whether this sequence is empty. Only true for {EmptySequence}
  # instances from 'e' lines.
  def empty?
    false
  end

  # Emit this sequence through +writer+ in FASTA form, labelled by
  # source and the half-open aligned interval.
  def write_fasta(writer)
    label = "#{source}:#{start}-#{start + size}"
    writer.write(label, text)
  end
end

# An empty sequence record from an 'e' line.
#
# This indicates that "there isn't aligning DNA for a species but
# that the current block is bridged by a chain that connects
# blocks before and after this block" (MAF spec).
# @api public
class EmptySequence < Sequence
  # Status character from the 'e' line.
  attr_reader :status

  def initialize(*args)
    super(*args.first(5))
    @status = args[5]
  end

  # Empty sequences carry no alignment text.
  def text
    ''
  end

  def empty?
    true
  end

  def write_fasta(writer)
    raise "empty sequence output not implemented!"
  end
end
147
+
148
# Reads MAF files in chunks.
# @api private
class ChunkReader
  # Size, in bytes, of the chunks to read. Must be a power of 2.
  # @return [Integer]
  attr_accessor :chunk_size
  # Current position in the file.
  # @return [Integer]
  attr_accessor :pos
  # {File} from which chunks are read.
  # @return [File]
  attr_reader :f

  def initialize(f, chunk_size)
    @f = f
    self.chunk_size = chunk_size
    @pos = 0
  end

  # Set the chunk size after validating it.
  def chunk_size=(size)
    check_chunk_size(size)
    @chunk_size = size
    # power of 2 so don't worry about rounding
    # @chunk_shift = Math.log2(size).to_i
  end

  # Raise unless +size+ is a positive power of 2.
  def check_chunk_size(size)
    if size < 1
      raise "Invalid chunk size: #{size}"
    end
    ## test whether it is a power of 2
    ## cf. http://bit.ly/JExNc4
    if size & (size - 1) != 0
      # BUG FIX: message previously ended with a stray '}' from a
      # doubled brace in the interpolation.
      raise "Invalid chunk size (not a power of 2): #{size}"
    end
  end

  # Reads the next chunk of the file.
  # @return [String] Next {#chunk_size} bytes of MAF data, or nil at EOF.
  def read_chunk
    chunk = f.read(@chunk_size)
    @pos += chunk.bytesize if chunk
    return chunk
  end

  # Reads a chunk of the file at a given offset.
  #
  # Currently always reads size_hint bytes but this may change
  # with BGZF support.
  #
  # @param [Integer] offset file offset to read from.
  # @param [Integer] size_hint desired size of chunk.
  # @return [String] Chunk of MAF data.
  def read_chunk_at(offset, size_hint=@chunk_size)
    f.seek(offset)
    chunk = f.read(size_hint)
    @pos = offset + chunk.bytesize
    return chunk
  end
end
207
+
208
# Variant ChunkReader using a read-ahead thread with internal
# queue for sequential parsing. Not useful for random-access
# parsing.
#
# Only beneficial on JRuby.
class ThreadedChunkReader < ChunkReader

  # @param [File] f file to read from.
  # @param [Integer] chunk_size chunk size in bytes (power of 2).
  # @param [Integer] buffer_size maximum chunks the read-ahead
  #   thread may queue before blocking.
  def initialize(f, chunk_size, buffer_size=64)
    super(f, chunk_size)
    @buffer = SizedQueue.new(buffer_size)
    @eof_reached = false
    start_read_ahead
  end

  # Spawn a read-ahead thread. Called from {#initialize}.
  def start_read_ahead
    @read_thread = Thread.new { read_ahead }
  end

  # Read ahead into queue. Runs on the read-ahead thread; pushes
  # [offset, chunk] pairs so the consumer can track file position.
  def read_ahead
    # n = 0
    begin
      f_pos = 0
      until f.eof?
        chunk = f.read(@chunk_size)
        @buffer << [f_pos, chunk]
        f_pos += chunk.bytesize
        # n += 1
        # if (n % 100) == 0
        #   $stderr.puts "buffer size: #{@buffer.size}"
        # end
      end
      @eof_reached = true
    rescue Exception
      # NOTE(review): rescuing Exception appears deliberate here so
      # any failure on the reader thread is captured and re-raised by
      # #read_chunk instead of silently killing the thread.
      @read_ahead_ex = $!
      $stderr.puts "read_ahead aborting: #{$!}"
    end
  end

  # (see ChunkReader#read_chunk)
  def read_chunk
    raise "readahead failed: #{@read_ahead_ex}" if @read_ahead_ex
    if @eof_reached && @buffer.empty?
      return nil
    else
      # Blocks until the read-ahead thread supplies a chunk.
      c_pos, chunk = @buffer.shift()
      # NOTE(review): @pos is set to the START of the returned chunk
      # here, whereas ChunkReader#read_chunk leaves @pos at the chunk's
      # end — confirm callers expect this difference.
      @pos = c_pos
      return chunk
    end
  end

end
261
+
262
+ # MAF parsing code useful for sequential and random-access parsing.
263
+ module MAFParsing
264
+
265
# Zero-width match at the start of an alignment block: a line
# beginning with 'a'. Lookahead keeps the 'a' in the remainder.
BLOCK_START = /^(?=a)/
# As BLOCK_START, but also matches at end of string.
BLOCK_START_OR_EOS = /(?:^(?=a))|\z/
# End of line or end of input.
EOL_OR_EOF = /\n|\z/

# Record the offset, within the current chunk, of the last block
# start, so {#parse_block} can tell whether it is positioned in the
# chunk's trailing fragment.
def set_last_block_pos!
  @last_block_pos = s.string.rindex(BLOCK_START)
end

## On finding the start of a block:
## See whether we are at the last block in the chunk.
## If at the last block:
##   If at EOF: last block.
##   If not:
##     Read the next chunk
##     Find the start of the next block in that chunk
##     Concatenate the two block fragments
##     Parse the resulting block
##     Promote the next scanner, positioned

# Parse the block at the current position, joining fragments
# across chunk boundaries if necessary.
#
# @return [Block] alignment block, or nil once EOF has been reached.
# @api public
def parse_block
  return nil if at_end
  if s.pos != last_block_pos
    # in non-trailing block
    parse_block_data
  else
    # in trailing block fragment
    parse_trailing_fragment
  end
end
299
+
300
## Read chunks and accumulate a leading fragment until we
## encounter a block start or EOF.
#
# @return [Array(String, StringScanner, Integer)] the accumulated
#   leading fragment, a scanner over the chunk containing the next
#   block start (nil when EOF was hit first), and that chunk's
#   starting file offset.
def gather_leading_fragment
  leading_frag = ''
  while true
    next_chunk_start = cr.pos
    next_chunk = cr.read_chunk
    if next_chunk
      next_scanner = StringScanner.new(next_chunk)
      # If this trailing fragment ends with a newline, then an
      # 'a' at the beginning of the leading fragment is the
      # start of the next alignment block.
      if trailing_nl?(leading_frag) || trailing_nl?(s.string)
        pat = BLOCK_START
      else
        pat = /(?:\n(?=a))/
      end
      frag = next_scanner.scan_until(pat)
      if frag
        # got block start
        leading_frag << frag
        break
      else
        # no block start in this chunk; take it whole and continue
        leading_frag << next_chunk
      end
    else
      # EOF
      @at_end = true
      break
    end
  end
  return leading_frag, next_scanner, next_chunk_start
end

# Join the trailing fragment of the current chunk with the
# leading fragment of the next chunk and parse the resulting
# block. On return, @s and @chunk_start have been promoted to the
# next chunk so parsing can continue normally.
#
# @return [Block] the alignment block.
def parse_trailing_fragment
  leading_frag, next_scanner, next_chunk_start = gather_leading_fragment
  # join fragments and parse
  trailing_frag = s.rest
  joined_block = trailing_frag + leading_frag
  @chunk_start = chunk_start + s.pos
  @s = StringScanner.new(joined_block)
  begin
    block = parse_block_data
  rescue ParseError => pe
    parse_error "Could not parse joined fragments: #{pe}\nTRAILING: #{trailing_frag}\nLEADING: #{leading_frag}"
  end
  # Set up to parse the next block
  @s = next_scanner
  @chunk_start = next_chunk_start
  unless @at_end
    set_last_block_pos!
  end
  return block
end

# Raise a {ParseError}, indicating position within the MAF file
# and the chunk as well as the text surrounding the current
# scanner position.
#
# @param [String] msg the error message
# @raise [ParseError] always
def parse_error(msg)
  # Show up to 10 characters of context on either side of s.pos.
  s_start = [s.pos - 10, 0].max
  s_end = [s.pos + 10, s.string.length].min
  if s_start > 0
    left = s.string[s_start..(s.pos - 1)]
  else
    left = ''
  end
  right = s.string[s.pos..s_end]
  extra = "pos #{s.pos} [#{chunk_start + s.pos}], last #{last_block_pos}"

  raise ParseError, "#{msg} at: '#{left}>><<#{right}' (#{extra})"
end
380
+
381
# First bytes of the MAF line types dispatched on in
# {#parse_block_data}.
S = 's'.getbyte(0)
I = 'i'.getbyte(0)
E = 'e'.getbyte(0)
Q = 'q'.getbyte(0)
COMMENT = '#'.getbyte(0)

# Parse a {Block} from the current position. Requires that {#s}
# and {#chunk_start} be set correctly.
#
# @return [Block] the alignment block.
# @raise [ParseError] on malformed input.
def parse_block_data
  block_start_pos = s.pos
  block_offset = chunk_start + block_start_pos
  s.scan(/^a\s*/) || parse_error("bad a line")
  block_vars = parse_maf_vars()
  seqs = []
  # Everything up to the next block start (or end of string)
  # belongs to this block.
  payload = s.scan_until(/^(?=a)/)
  unless payload
    payload = s.rest
    s.pos = s.string.size # jump to EOS
  end
  lines = payload.split("\n")
  until lines.empty?
    line = lines.shift
    first = line.getbyte(0)
    if first == S
      seq = parse_seq_line(line, sequence_filter)
      seqs << seq if seq
    elsif first == E && parse_empty
      e_seq = parse_empty_line(line, sequence_filter)
      seqs << e_seq if e_seq
    elsif first == I && parse_extended
      # 'i' line: synteny data attached to the preceding 's' line.
      parts = line.split
      parse_error("wrong i source #{parts[1]}!") unless seqs.last.source == parts[1]
      seqs.last.i_data = parts.slice(2..6)
    elsif first == Q && parse_extended
      # 'q' line: quality data attached to the preceding 's' line.
      _, src, quality = line.split
      parse_error("wrong q source #{src}!") unless seqs.last.source == src
      seqs.last.quality = quality
    elsif [I, E, Q, COMMENT, nil].include? first
      # recognized-but-disabled line types, comments, blank lines
      next
    else
      parse_error "unexpected line: '#{line}'"
    end
  end
  return Block.new(block_vars,
                   seqs,
                   block_offset,
                   s.pos - block_start_pos)
end
431
+
432
# Parse an 's' line.
#
# @param [String] line the raw 's' line.
# @param [Hash,nil] filter sequence filter (see {#seq_filter_ok?}).
# @return [Sequence] or nil when filtered out.
def parse_seq_line(line, filter)
  _, src, start, size, strand, src_size, text = line.split
  return nil if filter && ! seq_filter_ok?(src, filter)
  begin
    Sequence.new(src,
                 start.to_i,
                 size.to_i,
                 STRAND_SYM.fetch(strand),
                 src_size.to_i,
                 text)
  rescue KeyError
    # STRAND_SYM.fetch raised: strand field was not '+' or '-'.
    parse_error "invalid sequence line: #{line}"
  end
end

# Parse an 'e' line.
#
# @param [String] line the raw 'e' line.
# @param [Hash,nil] filter sequence filter (see {#seq_filter_ok?}).
# @return [EmptySequence] or nil when filtered out.
def parse_empty_line(line, filter)
  _, src, start, size, strand, src_size, status = line.split
  return nil if filter && ! seq_filter_ok?(src, filter)
  begin
    EmptySequence.new(src,
                      start.to_i,
                      size.to_i,
                      STRAND_SYM.fetch(strand),
                      src_size.to_i,
                      status)
  rescue KeyError
    # STRAND_SYM.fetch raised: strand field was not '+' or '-'.
    parse_error "invalid empty sequence line: #{line}"
  end
end
465
+
466
# Indicates whether the given sequence source should be parsed,
# given the current sequence filters.
#
# With an :only_species filter, returns the matching species entry
# (truthy) or nil; without one, returns true.
def seq_filter_ok?(src, filter)
  wanted = filter[:only_species]
  return true unless wanted
  species = src.split('.', 2).first
  wanted.find { |sp| sp == species }
end
477
+
478
# Parse key-value pairs from the MAF header or an 'a' line.
#
# Consumes `name=value` tokens from the scanner until the pattern
# stops matching; keys are symbolized.
# @return [Hash]
def parse_maf_vars
  vars = {}
  while s.scan(/(\w+)=(\S*)\s+/) do
    vars[s[1].to_sym] = s[2]
  end
  vars
end
487
+
488
# Does `string` have a trailing newline?
#
# BUG FIX: the original ignored its argument and inspected the
# scanner's backing string (`s.string`) instead, so the result never
# depended on the string actually passed in — which breaks the
# block-start heuristic in #gather_leading_fragment.
def trailing_nl?(string)
  return false if string.empty?
  string[string.size - 1] == "\n"
end
496
+
497
# Maps the MAF strand field to a symbol; accessed via #fetch so an
# unknown strand raises KeyError. Frozen: constants holding mutable
# collections should not be modifiable.
STRAND_SYM = {
  '+' => :+,
  '-' => :-
}.freeze
501
+ end
502
+
503
# A MAF parsing context, used for random-access parsing.
class ParseContext
  include MAFParsing
  attr_accessor :f, :s, :cr, :parser
  attr_accessor :chunk_start, :last_block_pos, :at_end

  # @param [IO] fd open file handle to read from.
  # @param [Integer] chunk_size chunk size in bytes.
  # @param [Parser] parser owning parser; supplies filter options.
  # @param [Hash] opts parser options (:chunk_reader is honored here).
  def initialize(fd, chunk_size, parser, opts)
    @f = fd
    @parser = parser
    reader = opts[:chunk_reader] || ChunkReader
    @cr = reader.new(@f, chunk_size)
    @last_block_pos = -1
    # @chunk_start stays nil until the first read;
    # #start_chunk_read_if_needed guards on that.
  end

  # Filter/option accessors delegate to the owning parser so the
  # MAFParsing methods see consistent settings.
  def sequence_filter
    parser.sequence_filter
  end

  def parse_empty
    parser.parse_empty
  end

  def parse_extended
    parser.parse_extended
  end

  def set_last_block_pos!
    @last_block_pos = s.string.rindex(BLOCK_START)
  end

  # Fetch and parse blocks at given `offset` and `len`.
  # @param [Integer] offset Offset to start parsing at.
  # @param [Integer] len Number of bytes to read.
  # @param [Array] block_offsets Offsets of blocks to parse.
  # @return [Enumerator<Block>]
  def fetch_blocks(offset, len, block_offsets)
    start_chunk_read_if_needed(offset, len)
    # read chunks until we have the entire merged set of
    # blocks ready to parse, to avoid fragment joining
    append_chunks_to(len)
    # parse the blocks
    Enumerator.new do |y|
      block_offsets.each do |expected_offset|
        block = parse_block
        # BUG FIX: these checks formerly called `ctx.parse_error`, but
        # no local `ctx` exists in this scope — it raised NameError and
        # masked the real failure. This object IS the context, so call
        # parse_error directly.
        parse_error("expected a block at offset #{expected_offset} but could not parse one!") unless block
        parse_error("got block with offset #{block.offset}, expected #{expected_offset}!") unless block.offset == expected_offset
        y << block
      end
    end
  end

  # Position the scanner at `offset`, reading a fresh chunk unless
  # `offset` already falls within the current one.
  def start_chunk_read_if_needed(offset, len)
    if chunk_start \
      && (chunk_start <= offset) \
      && (offset < (chunk_start + s.string.size))
      ## the selected offset is in the current chunk
      s.pos = offset - chunk_start
    else
      chunk = cr.read_chunk_at(offset, len)
      @chunk_start = offset
      @s = StringScanner.new(chunk)
    end
  end

  # Append chunks to the scanner's buffer until it holds at least
  # `len` bytes.
  def append_chunks_to(len)
    # XXX: need to rethink this for BGZF; prefetching ChunkReader
    while s.string.size < len
      s.string << cr.read_chunk()
    end
  end

end
576
+
577
+ # MAF parser, used for sequential and random-access parsing.
578
+ #
579
+ # Options:
580
+ #
581
+ # * `:parse_extended`: whether to parse 'i' and 'q' lines
582
+ # * `:parse_empty`: whether to parse 'e' lines
583
+ # * `:chunk_size`: read MAF file in chunks of this many bytes
584
+ # * `:random_chunk_size`: as above, but for random access ({#fetch_blocks})
585
+ # * `:merge_max`: merge up to this many bytes of blocks for
586
+ # random access
587
+ # * `:chunk_reader`: use the specified class to read
588
+ # chunks. (Only useful with {ThreadedChunkReader}).
589
+ # * `:threads`: number of threads to use for parallel
590
+ # parsing. Only useful under JRuby.
591
+ # @api public
592
+
593
+ class Parser
594
+ include MAFParsing
595
+
596
# @return [Header] header of the MAF file being parsed.
attr_reader :header
# @return [String] path of MAF file being parsed.
attr_reader :file_spec
# @return [File] file handle for MAF file.
attr_reader :f
# @return [StringScanner] scanner for parsing.
attr_reader :s
# @return [ChunkReader] ChunkReader.
attr_reader :cr
# @return [Boolean] whether EOF has been reached.
attr_reader :at_end
# @return [Hash] parser options.
attr_reader :opts
# @return [Integer] starting offset of the current chunk.
attr_reader :chunk_start
# @return [Integer] offset of the last block start in this chunk.
attr_reader :last_block_pos
# Sequence filter to apply.
# @api public
attr_accessor :sequence_filter

# @api private
attr_accessor :parse_extended
attr_accessor :parse_empty

# Default chunk size for sequential parsing (bytes, power of 2).
SEQ_CHUNK_SIZE = 131072
# Default chunk size for random access (bytes, power of 2).
RANDOM_CHUNK_SIZE = 4096
# Default cap on merged fetch-list runs (bytes).
MERGE_MAX = SEQ_CHUNK_SIZE

# Create a new parser instance.
#
# Opens `file_spec`, reads the first chunk, and parses the MAF
# header, leaving the parser positioned at the first alignment
# block.
#
# @param [String] file_spec path of file to parse.
# @param [Hash] opts parser options.
# @api public
def initialize(file_spec, opts={})
  @opts = opts
  chunk_size = opts[:chunk_size] || SEQ_CHUNK_SIZE
  @random_access_chunk_size = opts[:random_chunk_size] || RANDOM_CHUNK_SIZE
  @merge_max = opts[:merge_max] || MERGE_MAX
  @parse_extended = opts[:parse_extended] || false
  @parse_empty = opts[:parse_empty] || false
  @chunk_start = 0
  @file_spec = file_spec
  @f = File.open(file_spec)
  reader = opts[:chunk_reader] || ChunkReader
  @cr = reader.new(@f, chunk_size)
  @s = StringScanner.new(cr.read_chunk())
  set_last_block_pos!
  @at_end = false
  _parse_header()
end
648
+
649
# Create a {ParseContext} for random access, using the given
# chunk size.
#
# Opens a separate file handle so random access does not disturb
# this parser's sequential position.
#
# @return [ParseContext]
# @api private
def context(chunk_size)
  # IO#dup calls dup(2) internally, but seems broken on JRuby...
  fd = File.open(file_spec)
  ParseContext.new(fd, chunk_size, self, @opts)
end

# Execute the given block with a {ParseContext} using the given
# `chunk_size` as an argument.
#
# The context's file handle is closed when the block returns, even
# on error.
#
# @see #context
# @api private
def with_context(chunk_size)
  ctx = context(chunk_size)
  begin
    yield ctx
  ensure
    ctx.f.close
  end
end
673
+
674
# Fetch and parse blocks given by `fetch_list`.
#
# `fetch_list` should be an array of `[offset, length]` tuples.
#
# @param [Array] fetch_list the fetch list
# @return [Array<Block>] the requested alignment blocks
def fetch_blocks(fetch_list)
  merged = merge_fetch_list(fetch_list)
  # Parallel fetching relies on java.util.concurrent: JRuby only.
  if RUBY_PLATFORM == 'java' && @opts.fetch(:threads, 1) > 1
    fetch_blocks_merged_parallel(merged)
  else
    fetch_blocks_merged(merged)
  end
end

# Fetch and parse the blocks given by the merged fetch list.
#
# Throughput is reported on standard error once the enumerator is
# drained.
#
# @param [Array] fetch_list merged fetch list from {#merge_fetch_list}.
# @return [Array<Block>] the requested alignment blocks
def fetch_blocks_merged(fetch_list)
  Enumerator.new do |y|
    start = Time.now
    total_size = fetch_list.collect { |e| e[1] }.reduce(:+)
    with_context(@random_access_chunk_size) do |ctx|
      fetch_list.each do |e|
        ctx.fetch_blocks(*e).each do |block|
          y << block
          #total_size += block.size
        end
      end
    end
    elapsed = Time.now - start
    rate = (total_size / 1048576.0) / elapsed
    $stderr.printf("Fetched blocks in %.3fs, %.1f MB/s.\n",
                   elapsed, rate)
  end
end
711
+
712
# Fetch and parse the blocks given by the merged fetch list, in
# parallel. Uses the number of threads specified by the
# `:threads` parser option. Requires JRuby (java.util.concurrent).
#
# @param [Array] fetch_list merged fetch list from {#merge_fetch_list}.
# @return [Array<Block>] the requested alignment blocks
def fetch_blocks_merged_parallel(fetch_list)
  Enumerator.new do |y|
    total_size = fetch_list.collect { |e| e[1] }.reduce(:+)
    start = Time.now
    n_threads = @opts.fetch(:threads, 1)
    # TODO: break entries up into longer runs for more
    # sequential I/O
    jobs = java.util.concurrent.ConcurrentLinkedQueue.new(fetch_list)
    completed = java.util.concurrent.LinkedBlockingQueue.new(128)
    threads = []
    n_threads.times { threads << make_worker(jobs, completed) }

    n_completed = 0
    while (n_completed < fetch_list.size)
      # Bounded poll so a stalled/dead worker pool is detected.
      c = completed.poll(5, java.util.concurrent.TimeUnit::SECONDS)
      if c.nil?
        if threads.find { |t| t.alive? }
          next
        else
          raise "No threads alive, completed #{n_completed}/#{fetch_list.size} jobs!"
        end
      end
      # Workers enqueue the Exception object itself on failure.
      raise "worker failed: #{c}" if c.is_a? Exception
      c.each do |block|
        y << block
      end
      n_completed += 1
    end
    threads.each { |t| t.join }
    elapsed = Time.now - start
    $stderr.printf("Fetched blocks from %d threads in %.1fs.\n",
                   n_threads,
                   elapsed)
    mb = total_size / 1048576.0
    $stderr.printf("%.3f MB processed (%.1f MB/s).\n",
                   mb,
                   mb / elapsed)
  end
end
757
+
758
# Create a worker thread for parallel parsing.
#
# Pulls `[offset, len, block_offsets]` jobs from `jobs` until the
# queue is drained, pushing each job's parsed blocks (or the raised
# Exception) onto `completed`.
#
# @see #fetch_blocks_merged_parallel
def make_worker(jobs, completed)
  Thread.new do
    with_context(@random_access_chunk_size) do |ctx|
      while true
        req = jobs.poll
        break unless req
        begin
          n_blocks = req[2].size
          blocks = ctx.fetch_blocks(*req).to_a
          if blocks.size != n_blocks
            # BUG FIX: this message interpolated `e`, which is not in
            # scope until the rescue clause below binds it — raising
            # here produced a NameError instead of the intended
            # message. Report the failing request instead.
            raise "expected #{n_blocks}, got #{blocks.size}: #{req.inspect}"
          end
          completed.put(blocks)
        rescue Exception => e
          # Deliberately broad: the error is recorded for the consumer
          # and re-raised, not swallowed.
          completed.put(e)
          $stderr.puts "Worker failing: #{e.class}: #{e}"
          $stderr.puts e.backtrace.join("\n")
          raise e
        end
      end
    end
  end
end
784
+
785
# Merge contiguous blocks in the given fetch list, up to
# `:merge_max` bytes per merged run.
#
# Returns `[offset, size, [offset1, offset2, ...]]` tuples.
def merge_fetch_list(orig_fl)
  pending = orig_fl.dup
  merged = []
  while (cur = pending.shift)
    prev = merged.last
    if prev \
      && (prev[0] + prev[1]) == cur[0] \
      && (prev[1] + cur[1]) <= @merge_max
      # contiguous with the previous run: extend it in place
      prev[1] += cur[1]
      prev[2] << cur[0]
    else
      # start a new run seeded with this entry's offset
      cur << [cur[0]]
      merged << cur
    end
  end
  merged
end
808
+
809
# Parse the header of the MAF file.
#
# Consumes the ##maf line's variables and any subsequent '#' comment
# lines (accumulated, space-joined, as the alignment parameters),
# then positions the scanner at the first alignment block.
#
# @raise [ParseError] if the input is not a MAF file or contains no
#   block start.
def _parse_header
  parse_error("not a MAF file") unless s.scan(/##maf\s*/)
  vars = parse_maf_vars()
  align_params = nil
  while s.scan(/^#\s*(.+?)\n/)
    if align_params == nil
      align_params = s[1]
    else
      align_params << ' ' << s[1]
    end
  end
  @header = Header.new(vars, align_params)
  # BUG FIX: this was `s.skip_until BLOCK_START || parse_error(...)`,
  # which parses as `s.skip_until(BLOCK_START || ...)` — BLOCK_START
  # is always truthy, so the error branch could never fire.
  # Parenthesize so a missing block start is actually reported.
  s.skip_until(BLOCK_START) || parse_error("Cannot find block start!")
end
824
+
825
# Parse all alignment blocks until EOF.
#
# Delegates to {#parse_blocks_parallel} if `:threads` is set
# under JRuby.
#
# @return [Enumerator<Block>] enumerator of alignment blocks.
# @api public
def parse_blocks
  if RUBY_PLATFORM == 'java' && @opts.has_key?(:threads)
    parse_blocks_parallel
  else
    Enumerator.new do |y|
      until at_end
        y << parse_block()
      end
    end
  end
end

# Parse alignment blocks with a worker thread.
#
# The worker parses blocks into a bounded queue and signals
# completion with :eof; the returned enumerator drains the queue.
#
# @return [Enumerator<Block>] enumerator of alignment blocks.
# @api private
def parse_blocks_parallel
  queue = java.util.concurrent.LinkedBlockingQueue.new(128)
  worker = Thread.new do
    begin
      until at_end
        queue.put(parse_block())
      end
      queue.put(:eof)
    rescue
      $stderr.puts "worker exiting: #{$!.class}: #{$!}"
      $stderr.puts $!.backtrace.join("\n")
    end
  end
  Enumerator.new do |y|
    saw_eof = false
    while worker.alive?
      # Time-limited poll so a dead worker is noticed promptly.
      block = queue.poll(1, java.util.concurrent.TimeUnit::SECONDS)
      if block == :eof
        saw_eof = true
        break
      elsif block
        y << block
      end
    end
    unless saw_eof
      raise "worker exited unexpectedly!"
    end
  end
end
877
+
878
# Yield every remaining alignment block in turn, stopping at EOF.
def each_block
  yield parse_block() until at_end
end
883
+
884
+ end
885
+
886
+ end
887
+
888
+ end