bio-maf 0.1.0-java → 0.2.0-java

@@ -9,142 +9,6 @@ module Bio
  # @api public
  class ParseError < Exception; end

- # A MAF header, containing the variable-value pairs from the first
- # line of the file as well as the alignment parameters.
- # @api public
- class Header
- # Variable-value pairs from the ##maf line
- # @return [Hash]
- attr_accessor :vars
- # Alignment parameters from the MAF header.
- # @return [Hash]
- attr_accessor :alignment_params
-
- def initialize(vars, params)
- @vars = vars
- @alignment_params = params
- end
-
- # The required version parameter.
- # @return [String]
- def version
- vars[:version]
- end
-
- # The optional scoring parameter, if present.
- # @return [String]
- def scoring
- vars[:scoring]
- end
-
- end
-
- # A MAF alignment block.
- # @api public
- class Block
- # Parameters from the 'a' line starting the alignment block.
- attr_reader :vars
- # Sequences, one per 's' or 'e' line.
- # @return [Array<Sequence>]
- attr_reader :sequences
- # Offset of the alignment block within the MAF file, in bytes.
- # @return [Integer]
- attr_reader :offset
- # Size of the alignment block within the MAF file, in bytes.
- # @return [Integer]
- attr_reader :size
-
- def initialize(*args)
- @vars, @sequences, @offset, @size = args
- end
-
- def raw_seq(i)
- sequences.fetch(i)
- end
-
- def each_raw_seq
- sequences.each { |s| yield s }
- end
-
- # Text size of the alignment block. This is the number of text
- # characters in each line of sequence data, including dashes and
- # other gaps in the sequence.
- def text_size
- sequences.first.text.size
- end
-
- end
-
- # A sequence within an alignment block.
- # @api public
- class Sequence
- # @return [String] Source sequence name.
- attr_reader :source
- # @return [Integer] Zero-based start position.
- attr_reader :start
- # @return [Integer] Size of aligning region in source sequence.
- attr_reader :size
- # :+ or :-, indicating which strand the alignment is to.
- # @return [Symbol]
- attr_reader :strand
- # Size of the entire source sequence, not just the aligning
- # region.
- # @return [Integer]
- attr_reader :src_size
- # Sequence data for the alignment, including insertions.
- # @return [String]
- attr_reader :text
- # Array of raw synteny information from 'i' line.
- # @return [Array<String>]
- attr_accessor :i_data
- # Quality string from 'q' line.
- # @return [String]
- attr_accessor :quality
- alias_method :source_size, :src_size
-
- def initialize(*args)
- @source, @start, @size, @strand, @src_size, @text = args
- end
-
- # Whether this sequence is empty. Only true for {EmptySequence}
- # instances from 'e' lines.
- def empty?
- false
- end
-
- def write_fasta(writer)
- writer.write("#{source}:#{start}-#{start + size}",
- text)
- end
- end
-
- # An empty sequence record from an 'e' line.
- #
- # This indicates that "there isn't aligning DNA for a species but
- # that the current block is bridged by a chain that connects
- # blocks before and after this block" (MAF spec).
- # @api public
- class EmptySequence < Sequence
- attr_reader :status
-
- def initialize(*args)
- super(*args[0..4])
- @status = args[5]
- end
-
- def text
- ''
- end
-
- def empty?
- true
- end
-
- def write_fasta(writer)
- raise "empty sequence output not implemented!"
- end
- end
-
  # Reads MAF files in chunks.
  # @api private
  class ChunkReader
@@ -399,16 +263,25 @@ module Bio
  payload = s.rest
  s.pos = s.string.size # jump to EOS
  end
+ filtered = false
  lines = payload.split("\n")
  until lines.empty?
  line = lines.shift
  first = line.getbyte(0)
  if first == S
  seq = parse_seq_line(line, sequence_filter)
- seqs << seq if seq
+ if seq
+ seqs << seq
+ else
+ filtered = true
+ end
  elsif first == E && parse_empty
  e_seq = parse_empty_line(line, sequence_filter)
- seqs << e_seq if e_seq
+ if e_seq
+ seqs << e_seq
+ else
+ filtered = true
+ end
  elsif first == I && parse_extended
  parts = line.split
  parse_error("wrong i source #{parts[1]}!") unless seqs.last.source == parts[1]
@@ -423,10 +296,19 @@ module Bio
  parse_error "unexpected line: '#{line}'"
  end
  end
- return Block.new(block_vars,
- seqs,
- block_offset,
- s.pos - block_start_pos)
+ block = Block.new(block_vars,
+ seqs,
+ block_offset,
+ s.pos - block_start_pos,
+ filtered)
+ postprocess_block(block)
+ end
+
+ def postprocess_block(block)
+ if block.filtered? && opts[:remove_gaps]
+ block.remove_gaps!
+ end
+ block
  end

  # Parse an 's' line.
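For intuition, here is a minimal plain-Ruby sketch (not the gem's Block#remove_gaps! implementation) of the effect postprocess_block aims for: once a sequence has been filtered out of a block, columns that are now gaps in every remaining sequence are dropped.

    # Illustrative only -- a plain-string sketch of "remove gaps left after
    # filtering", NOT the gem's Block#remove_gaps!.
    def strip_all_gap_columns(rows)
      keep = (0...rows.first.size).reject { |i| rows.all? { |r| r[i] == '-' } }
      rows.map { |r| keep.map { |i| r[i] }.join }
    end

    rows = ['AC--GT', 'AG--GT']   # '--' columns left by a filtered-out sequence
    strip_all_gap_columns(rows)   # => ["ACGT", "AGGT"]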
@@ -503,12 +385,13 @@ module Bio
  # A MAF parsing context, used for random-access parsing.
  class ParseContext
  include MAFParsing
- attr_accessor :f, :s, :cr, :parser
+ attr_accessor :f, :s, :cr, :parser, :opts
  attr_accessor :chunk_start, :last_block_pos, :at_end

- def initialize(fd, chunk_size, parser, opts)
+ def initialize(fd, chunk_size, parser)
  @f = fd
  @parser = parser
+ @opts = parser.opts
  reader = opts[:chunk_reader] || ChunkReader
  @cr = reader.new(@f, chunk_size)
  @last_block_pos = -1
@@ -580,6 +463,7 @@ module Bio
  #
  # * `:parse_extended`: whether to parse 'i' and 'q' lines
  # * `:parse_empty`: whether to parse 'e' lines
+ # * `:remove_gaps`: remove gaps left after filtering sequences
  # * `:chunk_size`: read MAF file in chunks of this many bytes
  # * `:random_chunk_size`: as above, but for random access ({#fetch_blocks})
  # * `:merge_max`: merge up to this many bytes of blocks for
@@ -611,9 +495,6 @@ module Bio
  attr_reader :chunk_start
  # @return [Integer] offset of the last block start in this chunk.
  attr_reader :last_block_pos
- # Sequence filter to apply.
- # @api public
- attr_accessor :sequence_filter

  # @api private
  attr_accessor :parse_extended
@@ -630,6 +511,9 @@ module Bio
  # @api public
  def initialize(file_spec, opts={})
  @opts = opts
+ if RUBY_PLATFORM == 'java'
+ opts[:threads] ||= java.lang.Runtime.runtime.availableProcessors
+ end
  chunk_size = opts[:chunk_size] || SEQ_CHUNK_SIZE
  @random_access_chunk_size = opts[:random_chunk_size] || RANDOM_CHUNK_SIZE
  @merge_max = opts[:merge_max] || MERGE_MAX
@@ -654,7 +538,7 @@ module Bio
  def context(chunk_size)
  # IO#dup calls dup(2) internally, but seems broken on JRuby...
  fd = File.open(file_spec)
- ParseContext.new(fd, chunk_size, self, @opts)
+ ParseContext.new(fd, chunk_size, self)
  end

  # Execute the given block with a {ParseContext} using the given
@@ -671,6 +555,20 @@ module Bio
  end
  end

+ # Sequence filter to apply.
+ # @api public
+ # @return [Hash]
+ def sequence_filter
+ @sequence_filter ||= {}
+ end
+
+ # Set the sequence filter.
+ # @api public
+ # @param [Hash] filter the new filter
+ def sequence_filter=(filter)
+ @sequence_filter = filter
+ end
+
  # Fetch and parse blocks given by `fetch_list`.
  #
  # `fetch_list` should be an array of `[offset, length]` tuples.
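The options documented earlier and the sequence_filter accessors added above can be exercised together; a hedged sketch follows (the MAF path and species names are placeholders, :threads only matters under JRuby, and each_block is assumed to be the parser's block iterator in 0.2.x):

    require 'bio-maf'

    parser = Bio::MAF::Parser.new('chr22.maf',
                                  :parse_extended => true,
                                  :parse_empty => true,
                                  :remove_gaps => true,
                                  :threads => 4)
    parser.sequence_filter = { :only_species => %w(hg19 panTro2 mm9) }

    parser.each_block do |block|
      # Non-matching species are dropped at parse time; with :remove_gaps,
      # columns left entirely gapped by that filtering are removed as well.
      puts block.sequences.map(&:source).join(', ')
    end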
@@ -723,25 +621,22 @@ module Bio
  # TODO: break entries up into longer runs for more
  # sequential I/O
  jobs = java.util.concurrent.ConcurrentLinkedQueue.new(fetch_list)
- completed = java.util.concurrent.LinkedBlockingQueue.new(128)
+ ct = CompletionTracker.new(fetch_list)
+ completed = ct.queue
  threads = []
- n_threads.times { threads << make_worker(jobs, completed) }
-
- n_completed = 0
- while (n_completed < fetch_list.size)
- c = completed.poll(5, java.util.concurrent.TimeUnit::SECONDS)
- if c.nil?
- if threads.find { |t| t.alive? }
- next
- else
- raise "No threads alive, completed #{n_completed}/#{fetch_list.size} jobs!"
- end
+ n_threads.times { threads << make_worker(jobs, ct) }
+
+ n_res = 0
+ while n_res < fetch_list.size
+ c = completed.poll(1, java.util.concurrent.TimeUnit::SECONDS)
+ unless c
+ raise "Worker failed!" if threads.find { |t| t.status.nil? }
+ next
  end
- raise "worker failed: #{c}" if c.is_a? Exception
  c.each do |block|
  y << block
  end
- n_completed += 1
+ n_res += 1
  end
  threads.each { |t| t.join }
  elapsed = Time.now - start
@@ -758,26 +653,25 @@ module Bio
  # Create a worker thread for parallel parsing.
  #
  # @see #fetch_blocks_merged_parallel
- def make_worker(jobs, completed)
+ def make_worker(jobs, ct)
  Thread.new do
- with_context(@random_access_chunk_size) do |ctx|
- while true
- req = jobs.poll
- break unless req
- begin
+ begin
+ with_context(@random_access_chunk_size) do |ctx|
+ while true
+ req = jobs.poll
+ break unless req
  n_blocks = req[2].size
  blocks = ctx.fetch_blocks(*req).to_a
  if blocks.size != n_blocks
  raise "expected #{n_blocks}, got #{blocks.size}: #{e.inspect}"
  end
- completed.put(blocks)
- rescue Exception => e
- completed.put(e)
- $stderr.puts "Worker failing: #{e.class}: #{e}"
- $stderr.puts e.backtrace.join("\n")
- raise e
+ ct << blocks
  end
  end
+ rescue Exception => e
+ $stderr.puts "Worker failing: #{e.class}: #{e}"
+ $stderr.puts e.backtrace.join("\n")
+ raise e
  end
  end
  end
@@ -860,14 +754,19 @@ module Bio
  end
  Enumerator.new do |y|
  saw_eof = false
- while worker.alive?
+ n_final_poll = 0
+ while true
  block = queue.poll(1, java.util.concurrent.TimeUnit::SECONDS)
  if block == :eof
  saw_eof = true
  break
  elsif block
  y << block
+ else
+ # timed out
+ n_final_poll += 1 unless worker.alive?
  end
+ break if n_final_poll > 1
  end
  unless saw_eof
  raise "worker exited unexpectedly!"
@@ -883,6 +782,47 @@ module Bio

  end

+ class CompletionTracker
+ attr_reader :queue, :offsets, :delayed
+
+ def initialize(fetch_list)
+ @offsets = fetch_list.collect { |e| e[0] }
+ @queue = java.util.concurrent.LinkedBlockingQueue.new(128)
+ @delayed = {}
+ @sem = Mutex.new
+ end
+
+ def next_expected
+ offsets.first
+ end
+
+ def <<(blocks)
+ @sem.synchronize do
+ f_offset = blocks.first.offset
+ if f_offset == next_expected
+ offsets.shift
+ queue.put(blocks)
+ drain_delayed
+ else
+ # out of order
+ delayed[f_offset] = blocks
+ end
+ end
+ end
+
+ def drain_delayed
+ while e = delayed.delete(next_expected)
+ offsets.shift
+ queue.put(e)
+ end
+ end
+ end
+
+ # Exposes parser internals for unit tests.
+ class DummyParser
+ include MAFParsing
+ end
+
  end

  end
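The CompletionTracker added above re-serializes results that workers finish out of order: a result whose first block offset is not the next expected offset is parked in delayed until the gap is filled, so consumers read blocks in fetch-list order. A hedged sketch of that behavior, assuming the class is reachable as Bio::MAF::CompletionTracker (JRuby only, since it wraps java.util.concurrent):

    # Struct stands in for parsed blocks; only #offset is consulted.
    Blk = Struct.new(:offset)

    fetch_list = [[0, 100], [100, 80], [180, 120]]
    ct = Bio::MAF::CompletionTracker.new(fetch_list)

    ct << [Blk.new(100)]   # finishes early: parked in ct.delayed
    ct << [Blk.new(0)]     # next expected: queued, then offset 100 is drained
    ct << [Blk.new(180)]

    3.times { puts ct.queue.take.first.offset }   # => 0, 100, 180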
@@ -0,0 +1,167 @@
+ require 'zlib'
+
+ module Bio::MAF
+
+ # Tiles a given genomic interval.
+ # Inspired by: lib/bx/align/tools/tile.py in bx-python
+
+ class Tiler
+
+ attr_accessor :index
+ attr_accessor :parser
+ attr_accessor :reference
+ # GenomicInterval
+ attr_accessor :interval
+ attr_accessor :species
+ attr_accessor :species_map
+
+ def initialize
+ @species_map = {}
+ end
+
+ def ref_data(range)
+ if reference
+ if reference.respond_to? :read_interval
+ reference.read_interval(range.begin, range.end)
+ elsif reference.is_a? String
+ reference.slice(range)
+ else
+ raise "Unhandled reference data source: #{reference}"
+ end
+ else
+ nil
+ end
+ end
+
+ def tile
+ parser.sequence_filter[:only_species] = @species
+ # TODO: remove gaps
+ blocks = index.find([interval], parser).sort_by { |b| b.vars[:score] }
+ mask = Array.new(interval.length, :ref)
+ i_start = interval.zero_start
+ i_end = interval.zero_end
+ if reference
+ ref_region = ref_data(i_start...i_end)
+ end
+ blocks.each do |block|
+ ref = block.ref_seq
+ slice_start = [i_start, ref.start].max
+ slice_end = [i_end, ref.end].min
+ mask.fill(block,
+ (slice_start - i_start)...(slice_end - i_start))
+ end
+ text = []
+ species.each { |s| text << '' }
+ nonref_text = text[1...text.size]
+ runs(mask) do |range, block|
+ g_range = (range.begin + i_start)...(range.end + i_start)
+ if block == :ref
+ # not covered by an alignment block
+ # use the reference sequence if given, otherwise 'N'
+ range_size = range.end - range.begin
+ text[0] << if ref_region
+ ref_region.slice(range)
+ else
+ 'N' * range_size
+ end
+ stars = '*' * range_size
+ nonref_text.each { |t| t << stars }
+ else
+ # covered by an alignment block
+ t_range = block.ref_seq.text_range(g_range)
+ species.each_with_index do |species, i|
+ sp_text = text[i]
+ seq = block.sequences.find { |s| s.source == species || s.species == species }
+ if seq
+ # got alignment text
+ sp_text << seq.text.slice(t_range)
+ else
+ # no alignment for this one here, use '*'
+ sp_text << '*' * (t_range.end - t_range.begin)
+ end
+ end
+ end
+ end
+ text
+ end
+
+ def write_fasta(f)
+ species.zip(tile()) do |species, text|
+ sp_out = species_map[species] || species
+ f.puts ">#{sp_out}"
+ f.puts text
+ end
+ end
+
+ def runs(mask)
+ cur = nil
+ cur_start = nil
+ mask.each_with_index do |obj, i|
+ if ! cur.equal?(obj)
+ yield(cur_start...i, cur) if cur
+ cur = obj
+ cur_start = i
+ end
+ end
+ yield(cur_start...mask.size, cur)
+ end
+
+ end
+
+ class FASTARangeReader
+ attr_reader :f, :pos
+
+ def initialize(fspec)
+ if fspec.respond_to? :seek
+ @f = fspec
+ else
+ reader_class = if fspec =~ /.gz$/
+ Zlib::GzipReader
+ else
+ File
+ end
+ @f = reader_class.open(fspec)
+ end
+ position_at_start
+ end
+
+ GT = '>'.getbyte(0)
+
+ def position_at_start
+ first = f.readline
+ raise "expected FASTA comment" unless first =~ /^>/
+ @pos = 0
+ end
+
+ def read_interval(z_start, z_end)
+ if z_start < pos
+ position_at_start
+ end
+ data = ''
+ region_size = z_end - z_start
+ in_region = false
+ f.each_line do |line_raw|
+ if line_raw.getbyte(0) == GT
+ raise "unexpected description line: #{line_raw.inspect}"
+ end
+ line = line_raw.strip
+ end_pos = pos + line.size
+ if (! in_region) && pos <= z_start && z_start < end_pos
+ data << line.slice((z_start - pos)...(line.size))
+ in_region = true
+ elsif in_region
+ need = region_size - data.size
+ if need > line.size
+ data << line
+ else
+ # last line
+ data << line.slice(0, need)
+ break
+ end
+ end
+ @pos = end_pos
+ end
+ return data
+ end
+ end
+ end
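A hedged end-to-end sketch of driving the new Tiler. The MAF, index, and FASTA paths are placeholders; the index class and its open call are assumptions (bio-maf builds MAF indexes separately), and Interval stands in for a zero-based GenomicInterval-like object, since Tiler only consults length, zero_start, and zero_end.

    require 'bio-maf'

    # Stand-in for a zero-based GenomicInterval-like object.
    Interval = Struct.new(:zero_start, :zero_end) do
      def length; zero_end - zero_start; end
    end

    parser = Bio::MAF::Parser.new('chr22.maf')
    index  = Bio::MAF::KyotoIndex.open('chr22.kct')   # assumed pre-built index

    tiler = Bio::MAF::Tiler.new
    tiler.parser      = parser
    tiler.index       = index
    tiler.reference   = Bio::MAF::FASTARangeReader.new('chr22.fa')
    tiler.interval    = Interval.new(42_000_000, 42_001_000)
    tiler.species     = %w(hg19 panTro2 mm9)
    tiler.species_map = { 'hg19' => 'human' }

    # Writes one FASTA record per species, with '*' where a species has no
    # alignment and reference bases (or 'N') where no block covers the interval.
    File.open('tiled.fa', 'w') { |f| tiler.write_fasta(f) }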