bio-maf 1.0.0-java → 1.0.1-java

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,168 @@
1
+ require 'set'
2
+ require 'java' if RUBY_PLATFORM == 'java'
3
+
4
+ module Bio::MAF
5
+
6
# Factory for parallel job runners.
module JobRunner
  # Build the runner implementation that matches the current platform.
  #
  # @param n_parallel [Integer] passed straight to the runner's constructor
  # @return [JThreadRunner, ForkRunner] a JThreadRunner when
  #   RUBY_PLATFORM is 'java', otherwise a ForkRunner
  def self.create(n_parallel)
    runner_class = RUBY_PLATFORM == 'java' ? JThreadRunner : ForkRunner
    runner_class.new(n_parallel)
  end
end
15
+
16
# Runs queued jobs in forked child processes, keeping at most
# +n_parallel+ children alive at any one time.
class ForkRunner

  # @param n_parallel [Integer] maximum number of concurrent children
  def initialize(n_parallel)
    @n_parallel = n_parallel
    @jobs = []
    @kids = Set.new
  end

  # Queue a job. Nothing is executed until {#run} is called.
  #
  # @yield the work to perform in a child process
  def add(&proc)
    @jobs << proc
  end

  # Start queued jobs (up to the parallelism limit) and block until
  # every job has been started and every child has been reaped.
  #
  # @raise [RuntimeError] if a child exits unsuccessfully, or if a
  #   process that this runner did not start is reaped
  def run
    until @jobs.empty? && @kids.empty?
      start_job while can_start?
      await
    end
  end

  private

  # Whether there is a pending job and a free slot to run it in.
  def can_start?
    !@jobs.empty? && @kids.size < @n_parallel
  end

  # Fork a child for the next queued job. The parent records the
  # child's PID; the child runs the job and exits 0, or logs the
  # exception and exits 1.
  def start_job
    job = @jobs.shift
    pid = fork
    if pid
      # parent
      @kids << pid
    else
      # child
      begin
        job.call
        exit 0
      rescue SystemExit
        # an explicit exit from the job keeps its own status
        raise
      rescue Exception
        # LOG is not defined in this class; presumably a logger
        # constant supplied by the enclosing library.
        LOG.error $!
        exit 1
      end
    end
  end

  # Wait for any child to finish and verify that it succeeded.
  #
  # The exit status is captured together with the PID via
  # Process.wait2, so the failure message can report it. (It was
  # previously read from an undefined local, turning every job
  # failure into a NameError.)
  def await
    pid, status = Process.wait2
    unless @kids.delete?(pid)
      raise "Completion of unexpected job #{pid}!"
    end
    unless status.success?
      raise "Job #{pid} failed with status #{status.exitstatus}!"
    end
  end

end
74
+
75
# JRuby-only job runner: jobs are submitted to a fixed-size
# java.util.concurrent thread pool through an
# ExecutorCompletionService.
class JThreadRunner

  # @param n_parallel [Integer] size of the fixed thread pool
  def initialize(n_parallel)
    @n_parallel = n_parallel
    @exec = java.util.concurrent.Executors.newFixedThreadPool(n_parallel)
    @ecs = java.util.concurrent.ExecutorCompletionService.new(@exec)
    @n = 0
  end

  # Submit a job to the pool and count it as outstanding.
  def add(&blk)
    @ecs.submit(&blk)
    @n += 1
  end

  # Block until every submitted job has completed, then shut the pool
  # down. If retrieving a job's result raises, the error is logged,
  # the pool is shut down immediately and the exception is re-raised.
  def run
    finished = 0
    while finished != @n
      future = @ecs.take
      begin
        future.get
      rescue Exception => err
        LOG.error err
        @exec.shutdownNow
        raise
      end
      finished += 1
    end
    @exec.shutdown
  end

end
106
+
107
# Factory for task executors.
module Executor
  # Build the executor implementation that matches the current platform.
  #
  # @return [JExecutor, DummyExecutor] a JExecutor when RUBY_PLATFORM
  #   is 'java', otherwise a DummyExecutor
  def self.create
    executor_class = RUBY_PLATFORM == 'java' ? JExecutor : DummyExecutor
    executor_class.new
  end
end
116
+
117
# JRuby-only executor: a ThreadPoolExecutor with a single worker
# thread, a LinkedBlockingQueue of capacity 8 and CallerRunsPolicy as
# its rejection handler, fronted by an ExecutorCompletionService so
# that finished tasks can be collected and their errors surfaced.
class JExecutor

  def initialize
    work_queue = java.util.concurrent.LinkedBlockingQueue.new(8)
    rejection = java.util.concurrent.ThreadPoolExecutor::CallerRunsPolicy.new
    @exec = java.util.concurrent.ThreadPoolExecutor.new(
      1, 1, 1,
      java.util.concurrent.TimeUnit::MINUTES,
      work_queue,
      rejection)
    @ecs = java.util.concurrent.ExecutorCompletionService.new(@exec)
    @submitted = 0
    @completed = 0
  end

  # Submit a task, then collect any tasks that have already finished.
  def submit(&blk)
    @ecs.submit(&blk)
    @submitted += 1
    check_for_errors
  end

  # Collect every task that has finished so far without blocking.
  # Calling +get+ on each future re-raises any error the task hit.
  def check_for_errors
    while (done = @ecs.poll)
      collect(done)
    end
  end

  # Stop accepting new tasks and block until every submitted task has
  # been collected.
  def shutdown
    @exec.shutdown
    collect(@ecs.take) while @completed != @submitted
  end

  private

  # Retrieve one finished task's result and count it as completed.
  def collect(future)
    future.get
    @completed += 1
  end
end
153
+
154
# Executor that runs each submitted block immediately in the calling
# thread.
class DummyExecutor

  # Run the given block right away.
  #
  # @return the block's result
  def submit
    yield
  end

  # Nothing to clean up.
  def shutdown; end

end
167
+
168
+ end
@@ -58,10 +58,14 @@ module Bio
58
58
  attr_reader :sequences
59
59
  # Offset of the alignment block within the MAF file, in bytes.
60
60
  # @return [Integer]
61
- attr_reader :offset
61
+ attr_accessor :offset
62
62
  # Size of the alignment block within the MAF file, in bytes.
63
63
  # @return [Integer]
64
64
  attr_reader :size
65
+ # Original text of the MAF block. Only available if the
66
+ # :retain_text parser option is set.
67
+ # @return [String]
68
+ attr_accessor :orig_text
65
69
 
66
70
  def initialize(vars, sequences, offset, size, filtered)
67
71
  @vars = vars
@@ -90,6 +94,10 @@ module Bio
90
94
  sequences.first.text.size
91
95
  end
92
96
 
97
# Upcase every sequence of this block in place by sending +upcase!+
# to each element of +sequences+.
def upcase!
  sequences.each(&:upcase!)
end
100
+
93
101
  # Whether this block has been modified by a parser filter.
94
102
  # @return [Boolean]
95
103
  def filtered?
@@ -101,6 +109,13 @@ module Bio
101
109
  Bio::BioAlignment::Alignment.new(ba_seq)
102
110
  end
103
111
 
112
# Render this block as text by writing it through a Writer into an
# in-memory buffer.
#
# @return [String] whatever the Writer emitted for this block
def to_s
  out = StringIO.new
  Writer.new(out).write_block(self)
  out.string
end
118
+
104
119
  GAP = /-+/
105
120
 
106
121
  # Find gaps present in all sequences. These would generally
@@ -356,6 +371,10 @@ module Bio
356
371
  end
357
372
  end
358
373
 
374
# Upcase this sequence's text in place.
#
# @return whatever +text.upcase!+ returns
def upcase!
  text.upcase!
end
377
+
359
378
  def to_bio_alignment
360
379
  Bio::BioAlignment::Sequence.new(source, text)
361
380
  end
@@ -471,6 +490,10 @@ module Bio
471
490
  true
472
491
  end
473
492
 
493
# Deliberately a no-op.
#
# @return [nil]
def upcase!; end
496
+
474
497
  def write_fasta(writer)
475
498
  raise "empty sequence output not implemented!"
476
499
  end
@@ -1,4 +1,5 @@
1
1
  require 'strscan'
2
+ require 'zlib'
2
3
  require 'java' if RUBY_PLATFORM == 'java'
3
4
  require 'bio-bgzf'
4
5
 
@@ -104,6 +105,7 @@ module Bio
104
105
 
105
106
  # Spawn a read-ahead thread. Called from {#initialize}.
106
107
  def start_read_ahead
108
+ LOG.debug { "Starting read-ahead thread." }
107
109
  @read_thread = Thread.new { read_ahead }
108
110
  end
109
111
 
@@ -169,6 +171,7 @@ module Bio
169
171
  BLOCK_START = /^(?=a)/
170
172
  BLOCK_START_OR_EOS = /(?:^(?=a))|\z/
171
173
  EOL_OR_EOF = /\n|\z/
174
+ JRUBY_P = (RUBY_PLATFORM == 'java')
172
175
 
173
176
  def set_last_block_pos!
174
177
  @last_block_pos = s.string.rindex(BLOCK_START)
@@ -333,14 +336,22 @@ module Bio
333
336
  elsif [I, E, Q, COMMENT, nil].include? first
334
337
  next
335
338
  else
336
- parse_error "unexpected line: '#{line}'"
339
+ if opts[:strict]
340
+ parse_error "unexpected line: '#{line}'"
341
+ else
342
+ LOG.warn "Ignoring invalid MAF line: '#{line}'"
343
+ end
337
344
  end
338
345
  end
339
- Block.new(block_vars,
340
- seqs,
341
- block_offset,
342
- s.pos - block_start_pos,
343
- filtered)
346
+ b = Block.new(block_vars,
347
+ seqs,
348
+ block_offset,
349
+ s.pos - block_start_pos,
350
+ filtered)
351
+ if opts[:retain_text]
352
+ b.orig_text = s.string.slice(block_start_pos...(s.pos))
353
+ end
354
+ return b
344
355
  end
345
356
 
346
357
  # Parse an 's' line.
@@ -504,12 +515,16 @@ module Bio
504
515
  # * `:parse_extended`: whether to parse 'i' and 'q' lines
505
516
  # * `:parse_empty`: whether to parse 'e' lines
506
517
  # * `:remove_gaps`: remove gaps left after filtering sequences
518
+ # * `:join_blocks`: join blocks where possible
519
+ # * `:upcase`: fold sequence data to upper case
507
520
  # * `:chunk_size`: read MAF file in chunks of this many bytes
508
521
  # * `:random_chunk_size`: as above, but for random access ({#fetch_blocks})
509
522
  # * `:merge_max`: merge up to this many bytes of blocks for
510
523
  # random access
511
524
  # * `:threads`: number of threads to use for parallel
512
525
  # parsing. Only useful under JRuby.
526
+ # * `:strict`: abort on un-parseable lines instead of continuing with
527
+ # a warning.
513
528
  # @api public
514
529
 
515
530
  class Parser
@@ -519,8 +534,12 @@ module Bio
519
534
  attr_reader :header
520
535
  # @return [String] path of MAF file being parsed.
521
536
  attr_reader :file_spec
522
- # @return [File] file handle for MAF file.
537
+ # @return [IO] file handle for MAF file.
523
538
  attr_reader :f
539
+ # May be gzip-compressed.
540
+ # @return [IO] file handle for physical MAF file.
541
+ # @api private
542
+ attr_reader :phys_f
524
543
  # @return [StringScanner] scanner for parsing.
525
544
  attr_reader :s
526
545
  # @return [ChunkReader] ChunkReader.
@@ -547,33 +566,47 @@ module Bio
547
566
  RANDOM_CHUNK_SIZE = 4096
548
567
  MERGE_MAX = SEQ_CHUNK_SIZE
549
568
 
569
+ DEFAULT_OPTS = {
570
+ :chunk_size => SEQ_CHUNK_SIZE,
571
+ :random_chunk_size => RANDOM_CHUNK_SIZE,
572
+ :merge_max => MERGE_MAX,
573
+ :parse_extended => false,
574
+ :parse_empty => false,
575
+ :readahead_thread => true,
576
+ :seq_parse_thread => true
577
+ }
578
+ if JRUBY_P
579
+ DEFAULT_OPTS[:threads] = java.lang.Runtime.runtime.availableProcessors
580
+ end
581
+
550
582
  # Create a new parser instance.
551
583
  #
552
584
  # @param [String] file_spec path of file to parse.
553
- # @param [Hash] opts parser options.
585
+ # @param [Hash] parse_opts parser options.
554
586
  # @api public
555
- def initialize(file_spec, opts={})
587
+ def initialize(file_spec, parse_opts={})
588
+ opts = DEFAULT_OPTS.merge(parse_opts)
556
589
  @opts = opts
557
- if RUBY_PLATFORM == 'java'
558
- opts[:threads] ||= java.lang.Runtime.runtime.availableProcessors
559
- end
560
- chunk_size = opts[:chunk_size] || SEQ_CHUNK_SIZE
561
- @random_access_chunk_size = opts[:random_chunk_size] || RANDOM_CHUNK_SIZE
562
- @merge_max = opts[:merge_max] || MERGE_MAX
563
- @parse_extended = opts[:parse_extended] || false
564
- @parse_empty = opts[:parse_empty] || false
590
+ @random_access_chunk_size = opts[:random_chunk_size]
591
+ @merge_max = opts[:merge_max]
592
+ @parse_extended = opts[:parse_extended]
593
+ @parse_empty = opts[:parse_empty]
565
594
  @chunk_start = 0
566
595
  if file_spec.respond_to? :flush
596
+ # an IO object
567
597
  # guess what, Pathnames respond to :read...
568
598
  @f = file_spec
569
599
  @file_spec = @f.path if @f.respond_to?(:path)
570
- # TODO: gzip?
600
+ # TODO: test for gzip?
571
601
  else
602
+ # a pathname (or Pathname)
572
603
  @file_spec = file_spec
604
+ @phys_f = File.open(file_spec)
573
605
  if file_spec.to_s.end_with?(".maf.gz")
574
- @f = IO.popen("gzip -dc #{file_spec}")
606
+ @f = Zlib::GzipReader.new(@phys_f)
607
+ @compression = :gzip
575
608
  else
576
- @f = File.open(file_spec)
609
+ @f = @phys_f
577
610
  end
578
611
  end
579
612
  if @file_spec.to_s =~ /\.bgzf?$/
@@ -582,8 +615,9 @@ module Bio
582
615
  else
583
616
  @base_reader = ChunkReader
584
617
  end
585
- @cr = base_reader.new(@f, chunk_size)
586
- if RUBY_PLATFORM == 'java'
618
+ @cr = base_reader.new(@f, opts[:chunk_size])
619
+ if JRUBY_P && opts[:readahead_thread]
620
+ LOG.debug "Using ThreadedChunkReaderWrapper."
587
621
  @cr = ThreadedChunkReaderWrapper.new(@cr)
588
622
  end
589
623
  @s = StringScanner.new(cr.read_chunk())
@@ -649,7 +683,7 @@ module Bio
649
683
  def fetch_blocks(fetch_list, &blk)
650
684
  if blk
651
685
  merged = merge_fetch_list(fetch_list)
652
- if RUBY_PLATFORM == 'java' && @opts.fetch(:threads, 1) > 1
686
+ if JRUBY_P && @opts.fetch(:threads, 1) > 1
653
687
  fun = lambda { |&b2| fetch_blocks_merged_parallel(merged, &b2) }
654
688
  else
655
689
  fun = lambda { |&b2| fetch_blocks_merged(merged, &b2) }
@@ -667,15 +701,17 @@ module Bio
667
701
  def fetch_blocks_merged(fetch_list, &blk)
668
702
  start = Time.now
669
703
  total_size = fetch_list.collect { |e| e[1] }.reduce(:+)
704
+ count = 0
670
705
  with_context(@random_access_chunk_size) do |ctx|
671
706
  fetch_list.each do |e|
672
707
  ctx.fetch_blocks(*e, &blk)
708
+ count += 1
673
709
  end
674
710
  end
675
711
  elapsed = Time.now - start
676
712
  rate = (total_size / 1048576.0) / elapsed
677
- LOG.debug { sprintf("Fetched blocks in %.3fs, %.1f MB/s.",
678
- elapsed, rate) }
713
+ LOG.debug { sprintf("Fetched %d blocks in %.3fs, %.1f MB/s.",
714
+ count, elapsed, rate) }
679
715
  end
680
716
 
681
717
  # Fetch and parse the blocks given by the merged fetch list, in
@@ -807,7 +843,9 @@ module Bio
807
843
  end
808
844
  end
809
845
  @header = Header.new(vars, align_params)
810
- s.skip_until BLOCK_START || parse_error("Cannot find block start!")
846
+ if ! s.skip_until(BLOCK_START)
847
+ @at_end = true
848
+ end
811
849
  end
812
850
 
813
851
  # Parse all alignment blocks until EOF.
@@ -820,7 +858,7 @@ module Bio
820
858
  # @api public
821
859
  def each_block(&blk)
822
860
  if block_given?
823
- if RUBY_PLATFORM == 'java' && @opts.has_key?(:threads)
861
+ if JRUBY_P && opts[:seq_parse_thread]
824
862
  fun = method(:parse_blocks_parallel)
825
863
  else
826
864
  fun = method(:each_block_seq)
@@ -847,11 +885,12 @@ module Bio
847
885
  b
848
886
  end
849
887
 
850
- WRAP_OPTS = [:as_bio_alignment, :join_blocks, :remove_gaps]
888
+ WRAP_OPTS = [:as_bio_alignment, :join_blocks, :remove_gaps, :upcase]
851
889
 
852
890
  def wrap_block_seq(fun, &blk)
853
891
  opts = WRAP_OPTS.find_all { |o| @opts[o] }
854
892
  opts << :sequence_filter if sequence_filter && (! sequence_filter.empty?)
893
+ LOG.debug { "wrapping #{fun} with #{opts.inspect}" }
855
894
  _wrap(opts, fun, &blk)
856
895
  end
857
896
 
@@ -873,6 +912,12 @@ module Bio
873
912
  fun,
874
913
  :to_bio_alignment,
875
914
  &blk)
915
+ when :upcase
916
+ conv_send(options,
917
+ fun,
918
+ :upcase!,
919
+ true,
920
+ &blk)
876
921
  when :remove_gaps
877
922
  conv_map(options,
878
923
  fun,
@@ -910,10 +955,14 @@ module Bio
910
955
  end
911
956
  end
912
957
 
913
- def conv_send(options, search, sym)
958
+ def conv_send(options, search, sym, always_yield_block=false)
914
959
  _wrap(options, search) do |block|
915
960
  v = block.send(sym)
916
- yield v if v
961
+ if always_yield_block
962
+ yield block
963
+ else
964
+ yield v if v
965
+ end
917
966
  end
918
967
  end
919
968
 
@@ -925,14 +974,17 @@ module Bio
925
974
  queue = java.util.concurrent.LinkedBlockingQueue.new(128)
926
975
  worker = Thread.new do
927
976
  begin
977
+ LOG.debug "Starting parse worker."
928
978
  until at_end
929
979
  block = _parse_block()
930
980
  queue.put(block) if block
931
981
  end
932
982
  queue.put(:eof)
933
- rescue
934
- LOG.error "worker exiting: #{$!.class}: #{$!}"
983
+ LOG.debug { "Parse worker reached EOF." }
984
+ rescue Exception
935
985
  LOG.error $!
986
+ Thread.current[:exception] = $!
987
+ raise
936
988
  end
937
989
  end
938
990
  saw_eof = false
@@ -946,12 +998,15 @@ module Bio
946
998
  yield block
947
999
  else
948
1000
  # timed out
949
- n_final_poll += 1 unless worker.alive?
1001
+ unless worker.alive?
1002
+ LOG.debug "Worker has exited."
1003
+ n_final_poll += 1
1004
+ end
950
1005
  end
951
1006
  break if n_final_poll > 1
952
1007
  end
953
1008
  unless saw_eof
954
- raise "worker exited unexpectedly!"
1009
+ raise "worker exited unexpectedly from #{worker[:exception]}!"
955
1010
  end
956
1011
  end
957
1012
 
@@ -1000,7 +1055,7 @@ module Bio
1000
1055
 
1001
1056
  def handle_logging_options(opts)
1002
1057
  opts.on("--logger filename", String,
1003
- "Log to file (default STDOUT)") do |name|
1058
+ "Log to file (default STDERR)") do |name|
1004
1059
  Bio::Log::CLI.logger(name)
1005
1060
  end
1006
1061
  opts.on("--trace options", String,