bio-maf 1.0.0-java → 1.0.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,168 @@
1
+ require 'set'
2
+ require 'java' if RUBY_PLATFORM == 'java'
3
+
4
+ module Bio::MAF
5
+
6
+ module JobRunner
7
+ def JobRunner.create(n_parallel)
8
+ if RUBY_PLATFORM == 'java'
9
+ JThreadRunner.new(n_parallel)
10
+ else
11
+ ForkRunner.new(n_parallel)
12
+ end
13
+ end
14
+ end
15
+
16
+ class ForkRunner
17
+
18
+ def initialize(n_parallel)
19
+ @n_parallel = n_parallel
20
+ @jobs = []
21
+ @kids = Set.new
22
+ end
23
+
24
+ def add(&proc)
25
+ @jobs << proc
26
+ end
27
+
28
+ def run
29
+ until @jobs.empty? && @kids.empty?
30
+ while can_start?
31
+ start_job
32
+ end
33
+ await
34
+ end
35
+ end
36
+
37
+ private
38
+
39
+ def can_start?
40
+ (! @jobs.empty?) && @kids.size < @n_parallel
41
+ end
42
+
43
+ def start_job
44
+ job = @jobs.shift
45
+ pid = fork()
46
+ if pid
47
+ # parent
48
+ @kids << pid
49
+ else
50
+ # child
51
+ begin
52
+ job.call()
53
+ exit 0
54
+ rescue SystemExit
55
+ raise
56
+ rescue Exception
57
+ LOG.error $!
58
+ exit 1
59
+ end
60
+ end
61
+ end
62
+
63
+ def await
64
+ pid = Process.wait
65
+ unless @kids.delete?(pid)
66
+ raise "Completion of unexpected job #{pid}!"
67
+ end
68
+ if ! $?.success?
69
+ raise "Job #{pid} failed with status #{status.exitstatus}!"
70
+ end
71
+ end
72
+
73
+ end
74
+
75
+ class JThreadRunner
76
+
77
+ def initialize(n_parallel)
78
+ @n_parallel = n_parallel
79
+ @exec = java.util.concurrent.Executors.newFixedThreadPool(n_parallel)
80
+ @ecs = java.util.concurrent.ExecutorCompletionService.new(@exec)
81
+ @n = 0
82
+ end
83
+
84
+ def add(&blk)
85
+ @ecs.submit(&blk)
86
+ @n += 1
87
+ end
88
+
89
+ def run
90
+ seen = 0
91
+ until seen == @n
92
+ f = @ecs.take()
93
+ begin
94
+ f.get()
95
+ rescue Exception => e
96
+ LOG.error e
97
+ @exec.shutdownNow()
98
+ raise
99
+ end
100
+ seen += 1
101
+ end
102
+ @exec.shutdown()
103
+ end
104
+
105
+ end
106
+
107
+ module Executor
108
+ def Executor.create
109
+ if RUBY_PLATFORM == 'java'
110
+ JExecutor.new
111
+ else
112
+ DummyExecutor.new
113
+ end
114
+ end
115
+ end
116
+
117
+ class JExecutor
118
+
119
+ def initialize
120
+ queue = java.util.concurrent.LinkedBlockingQueue.new(8)
121
+ policy = java.util.concurrent.ThreadPoolExecutor::CallerRunsPolicy.new
122
+ @exec = java.util.concurrent.ThreadPoolExecutor.new(1, 1, 1,
123
+ java.util.concurrent.TimeUnit::MINUTES,
124
+ queue,
125
+ policy)
126
+ @ecs = java.util.concurrent.ExecutorCompletionService.new(@exec)
127
+ @submitted = 0
128
+ @completed = 0
129
+ end
130
+
131
+ def submit(&blk)
132
+ @ecs.submit(&blk)
133
+ @submitted += 1
134
+ check_for_errors
135
+ end
136
+
137
+ def check_for_errors
138
+ while f = @ecs.poll
139
+ f.get
140
+ @completed += 1
141
+ end
142
+ end
143
+
144
+ def shutdown
145
+ @exec.shutdown
146
+ until @completed == @submitted
147
+ f = @ecs.take
148
+ f.get
149
+ @completed += 1
150
+ end
151
+ end
152
+ end
153
+
154
+ class DummyExecutor
155
+
156
+ def initialize
157
+ end
158
+
159
+ def submit
160
+ yield
161
+ end
162
+
163
+ def shutdown
164
+ end
165
+
166
+ end
167
+
168
+ end
@@ -58,10 +58,14 @@ module Bio
58
58
  attr_reader :sequences
59
59
  # Offset of the alignment block within the MAF file, in bytes.
60
60
  # @return [Integer]
61
- attr_reader :offset
61
+ attr_accessor :offset
62
62
  # Size of the alignment block within the MAF file, in bytes.
63
63
  # @return [Integer]
64
64
  attr_reader :size
65
+ # Original text of the MAF block. Only available if the
66
+ # :retain_text parser option is set.
67
+ # @return [String]
68
+ attr_accessor :orig_text
65
69
 
66
70
  def initialize(vars, sequences, offset, size, filtered)
67
71
  @vars = vars
@@ -90,6 +94,10 @@ module Bio
90
94
  sequences.first.text.size
91
95
  end
92
96
 
97
+ def upcase!
98
+ sequences.each { |s| s.upcase! }
99
+ end
100
+
93
101
  # Whether this block has been modified by a parser filter.
94
102
  # @return [Boolean]
95
103
  def filtered?
@@ -101,6 +109,13 @@ module Bio
101
109
  Bio::BioAlignment::Alignment.new(ba_seq)
102
110
  end
103
111
 
112
+ def to_s
113
+ buf = StringIO.new
114
+ writer = Writer.new(buf)
115
+ writer.write_block(self)
116
+ return buf.string
117
+ end
118
+
104
119
  GAP = /-+/
105
120
 
106
121
  # Find gaps present in all sequences. These would generally
@@ -356,6 +371,10 @@ module Bio
356
371
  end
357
372
  end
358
373
 
374
+ def upcase!
375
+ text.upcase!
376
+ end
377
+
359
378
  def to_bio_alignment
360
379
  Bio::BioAlignment::Sequence.new(source, text)
361
380
  end
@@ -471,6 +490,10 @@ module Bio
471
490
  true
472
491
  end
473
492
 
493
+ def upcase!
494
+ # no-op
495
+ end
496
+
474
497
  def write_fasta(writer)
475
498
  raise "empty sequence output not implemented!"
476
499
  end
@@ -1,4 +1,5 @@
1
1
  require 'strscan'
2
+ require 'zlib'
2
3
  require 'java' if RUBY_PLATFORM == 'java'
3
4
  require 'bio-bgzf'
4
5
 
@@ -104,6 +105,7 @@ module Bio
104
105
 
105
106
  # Spawn a read-ahead thread. Called from {#initialize}.
106
107
  def start_read_ahead
108
+ LOG.debug { "Starting read-ahead thread." }
107
109
  @read_thread = Thread.new { read_ahead }
108
110
  end
109
111
 
@@ -169,6 +171,7 @@ module Bio
169
171
  BLOCK_START = /^(?=a)/
170
172
  BLOCK_START_OR_EOS = /(?:^(?=a))|\z/
171
173
  EOL_OR_EOF = /\n|\z/
174
+ JRUBY_P = (RUBY_PLATFORM == 'java')
172
175
 
173
176
  def set_last_block_pos!
174
177
  @last_block_pos = s.string.rindex(BLOCK_START)
@@ -333,14 +336,22 @@ module Bio
333
336
  elsif [I, E, Q, COMMENT, nil].include? first
334
337
  next
335
338
  else
336
- parse_error "unexpected line: '#{line}'"
339
+ if opts[:strict]
340
+ parse_error "unexpected line: '#{line}'"
341
+ else
342
+ LOG.warn "Ignoring invalid MAF line: '#{line}'"
343
+ end
337
344
  end
338
345
  end
339
- Block.new(block_vars,
340
- seqs,
341
- block_offset,
342
- s.pos - block_start_pos,
343
- filtered)
346
+ b = Block.new(block_vars,
347
+ seqs,
348
+ block_offset,
349
+ s.pos - block_start_pos,
350
+ filtered)
351
+ if opts[:retain_text]
352
+ b.orig_text = s.string.slice(block_start_pos...(s.pos))
353
+ end
354
+ return b
344
355
  end
345
356
 
346
357
  # Parse an 's' line.
@@ -504,12 +515,16 @@ module Bio
504
515
  # * `:parse_extended`: whether to parse 'i' and 'q' lines
505
516
  # * `:parse_empty`: whether to parse 'e' lines
506
517
  # * `:remove_gaps`: remove gaps left after filtering sequences
518
+ # * `:join_blocks`: join blocks where possible
519
+ # * `:upcase`: fold sequence data to upper case
507
520
  # * `:chunk_size`: read MAF file in chunks of this many bytes
508
521
  # * `:random_chunk_size`: as above, but for random access ({#fetch_blocks})
509
522
  # * `:merge_max`: merge up to this many bytes of blocks for
510
523
  # random access
511
524
  # * `:threads`: number of threads to use for parallel
512
525
  # parsing. Only useful under JRuby.
526
+ # * `:strict`: abort on un-parseable lines instead of continuing with
527
+ # a warning.
513
528
  # @api public
514
529
 
515
530
  class Parser
@@ -519,8 +534,12 @@ module Bio
519
534
  attr_reader :header
520
535
  # @return [String] path of MAF file being parsed.
521
536
  attr_reader :file_spec
522
- # @return [File] file handle for MAF file.
537
+ # @return [IO] file handle for MAF file.
523
538
  attr_reader :f
539
+ # May be gzip-compressed.
540
+ # @return [IO] file handle for physical MAF file.
541
+ # @api private
542
+ attr_reader :phys_f
524
543
  # @return [StringScanner] scanner for parsing.
525
544
  attr_reader :s
526
545
  # @return [ChunkReader] ChunkReader.
@@ -547,33 +566,47 @@ module Bio
547
566
  RANDOM_CHUNK_SIZE = 4096
548
567
  MERGE_MAX = SEQ_CHUNK_SIZE
549
568
 
569
+ DEFAULT_OPTS = {
570
+ :chunk_size => SEQ_CHUNK_SIZE,
571
+ :random_chunk_size => RANDOM_CHUNK_SIZE,
572
+ :merge_max => MERGE_MAX,
573
+ :parse_extended => false,
574
+ :parse_empty => false,
575
+ :readahead_thread => true,
576
+ :seq_parse_thread => true
577
+ }
578
+ if JRUBY_P
579
+ DEFAULT_OPTS[:threads] = java.lang.Runtime.runtime.availableProcessors
580
+ end
581
+
550
582
  # Create a new parser instance.
551
583
  #
552
584
  # @param [String] file_spec path of file to parse.
553
- # @param [Hash] opts parser options.
585
+ # @param [Hash] parse_opts parser options.
554
586
  # @api public
555
- def initialize(file_spec, opts={})
587
+ def initialize(file_spec, parse_opts={})
588
+ opts = DEFAULT_OPTS.merge(parse_opts)
556
589
  @opts = opts
557
- if RUBY_PLATFORM == 'java'
558
- opts[:threads] ||= java.lang.Runtime.runtime.availableProcessors
559
- end
560
- chunk_size = opts[:chunk_size] || SEQ_CHUNK_SIZE
561
- @random_access_chunk_size = opts[:random_chunk_size] || RANDOM_CHUNK_SIZE
562
- @merge_max = opts[:merge_max] || MERGE_MAX
563
- @parse_extended = opts[:parse_extended] || false
564
- @parse_empty = opts[:parse_empty] || false
590
+ @random_access_chunk_size = opts[:random_chunk_size]
591
+ @merge_max = opts[:merge_max]
592
+ @parse_extended = opts[:parse_extended]
593
+ @parse_empty = opts[:parse_empty]
565
594
  @chunk_start = 0
566
595
  if file_spec.respond_to? :flush
596
+ # an IO object
567
597
  # guess what, Pathnames respond to :read...
568
598
  @f = file_spec
569
599
  @file_spec = @f.path if @f.respond_to?(:path)
570
- # TODO: gzip?
600
+ # TODO: test for gzip?
571
601
  else
602
+ # a pathname (or Pathname)
572
603
  @file_spec = file_spec
604
+ @phys_f = File.open(file_spec)
573
605
  if file_spec.to_s.end_with?(".maf.gz")
574
- @f = IO.popen("gzip -dc #{file_spec}")
606
+ @f = Zlib::GzipReader.new(@phys_f)
607
+ @compression = :gzip
575
608
  else
576
- @f = File.open(file_spec)
609
+ @f = @phys_f
577
610
  end
578
611
  end
579
612
  if @file_spec.to_s =~ /\.bgzf?$/
@@ -582,8 +615,9 @@ module Bio
582
615
  else
583
616
  @base_reader = ChunkReader
584
617
  end
585
- @cr = base_reader.new(@f, chunk_size)
586
- if RUBY_PLATFORM == 'java'
618
+ @cr = base_reader.new(@f, opts[:chunk_size])
619
+ if JRUBY_P && opts[:readahead_thread]
620
+ LOG.debug "Using ThreadedChunkReaderWrapper."
587
621
  @cr = ThreadedChunkReaderWrapper.new(@cr)
588
622
  end
589
623
  @s = StringScanner.new(cr.read_chunk())
@@ -649,7 +683,7 @@ module Bio
649
683
  def fetch_blocks(fetch_list, &blk)
650
684
  if blk
651
685
  merged = merge_fetch_list(fetch_list)
652
- if RUBY_PLATFORM == 'java' && @opts.fetch(:threads, 1) > 1
686
+ if JRUBY_P && @opts.fetch(:threads, 1) > 1
653
687
  fun = lambda { |&b2| fetch_blocks_merged_parallel(merged, &b2) }
654
688
  else
655
689
  fun = lambda { |&b2| fetch_blocks_merged(merged, &b2) }
@@ -667,15 +701,17 @@ module Bio
667
701
  def fetch_blocks_merged(fetch_list, &blk)
668
702
  start = Time.now
669
703
  total_size = fetch_list.collect { |e| e[1] }.reduce(:+)
704
+ count = 0
670
705
  with_context(@random_access_chunk_size) do |ctx|
671
706
  fetch_list.each do |e|
672
707
  ctx.fetch_blocks(*e, &blk)
708
+ count += 1
673
709
  end
674
710
  end
675
711
  elapsed = Time.now - start
676
712
  rate = (total_size / 1048576.0) / elapsed
677
- LOG.debug { sprintf("Fetched blocks in %.3fs, %.1f MB/s.",
678
- elapsed, rate) }
713
+ LOG.debug { sprintf("Fetched %d blocks in %.3fs, %.1f MB/s.",
714
+ count, elapsed, rate) }
679
715
  end
680
716
 
681
717
  # Fetch and parse the blocks given by the merged fetch list, in
@@ -807,7 +843,9 @@ module Bio
807
843
  end
808
844
  end
809
845
  @header = Header.new(vars, align_params)
810
- s.skip_until BLOCK_START || parse_error("Cannot find block start!")
846
+ if ! s.skip_until(BLOCK_START)
847
+ @at_end = true
848
+ end
811
849
  end
812
850
 
813
851
  # Parse all alignment blocks until EOF.
@@ -820,7 +858,7 @@ module Bio
820
858
  # @api public
821
859
  def each_block(&blk)
822
860
  if block_given?
823
- if RUBY_PLATFORM == 'java' && @opts.has_key?(:threads)
861
+ if JRUBY_P && opts[:seq_parse_thread]
824
862
  fun = method(:parse_blocks_parallel)
825
863
  else
826
864
  fun = method(:each_block_seq)
@@ -847,11 +885,12 @@ module Bio
847
885
  b
848
886
  end
849
887
 
850
- WRAP_OPTS = [:as_bio_alignment, :join_blocks, :remove_gaps]
888
+ WRAP_OPTS = [:as_bio_alignment, :join_blocks, :remove_gaps, :upcase]
851
889
 
852
890
  def wrap_block_seq(fun, &blk)
853
891
  opts = WRAP_OPTS.find_all { |o| @opts[o] }
854
892
  opts << :sequence_filter if sequence_filter && (! sequence_filter.empty?)
893
+ LOG.debug { "wrapping #{fun} with #{opts.inspect}" }
855
894
  _wrap(opts, fun, &blk)
856
895
  end
857
896
 
@@ -873,6 +912,12 @@ module Bio
873
912
  fun,
874
913
  :to_bio_alignment,
875
914
  &blk)
915
+ when :upcase
916
+ conv_send(options,
917
+ fun,
918
+ :upcase!,
919
+ true,
920
+ &blk)
876
921
  when :remove_gaps
877
922
  conv_map(options,
878
923
  fun,
@@ -910,10 +955,14 @@ module Bio
910
955
  end
911
956
  end
912
957
 
913
- def conv_send(options, search, sym)
958
+ def conv_send(options, search, sym, always_yield_block=false)
914
959
  _wrap(options, search) do |block|
915
960
  v = block.send(sym)
916
- yield v if v
961
+ if always_yield_block
962
+ yield block
963
+ else
964
+ yield v if v
965
+ end
917
966
  end
918
967
  end
919
968
 
@@ -925,14 +974,17 @@ module Bio
925
974
  queue = java.util.concurrent.LinkedBlockingQueue.new(128)
926
975
  worker = Thread.new do
927
976
  begin
977
+ LOG.debug "Starting parse worker."
928
978
  until at_end
929
979
  block = _parse_block()
930
980
  queue.put(block) if block
931
981
  end
932
982
  queue.put(:eof)
933
- rescue
934
- LOG.error "worker exiting: #{$!.class}: #{$!}"
983
+ LOG.debug { "Parse worker reached EOF." }
984
+ rescue Exception
935
985
  LOG.error $!
986
+ Thread.current[:exception] = $!
987
+ raise
936
988
  end
937
989
  end
938
990
  saw_eof = false
@@ -946,12 +998,15 @@ module Bio
946
998
  yield block
947
999
  else
948
1000
  # timed out
949
- n_final_poll += 1 unless worker.alive?
1001
+ unless worker.alive?
1002
+ LOG.debug "Worker has exited."
1003
+ n_final_poll += 1
1004
+ end
950
1005
  end
951
1006
  break if n_final_poll > 1
952
1007
  end
953
1008
  unless saw_eof
954
- raise "worker exited unexpectedly!"
1009
+ raise "worker exited unexpectedly from #{worker[:exception]}!"
955
1010
  end
956
1011
  end
957
1012
 
@@ -1000,7 +1055,7 @@ module Bio
1000
1055
 
1001
1056
  def handle_logging_options(opts)
1002
1057
  opts.on("--logger filename", String,
1003
- "Log to file (default STDOUT)") do |name|
1058
+ "Log to file (default STDERR)") do |name|
1004
1059
  Bio::Log::CLI.logger(name)
1005
1060
  end
1006
1061
  opts.on("--trace options", String,