bio-maf 1.0.0-java → 1.0.1-java
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/maf_bgzip +140 -12
- data/bin/maf_extract +50 -40
- data/bin/maf_index +11 -2
- data/bin/maf_tile +143 -46
- data/bio-maf.gemspec +3 -3
- data/features/bgzf.feature +45 -0
- data/features/maf-indexing.feature +6 -0
- data/features/maf-parsing.feature +17 -0
- data/features/maf-querying.feature +11 -0
- data/features/slice.feature +11 -0
- data/features/step_definitions/parse_steps.rb +1 -0
- data/features/tiling.feature +23 -5
- data/lib/bio-maf.rb +5 -1
- data/lib/bio/maf.rb +1 -0
- data/lib/bio/maf/index.rb +158 -68
- data/lib/bio/maf/jobs.rb +168 -0
- data/lib/bio/maf/maf.rb +24 -1
- data/lib/bio/maf/parser.rb +90 -35
- data/lib/bio/maf/struct.rb +4 -0
- data/lib/bio/maf/tiler.rb +30 -3
- data/lib/bio/ucsc/ucsc_bin.rb +14 -1
- data/man/maf_bgzip.1 +27 -0
- data/man/maf_bgzip.1.ronn +32 -0
- data/spec/bio/maf/index_spec.rb +3 -1
- data/spec/bio/maf/parser_spec.rb +6 -2
- data/spec/bio/ucsc/ucsc_bin_spec.rb +18 -0
- data/test/data/empty.maf +2 -0
- data/test/data/ext-bin.maf +22 -0
- data/test/data/gap-1.kct +0 -0
- data/test/data/mm8_chr7_tiny.kct +0 -0
- data/test/data/mm8_chrM_tiny.kct +0 -0
- metadata +380 -184
data/lib/bio/maf/jobs.rb
ADDED
@@ -0,0 +1,168 @@
|
|
1
|
+
require 'set'
|
2
|
+
require 'java' if RUBY_PLATFORM == 'java'
|
3
|
+
|
4
|
+
module Bio::MAF
|
5
|
+
|
6
|
+
# Factory for platform-appropriate parallel job runners: JVM threads
# under JRuby, forked child processes elsewhere.
module JobRunner
  # Build a runner able to execute up to +n_parallel+ jobs at once.
  #
  # @param n_parallel [Integer] maximum degree of parallelism.
  # @return [JThreadRunner, ForkRunner] a runner suited to this platform.
  def JobRunner.create(n_parallel)
    runner_class = (RUBY_PLATFORM == 'java') ? JThreadRunner : ForkRunner
    runner_class.new(n_parallel)
  end
end
|
15
|
+
|
16
|
+
class ForkRunner
|
17
|
+
|
18
|
+
def initialize(n_parallel)
|
19
|
+
@n_parallel = n_parallel
|
20
|
+
@jobs = []
|
21
|
+
@kids = Set.new
|
22
|
+
end
|
23
|
+
|
24
|
+
def add(&proc)
|
25
|
+
@jobs << proc
|
26
|
+
end
|
27
|
+
|
28
|
+
def run
|
29
|
+
until @jobs.empty? && @kids.empty?
|
30
|
+
while can_start?
|
31
|
+
start_job
|
32
|
+
end
|
33
|
+
await
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
private
|
38
|
+
|
39
|
+
def can_start?
|
40
|
+
(! @jobs.empty?) && @kids.size < @n_parallel
|
41
|
+
end
|
42
|
+
|
43
|
+
def start_job
|
44
|
+
job = @jobs.shift
|
45
|
+
pid = fork()
|
46
|
+
if pid
|
47
|
+
# parent
|
48
|
+
@kids << pid
|
49
|
+
else
|
50
|
+
# child
|
51
|
+
begin
|
52
|
+
job.call()
|
53
|
+
exit 0
|
54
|
+
rescue SystemExit
|
55
|
+
raise
|
56
|
+
rescue Exception
|
57
|
+
LOG.error $!
|
58
|
+
exit 1
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
def await
|
64
|
+
pid = Process.wait
|
65
|
+
unless @kids.delete?(pid)
|
66
|
+
raise "Completion of unexpected job #{pid}!"
|
67
|
+
end
|
68
|
+
if ! $?.success?
|
69
|
+
raise "Job #{pid} failed with status #{status.exitstatus}!"
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
end
|
74
|
+
|
75
|
+
# Runs jobs on a fixed-size JVM thread pool via an
# ExecutorCompletionService. JRuby only.
class JThreadRunner

  # @param n_parallel [Integer] size of the thread pool.
  def initialize(n_parallel)
    @n_parallel = n_parallel
    @exec = java.util.concurrent.Executors.newFixedThreadPool(n_parallel)
    @ecs = java.util.concurrent.ExecutorCompletionService.new(@exec)
    @n = 0
  end

  # Submit a job to the completion service.
  def add(&blk)
    @ecs.submit(&blk)
    @n += 1
  end

  # Wait for every submitted job to finish. On the first job failure,
  # forcibly stops the pool and re-raises the error.
  def run
    completed = 0
    while completed < @n
      future = @ecs.take()
      begin
        # Future#get re-raises any exception thrown by the task.
        future.get()
      rescue Exception => err
        LOG.error err
        @exec.shutdownNow()
        raise
      end
      completed += 1
    end
    @exec.shutdown()
  end

end
|
106
|
+
|
107
|
+
# Factory for single-consumer executors: a bounded JVM thread-pool
# executor under JRuby, a synchronous stand-in elsewhere.
module Executor
  # @return [JExecutor, DummyExecutor] executor suited to this platform.
  def Executor.create
    RUBY_PLATFORM == 'java' ? JExecutor.new : DummyExecutor.new
  end
end
|
116
|
+
|
117
|
+
# Single-threaded JVM executor with a bounded work queue. JRuby only.
class JExecutor

  # The queue holds at most 8 pending tasks; when it is full,
  # CallerRunsPolicy makes submit execute the task in the calling
  # thread, which throttles producers.
  def initialize
    work_queue = java.util.concurrent.LinkedBlockingQueue.new(8)
    rejection_policy = java.util.concurrent.ThreadPoolExecutor::CallerRunsPolicy.new
    @exec = java.util.concurrent.ThreadPoolExecutor.new(1, 1, 1,
                                                        java.util.concurrent.TimeUnit::MINUTES,
                                                        work_queue,
                                                        rejection_policy)
    @ecs = java.util.concurrent.ExecutorCompletionService.new(@exec)
    @submitted = 0
    @completed = 0
  end

  # Queue a block for execution, then surface any errors from tasks
  # that have already finished.
  def submit(&blk)
    @ecs.submit(&blk)
    @submitted += 1
    check_for_errors
  end

  # Drain all finished futures without blocking; Future#get re-raises
  # any exception the task threw.
  def check_for_errors
    while f = @ecs.poll
      f.get
      @completed += 1
    end
  end

  # Stop accepting work and block until all submitted tasks complete,
  # re-raising the first task error encountered.
  def shutdown
    @exec.shutdown
    while @completed < @submitted
      @ecs.take.get
      @completed += 1
    end
  end
end
|
153
|
+
|
154
|
+
# Synchronous stand-in for JExecutor on non-JVM platforms: submitted
# work runs immediately in the caller's thread.
class DummyExecutor

  def initialize
  end

  # Execute the supplied block right away and return its value.
  def submit
    yield
  end

  # No background work exists, so there is nothing to shut down.
  def shutdown
  end

end
|
167
|
+
|
168
|
+
end
|
data/lib/bio/maf/maf.rb
CHANGED
@@ -58,10 +58,14 @@ module Bio
|
|
58
58
|
attr_reader :sequences
|
59
59
|
# Offset of the alignment block within the MAF file, in bytes.
|
60
60
|
# @return [Integer]
|
61
|
-
|
61
|
+
attr_accessor :offset
|
62
62
|
# Size of the alignment block within the MAF file, in bytes.
|
63
63
|
# @return [Integer]
|
64
64
|
attr_reader :size
|
65
|
+
# Original text of the MAF block. Only available if the
|
66
|
+
# :retain_text parser option is set.
|
67
|
+
# @return [String]
|
68
|
+
attr_accessor :orig_text
|
65
69
|
|
66
70
|
def initialize(vars, sequences, offset, size, filtered)
|
67
71
|
@vars = vars
|
@@ -90,6 +94,10 @@ module Bio
|
|
90
94
|
sequences.first.text.size
|
91
95
|
end
|
92
96
|
|
97
|
+
def upcase!
|
98
|
+
sequences.each { |s| s.upcase! }
|
99
|
+
end
|
100
|
+
|
93
101
|
# Whether this block has been modified by a parser filter.
|
94
102
|
# @return [Boolean]
|
95
103
|
def filtered?
|
@@ -101,6 +109,13 @@ module Bio
|
|
101
109
|
Bio::BioAlignment::Alignment.new(ba_seq)
|
102
110
|
end
|
103
111
|
|
112
|
+
def to_s
|
113
|
+
buf = StringIO.new
|
114
|
+
writer = Writer.new(buf)
|
115
|
+
writer.write_block(self)
|
116
|
+
return buf.string
|
117
|
+
end
|
118
|
+
|
104
119
|
GAP = /-+/
|
105
120
|
|
106
121
|
# Find gaps present in all sequences. These would generally
|
@@ -356,6 +371,10 @@ module Bio
|
|
356
371
|
end
|
357
372
|
end
|
358
373
|
|
374
|
+
def upcase!
|
375
|
+
text.upcase!
|
376
|
+
end
|
377
|
+
|
359
378
|
def to_bio_alignment
|
360
379
|
Bio::BioAlignment::Sequence.new(source, text)
|
361
380
|
end
|
@@ -471,6 +490,10 @@ module Bio
|
|
471
490
|
true
|
472
491
|
end
|
473
492
|
|
493
|
+
def upcase!
|
494
|
+
# no-op
|
495
|
+
end
|
496
|
+
|
474
497
|
def write_fasta(writer)
|
475
498
|
raise "empty sequence output not implemented!"
|
476
499
|
end
|
data/lib/bio/maf/parser.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'strscan'
|
2
|
+
require 'zlib'
|
2
3
|
require 'java' if RUBY_PLATFORM == 'java'
|
3
4
|
require 'bio-bgzf'
|
4
5
|
|
@@ -104,6 +105,7 @@ module Bio
|
|
104
105
|
|
105
106
|
# Spawn a read-ahead thread. Called from {#initialize}.
|
106
107
|
def start_read_ahead
|
108
|
+
LOG.debug { "Starting read-ahead thread." }
|
107
109
|
@read_thread = Thread.new { read_ahead }
|
108
110
|
end
|
109
111
|
|
@@ -169,6 +171,7 @@ module Bio
|
|
169
171
|
BLOCK_START = /^(?=a)/
|
170
172
|
BLOCK_START_OR_EOS = /(?:^(?=a))|\z/
|
171
173
|
EOL_OR_EOF = /\n|\z/
|
174
|
+
JRUBY_P = (RUBY_PLATFORM == 'java')
|
172
175
|
|
173
176
|
def set_last_block_pos!
|
174
177
|
@last_block_pos = s.string.rindex(BLOCK_START)
|
@@ -333,14 +336,22 @@ module Bio
|
|
333
336
|
elsif [I, E, Q, COMMENT, nil].include? first
|
334
337
|
next
|
335
338
|
else
|
336
|
-
|
339
|
+
if opts[:strict]
|
340
|
+
parse_error "unexpected line: '#{line}'"
|
341
|
+
else
|
342
|
+
LOG.warn "Ignoring invalid MAF line: '#{line}'"
|
343
|
+
end
|
337
344
|
end
|
338
345
|
end
|
339
|
-
Block.new(block_vars,
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
346
|
+
b = Block.new(block_vars,
|
347
|
+
seqs,
|
348
|
+
block_offset,
|
349
|
+
s.pos - block_start_pos,
|
350
|
+
filtered)
|
351
|
+
if opts[:retain_text]
|
352
|
+
b.orig_text = s.string.slice(block_start_pos...(s.pos))
|
353
|
+
end
|
354
|
+
return b
|
344
355
|
end
|
345
356
|
|
346
357
|
# Parse an 's' line.
|
@@ -504,12 +515,16 @@ module Bio
|
|
504
515
|
# * `:parse_extended`: whether to parse 'i' and 'q' lines
|
505
516
|
# * `:parse_empty`: whether to parse 'e' lines
|
506
517
|
# * `:remove_gaps`: remove gaps left after filtering sequences
|
518
|
+
# * `:join_blocks`: join blocks where possible
|
519
|
+
# * `:upcase`: fold sequence data to upper case
|
507
520
|
# * `:chunk_size`: read MAF file in chunks of this many bytes
|
508
521
|
# * `:random_chunk_size`: as above, but for random access ({#fetch_blocks})
|
509
522
|
# * `:merge_max`: merge up to this many bytes of blocks for
|
510
523
|
# random access
|
511
524
|
# * `:threads`: number of threads to use for parallel
|
512
525
|
# parsing. Only useful under JRuby.
|
526
|
+
# * `:strict`: abort on un-parseable lines instead of continuing with
|
527
|
+
# a warning.
|
513
528
|
# @api public
|
514
529
|
|
515
530
|
class Parser
|
@@ -519,8 +534,12 @@ module Bio
|
|
519
534
|
attr_reader :header
|
520
535
|
# @return [String] path of MAF file being parsed.
|
521
536
|
attr_reader :file_spec
|
522
|
-
# @return [
|
537
|
+
# @return [IO] file handle for MAF file.
|
523
538
|
attr_reader :f
|
539
|
+
# May be gzip-compressed.
|
540
|
+
# @return [IO] file handle for physical MAF file.
|
541
|
+
# @api private
|
542
|
+
attr_reader :phys_f
|
524
543
|
# @return [StringScanner] scanner for parsing.
|
525
544
|
attr_reader :s
|
526
545
|
# @return [ChunkReader] ChunkReader.
|
@@ -547,33 +566,47 @@ module Bio
|
|
547
566
|
RANDOM_CHUNK_SIZE = 4096
|
548
567
|
MERGE_MAX = SEQ_CHUNK_SIZE
|
549
568
|
|
569
|
+
DEFAULT_OPTS = {
|
570
|
+
:chunk_size => SEQ_CHUNK_SIZE,
|
571
|
+
:random_chunk_size => RANDOM_CHUNK_SIZE,
|
572
|
+
:merge_max => MERGE_MAX,
|
573
|
+
:parse_extended => false,
|
574
|
+
:parse_empty => false,
|
575
|
+
:readahead_thread => true,
|
576
|
+
:seq_parse_thread => true
|
577
|
+
}
|
578
|
+
if JRUBY_P
|
579
|
+
DEFAULT_OPTS[:threads] = java.lang.Runtime.runtime.availableProcessors
|
580
|
+
end
|
581
|
+
|
550
582
|
# Create a new parser instance.
|
551
583
|
#
|
552
584
|
# @param [String] file_spec path of file to parse.
|
553
|
-
# @param [Hash]
|
585
|
+
# @param [Hash] parse_opts parser options.
|
554
586
|
# @api public
|
555
|
-
def initialize(file_spec,
|
587
|
+
def initialize(file_spec, parse_opts={})
|
588
|
+
opts = DEFAULT_OPTS.merge(parse_opts)
|
556
589
|
@opts = opts
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
@random_access_chunk_size = opts[:random_chunk_size] || RANDOM_CHUNK_SIZE
|
562
|
-
@merge_max = opts[:merge_max] || MERGE_MAX
|
563
|
-
@parse_extended = opts[:parse_extended] || false
|
564
|
-
@parse_empty = opts[:parse_empty] || false
|
590
|
+
@random_access_chunk_size = opts[:random_chunk_size]
|
591
|
+
@merge_max = opts[:merge_max]
|
592
|
+
@parse_extended = opts[:parse_extended]
|
593
|
+
@parse_empty = opts[:parse_empty]
|
565
594
|
@chunk_start = 0
|
566
595
|
if file_spec.respond_to? :flush
|
596
|
+
# an IO object
|
567
597
|
# guess what, Pathnames respond to :read...
|
568
598
|
@f = file_spec
|
569
599
|
@file_spec = @f.path if @f.respond_to?(:path)
|
570
|
-
# TODO: gzip?
|
600
|
+
# TODO: test for gzip?
|
571
601
|
else
|
602
|
+
# a pathname (or Pathname)
|
572
603
|
@file_spec = file_spec
|
604
|
+
@phys_f = File.open(file_spec)
|
573
605
|
if file_spec.to_s.end_with?(".maf.gz")
|
574
|
-
@f =
|
606
|
+
@f = Zlib::GzipReader.new(@phys_f)
|
607
|
+
@compression = :gzip
|
575
608
|
else
|
576
|
-
@f =
|
609
|
+
@f = @phys_f
|
577
610
|
end
|
578
611
|
end
|
579
612
|
if @file_spec.to_s =~ /\.bgzf?$/
|
@@ -582,8 +615,9 @@ module Bio
|
|
582
615
|
else
|
583
616
|
@base_reader = ChunkReader
|
584
617
|
end
|
585
|
-
@cr = base_reader.new(@f, chunk_size)
|
586
|
-
if
|
618
|
+
@cr = base_reader.new(@f, opts[:chunk_size])
|
619
|
+
if JRUBY_P && opts[:readahead_thread]
|
620
|
+
LOG.debug "Using ThreadedChunkReaderWrapper."
|
587
621
|
@cr = ThreadedChunkReaderWrapper.new(@cr)
|
588
622
|
end
|
589
623
|
@s = StringScanner.new(cr.read_chunk())
|
@@ -649,7 +683,7 @@ module Bio
|
|
649
683
|
def fetch_blocks(fetch_list, &blk)
|
650
684
|
if blk
|
651
685
|
merged = merge_fetch_list(fetch_list)
|
652
|
-
if
|
686
|
+
if JRUBY_P && @opts.fetch(:threads, 1) > 1
|
653
687
|
fun = lambda { |&b2| fetch_blocks_merged_parallel(merged, &b2) }
|
654
688
|
else
|
655
689
|
fun = lambda { |&b2| fetch_blocks_merged(merged, &b2) }
|
@@ -667,15 +701,17 @@ module Bio
|
|
667
701
|
def fetch_blocks_merged(fetch_list, &blk)
|
668
702
|
start = Time.now
|
669
703
|
total_size = fetch_list.collect { |e| e[1] }.reduce(:+)
|
704
|
+
count = 0
|
670
705
|
with_context(@random_access_chunk_size) do |ctx|
|
671
706
|
fetch_list.each do |e|
|
672
707
|
ctx.fetch_blocks(*e, &blk)
|
708
|
+
count += 1
|
673
709
|
end
|
674
710
|
end
|
675
711
|
elapsed = Time.now - start
|
676
712
|
rate = (total_size / 1048576.0) / elapsed
|
677
|
-
LOG.debug { sprintf("Fetched blocks in %.3fs, %.1f MB/s.",
|
678
|
-
elapsed, rate) }
|
713
|
+
LOG.debug { sprintf("Fetched %d blocks in %.3fs, %.1f MB/s.",
|
714
|
+
count, elapsed, rate) }
|
679
715
|
end
|
680
716
|
|
681
717
|
# Fetch and parse the blocks given by the merged fetch list, in
|
@@ -807,7 +843,9 @@ module Bio
|
|
807
843
|
end
|
808
844
|
end
|
809
845
|
@header = Header.new(vars, align_params)
|
810
|
-
s.skip_until
|
846
|
+
if ! s.skip_until(BLOCK_START)
|
847
|
+
@at_end = true
|
848
|
+
end
|
811
849
|
end
|
812
850
|
|
813
851
|
# Parse all alignment blocks until EOF.
|
@@ -820,7 +858,7 @@ module Bio
|
|
820
858
|
# @api public
|
821
859
|
def each_block(&blk)
|
822
860
|
if block_given?
|
823
|
-
if
|
861
|
+
if JRUBY_P && opts[:seq_parse_thread]
|
824
862
|
fun = method(:parse_blocks_parallel)
|
825
863
|
else
|
826
864
|
fun = method(:each_block_seq)
|
@@ -847,11 +885,12 @@ module Bio
|
|
847
885
|
b
|
848
886
|
end
|
849
887
|
|
850
|
-
WRAP_OPTS = [:as_bio_alignment, :join_blocks, :remove_gaps]
|
888
|
+
WRAP_OPTS = [:as_bio_alignment, :join_blocks, :remove_gaps, :upcase]
|
851
889
|
|
852
890
|
def wrap_block_seq(fun, &blk)
|
853
891
|
opts = WRAP_OPTS.find_all { |o| @opts[o] }
|
854
892
|
opts << :sequence_filter if sequence_filter && (! sequence_filter.empty?)
|
893
|
+
LOG.debug { "wrapping #{fun} with #{opts.inspect}" }
|
855
894
|
_wrap(opts, fun, &blk)
|
856
895
|
end
|
857
896
|
|
@@ -873,6 +912,12 @@ module Bio
|
|
873
912
|
fun,
|
874
913
|
:to_bio_alignment,
|
875
914
|
&blk)
|
915
|
+
when :upcase
|
916
|
+
conv_send(options,
|
917
|
+
fun,
|
918
|
+
:upcase!,
|
919
|
+
true,
|
920
|
+
&blk)
|
876
921
|
when :remove_gaps
|
877
922
|
conv_map(options,
|
878
923
|
fun,
|
@@ -910,10 +955,14 @@ module Bio
|
|
910
955
|
end
|
911
956
|
end
|
912
957
|
|
913
|
-
def conv_send(options, search, sym)
|
958
|
+
def conv_send(options, search, sym, always_yield_block=false)
|
914
959
|
_wrap(options, search) do |block|
|
915
960
|
v = block.send(sym)
|
916
|
-
|
961
|
+
if always_yield_block
|
962
|
+
yield block
|
963
|
+
else
|
964
|
+
yield v if v
|
965
|
+
end
|
917
966
|
end
|
918
967
|
end
|
919
968
|
|
@@ -925,14 +974,17 @@ module Bio
|
|
925
974
|
queue = java.util.concurrent.LinkedBlockingQueue.new(128)
|
926
975
|
worker = Thread.new do
|
927
976
|
begin
|
977
|
+
LOG.debug "Starting parse worker."
|
928
978
|
until at_end
|
929
979
|
block = _parse_block()
|
930
980
|
queue.put(block) if block
|
931
981
|
end
|
932
982
|
queue.put(:eof)
|
933
|
-
|
934
|
-
|
983
|
+
LOG.debug { "Parse worker reached EOF." }
|
984
|
+
rescue Exception
|
935
985
|
LOG.error $!
|
986
|
+
Thread.current[:exception] = $!
|
987
|
+
raise
|
936
988
|
end
|
937
989
|
end
|
938
990
|
saw_eof = false
|
@@ -946,12 +998,15 @@ module Bio
|
|
946
998
|
yield block
|
947
999
|
else
|
948
1000
|
# timed out
|
949
|
-
|
1001
|
+
unless worker.alive?
|
1002
|
+
LOG.debug "Worker has exited."
|
1003
|
+
n_final_poll += 1
|
1004
|
+
end
|
950
1005
|
end
|
951
1006
|
break if n_final_poll > 1
|
952
1007
|
end
|
953
1008
|
unless saw_eof
|
954
|
-
raise "worker exited unexpectedly!"
|
1009
|
+
raise "worker exited unexpectedly from #{worker[:exception]}!"
|
955
1010
|
end
|
956
1011
|
end
|
957
1012
|
|
@@ -1000,7 +1055,7 @@ module Bio
|
|
1000
1055
|
|
1001
1056
|
def handle_logging_options(opts)
|
1002
1057
|
opts.on("--logger filename", String,
|
1003
|
-
"Log to file (default
|
1058
|
+
"Log to file (default STDERR)") do |name|
|
1004
1059
|
Bio::Log::CLI.logger(name)
|
1005
1060
|
end
|
1006
1061
|
opts.on("--trace options", String,
|