bio-maf 1.0.0-java → 1.0.1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/maf_bgzip +140 -12
- data/bin/maf_extract +50 -40
- data/bin/maf_index +11 -2
- data/bin/maf_tile +143 -46
- data/bio-maf.gemspec +3 -3
- data/features/bgzf.feature +45 -0
- data/features/maf-indexing.feature +6 -0
- data/features/maf-parsing.feature +17 -0
- data/features/maf-querying.feature +11 -0
- data/features/slice.feature +11 -0
- data/features/step_definitions/parse_steps.rb +1 -0
- data/features/tiling.feature +23 -5
- data/lib/bio-maf.rb +5 -1
- data/lib/bio/maf.rb +1 -0
- data/lib/bio/maf/index.rb +158 -68
- data/lib/bio/maf/jobs.rb +168 -0
- data/lib/bio/maf/maf.rb +24 -1
- data/lib/bio/maf/parser.rb +90 -35
- data/lib/bio/maf/struct.rb +4 -0
- data/lib/bio/maf/tiler.rb +30 -3
- data/lib/bio/ucsc/ucsc_bin.rb +14 -1
- data/man/maf_bgzip.1 +27 -0
- data/man/maf_bgzip.1.ronn +32 -0
- data/spec/bio/maf/index_spec.rb +3 -1
- data/spec/bio/maf/parser_spec.rb +6 -2
- data/spec/bio/ucsc/ucsc_bin_spec.rb +18 -0
- data/test/data/empty.maf +2 -0
- data/test/data/ext-bin.maf +22 -0
- data/test/data/gap-1.kct +0 -0
- data/test/data/mm8_chr7_tiny.kct +0 -0
- data/test/data/mm8_chrM_tiny.kct +0 -0
- metadata +380 -184
data/lib/bio/maf/jobs.rb
ADDED
@@ -0,0 +1,168 @@
+require 'set'
+require 'java' if RUBY_PLATFORM == 'java'
+
+module Bio::MAF
+
+  module JobRunner
+    def JobRunner.create(n_parallel)
+      if RUBY_PLATFORM == 'java'
+        JThreadRunner.new(n_parallel)
+      else
+        ForkRunner.new(n_parallel)
+      end
+    end
+  end
+
+  class ForkRunner
+
+    def initialize(n_parallel)
+      @n_parallel = n_parallel
+      @jobs = []
+      @kids = Set.new
+    end
+
+    def add(&proc)
+      @jobs << proc
+    end
+
+    def run
+      until @jobs.empty? && @kids.empty?
+        while can_start?
+          start_job
+        end
+        await
+      end
+    end
+
+    private
+
+    def can_start?
+      (! @jobs.empty?) && @kids.size < @n_parallel
+    end
+
+    def start_job
+      job = @jobs.shift
+      pid = fork()
+      if pid
+        # parent
+        @kids << pid
+      else
+        # child
+        begin
+          job.call()
+          exit 0
+        rescue SystemExit
+          raise
+        rescue Exception
+          LOG.error $!
+          exit 1
+        end
+      end
+    end
+
+    def await
+      pid = Process.wait
+      unless @kids.delete?(pid)
+        raise "Completion of unexpected job #{pid}!"
+      end
+      if ! $?.success?
+        raise "Job #{pid} failed with status #{status.exitstatus}!"
+      end
+    end
+
+  end
+
+  class JThreadRunner
+
+    def initialize(n_parallel)
+      @n_parallel = n_parallel
+      @exec = java.util.concurrent.Executors.newFixedThreadPool(n_parallel)
+      @ecs = java.util.concurrent.ExecutorCompletionService.new(@exec)
+      @n = 0
+    end
+
+    def add(&blk)
+      @ecs.submit(&blk)
+      @n += 1
+    end
+
+    def run
+      seen = 0
+      until seen == @n
+        f = @ecs.take()
+        begin
+          f.get()
+        rescue Exception => e
+          LOG.error e
+          @exec.shutdownNow()
+          raise
+        end
+        seen += 1
+      end
+      @exec.shutdown()
+    end
+
+  end
+
+  module Executor
+    def Executor.create
+      if RUBY_PLATFORM == 'java'
+        JExecutor.new
+      else
+        DummyExecutor.new
+      end
+    end
+  end
+
+  class JExecutor
+
+    def initialize
+      queue = java.util.concurrent.LinkedBlockingQueue.new(8)
+      policy = java.util.concurrent.ThreadPoolExecutor::CallerRunsPolicy.new
+      @exec = java.util.concurrent.ThreadPoolExecutor.new(1, 1, 1,
+                                                          java.util.concurrent.TimeUnit::MINUTES,
+                                                          queue,
+                                                          policy)
+      @ecs = java.util.concurrent.ExecutorCompletionService.new(@exec)
+      @submitted = 0
+      @completed = 0
+    end
+
+    def submit(&blk)
+      @ecs.submit(&blk)
+      @submitted += 1
+      check_for_errors
+    end
+
+    def check_for_errors
+      while f = @ecs.poll
+        f.get
+        @completed += 1
+      end
+    end
+
+    def shutdown
+      @exec.shutdown
+      until @completed == @submitted
+        f = @ecs.take
+        f.get
+        @completed += 1
+      end
+    end
+  end
+
+  class DummyExecutor
+
+    def initialize
+    end
+
+    def submit
+      yield
+    end
+
+    def shutdown
+    end
+
+  end
+
+end
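
For orientation, a minimal usage sketch of the new JobRunner API introduced above (not part of the diff; the require path and job bodies are illustrative assumptions):

  require 'bio-maf'   # assumes the gem's entry point loads bio/maf/jobs

  # Picks a fork-based runner on MRI or a thread-pool runner on JRuby.
  runner = Bio::MAF::JobRunner.create(4)
  %w[task-a task-b task-c].each do |name|
    runner.add do
      # Each job runs in a child process (ForkRunner) or pool thread (JThreadRunner).
      $stderr.puts "running #{name}"
    end
  end
  runner.run   # blocks until all jobs finish; raises if any job fails
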
data/lib/bio/maf/maf.rb
CHANGED
@@ -58,10 +58,14 @@ module Bio
      attr_reader :sequences
      # Offset of the alignment block within the MAF file, in bytes.
      # @return [Integer]
-
+     attr_accessor :offset
      # Size of the alignment block within the MAF file, in bytes.
      # @return [Integer]
      attr_reader :size
+     # Original text of the MAF block. Only available if the
+     # :retain_text parser option is set.
+     # @return [String]
+     attr_accessor :orig_text

      def initialize(vars, sequences, offset, size, filtered)
        @vars = vars

@@ -90,6 +94,10 @@ module Bio
        sequences.first.text.size
      end

+     def upcase!
+       sequences.each { |s| s.upcase! }
+     end
+
      # Whether this block has been modified by a parser filter.
      # @return [Boolean]
      def filtered?

@@ -101,6 +109,13 @@ module Bio
        Bio::BioAlignment::Alignment.new(ba_seq)
      end

+     def to_s
+       buf = StringIO.new
+       writer = Writer.new(buf)
+       writer.write_block(self)
+       return buf.string
+     end
+
      GAP = /-+/

      # Find gaps present in all sequences. These would generally

@@ -356,6 +371,10 @@ module Bio
        end
      end

+     def upcase!
+       text.upcase!
+     end
+
      def to_bio_alignment
        Bio::BioAlignment::Sequence.new(source, text)
      end

@@ -471,6 +490,10 @@ module Bio
        true
      end

+     def upcase!
+       # no-op
+     end
+
      def write_fasta(writer)
        raise "empty sequence output not implemented!"
      end
data/lib/bio/maf/parser.rb
CHANGED
@@ -1,4 +1,5 @@
 require 'strscan'
+require 'zlib'
 require 'java' if RUBY_PLATFORM == 'java'
 require 'bio-bgzf'


@@ -104,6 +105,7 @@ module Bio

      # Spawn a read-ahead thread. Called from {#initialize}.
      def start_read_ahead
+       LOG.debug { "Starting read-ahead thread." }
        @read_thread = Thread.new { read_ahead }
      end


@@ -169,6 +171,7 @@ module Bio
      BLOCK_START = /^(?=a)/
      BLOCK_START_OR_EOS = /(?:^(?=a))|\z/
      EOL_OR_EOF = /\n|\z/
+     JRUBY_P = (RUBY_PLATFORM == 'java')

      def set_last_block_pos!
        @last_block_pos = s.string.rindex(BLOCK_START)

@@ -333,14 +336,22 @@ module Bio
          elsif [I, E, Q, COMMENT, nil].include? first
            next
          else
-
+           if opts[:strict]
+             parse_error "unexpected line: '#{line}'"
+           else
+             LOG.warn "Ignoring invalid MAF line: '#{line}'"
+           end
          end
        end
-       Block.new(block_vars,
-
-
-
-
+       b = Block.new(block_vars,
+                     seqs,
+                     block_offset,
+                     s.pos - block_start_pos,
+                     filtered)
+       if opts[:retain_text]
+         b.orig_text = s.string.slice(block_start_pos...(s.pos))
+       end
+       return b
      end

      # Parse an 's' line.

@@ -504,12 +515,16 @@ module Bio
      # * `:parse_extended`: whether to parse 'i' and 'q' lines
      # * `:parse_empty`: whether to parse 'e' lines
      # * `:remove_gaps`: remove gaps left after filtering sequences
+     # * `:join_blocks`: join blocks where possible
+     # * `:upcase`: fold sequence data to upper case
      # * `:chunk_size`: read MAF file in chunks of this many bytes
      # * `:random_chunk_size`: as above, but for random access ({#fetch_blocks})
      # * `:merge_max`: merge up to this many bytes of blocks for
      #   random access
      # * `:threads`: number of threads to use for parallel
      #   parsing. Only useful under JRuby.
+     # * `:strict`: abort on un-parseable lines instead of continuing with
+     #   a warning.
      # @api public

      class Parser

@@ -519,8 +534,12 @@ module Bio
        attr_reader :header
        # @return [String] path of MAF file being parsed.
        attr_reader :file_spec
-       # @return [
+       # @return [IO] file handle for MAF file.
        attr_reader :f
+       # May be gzip-compressed.
+       # @return [IO] file handle for physical MAF file.
+       # @api private
+       attr_reader :phys_f
        # @return [StringScanner] scanner for parsing.
        attr_reader :s
        # @return [ChunkReader] ChunkReader.

@@ -547,33 +566,47 @@ module Bio
        RANDOM_CHUNK_SIZE = 4096
        MERGE_MAX = SEQ_CHUNK_SIZE

+       DEFAULT_OPTS = {
+         :chunk_size => SEQ_CHUNK_SIZE,
+         :random_chunk_size => RANDOM_CHUNK_SIZE,
+         :merge_max => MERGE_MAX,
+         :parse_extended => false,
+         :parse_empty => false,
+         :readahead_thread => true,
+         :seq_parse_thread => true
+       }
+       if JRUBY_P
+         DEFAULT_OPTS[:threads] = java.lang.Runtime.runtime.availableProcessors
+       end
+
        # Create a new parser instance.
        #
        # @param [String] file_spec path of file to parse.
-       # @param [Hash]
+       # @param [Hash] parse_opts parser options.
        # @api public
-       def initialize(file_spec,
+       def initialize(file_spec, parse_opts={})
+         opts = DEFAULT_OPTS.merge(parse_opts)
          @opts = opts
-
-
-
-
-         @random_access_chunk_size = opts[:random_chunk_size] || RANDOM_CHUNK_SIZE
-         @merge_max = opts[:merge_max] || MERGE_MAX
-         @parse_extended = opts[:parse_extended] || false
-         @parse_empty = opts[:parse_empty] || false
+         @random_access_chunk_size = opts[:random_chunk_size]
+         @merge_max = opts[:merge_max]
+         @parse_extended = opts[:parse_extended]
+         @parse_empty = opts[:parse_empty]
          @chunk_start = 0
          if file_spec.respond_to? :flush
+           # an IO object
            # guess what, Pathnames respond to :read...
            @f = file_spec
            @file_spec = @f.path if @f.respond_to?(:path)
-           # TODO: gzip?
+           # TODO: test for gzip?
          else
+           # a pathname (or Pathname)
            @file_spec = file_spec
+           @phys_f = File.open(file_spec)
            if file_spec.to_s.end_with?(".maf.gz")
-             @f =
+             @f = Zlib::GzipReader.new(@phys_f)
+             @compression = :gzip
            else
-             @f =
+             @f = @phys_f
            end
          end
          if @file_spec.to_s =~ /\.bgzf?$/

@@ -582,8 +615,9 @@ module Bio
          else
            @base_reader = ChunkReader
          end
-         @cr = base_reader.new(@f, chunk_size)
-         if
+         @cr = base_reader.new(@f, opts[:chunk_size])
+         if JRUBY_P && opts[:readahead_thread]
+           LOG.debug "Using ThreadedChunkReaderWrapper."
            @cr = ThreadedChunkReaderWrapper.new(@cr)
          end
          @s = StringScanner.new(cr.read_chunk())

@@ -649,7 +683,7 @@ module Bio
        def fetch_blocks(fetch_list, &blk)
          if blk
            merged = merge_fetch_list(fetch_list)
-           if
+           if JRUBY_P && @opts.fetch(:threads, 1) > 1
              fun = lambda { |&b2| fetch_blocks_merged_parallel(merged, &b2) }
            else
              fun = lambda { |&b2| fetch_blocks_merged(merged, &b2) }

@@ -667,15 +701,17 @@ module Bio
        def fetch_blocks_merged(fetch_list, &blk)
          start = Time.now
          total_size = fetch_list.collect { |e| e[1] }.reduce(:+)
+         count = 0
          with_context(@random_access_chunk_size) do |ctx|
            fetch_list.each do |e|
              ctx.fetch_blocks(*e, &blk)
+             count += 1
            end
          end
          elapsed = Time.now - start
          rate = (total_size / 1048576.0) / elapsed
-         LOG.debug { sprintf("Fetched blocks in %.3fs, %.1f MB/s.",
-                             elapsed, rate) }
+         LOG.debug { sprintf("Fetched %d blocks in %.3fs, %.1f MB/s.",
+                             count, elapsed, rate) }
        end

        # Fetch and parse the blocks given by the merged fetch list, in

@@ -807,7 +843,9 @@ module Bio
            end
          end
          @header = Header.new(vars, align_params)
-         s.skip_until
+         if ! s.skip_until(BLOCK_START)
+           @at_end = true
+         end
        end

        # Parse all alignment blocks until EOF.

@@ -820,7 +858,7 @@ module Bio
        # @api public
        def each_block(&blk)
          if block_given?
-           if
+           if JRUBY_P && opts[:seq_parse_thread]
              fun = method(:parse_blocks_parallel)
            else
              fun = method(:each_block_seq)

@@ -847,11 +885,12 @@ module Bio
          b
        end

-       WRAP_OPTS = [:as_bio_alignment, :join_blocks, :remove_gaps]
+       WRAP_OPTS = [:as_bio_alignment, :join_blocks, :remove_gaps, :upcase]

        def wrap_block_seq(fun, &blk)
          opts = WRAP_OPTS.find_all { |o| @opts[o] }
          opts << :sequence_filter if sequence_filter && (! sequence_filter.empty?)
+         LOG.debug { "wrapping #{fun} with #{opts.inspect}" }
          _wrap(opts, fun, &blk)
        end


@@ -873,6 +912,12 @@ module Bio
                     fun,
                     :to_bio_alignment,
                     &blk)
+         when :upcase
+           conv_send(options,
+                     fun,
+                     :upcase!,
+                     true,
+                     &blk)
          when :remove_gaps
            conv_map(options,
                     fun,

@@ -910,10 +955,14 @@ module Bio
          end
        end

-       def conv_send(options, search, sym)
+       def conv_send(options, search, sym, always_yield_block=false)
          _wrap(options, search) do |block|
            v = block.send(sym)
-
+           if always_yield_block
+             yield block
+           else
+             yield v if v
+           end
          end
        end


@@ -925,14 +974,17 @@ module Bio
          queue = java.util.concurrent.LinkedBlockingQueue.new(128)
          worker = Thread.new do
            begin
+             LOG.debug "Starting parse worker."
              until at_end
                block = _parse_block()
                queue.put(block) if block
              end
              queue.put(:eof)
-
-
+             LOG.debug { "Parse worker reached EOF." }
+           rescue Exception
              LOG.error $!
+             Thread.current[:exception] = $!
+             raise
            end
          end
          saw_eof = false

@@ -946,12 +998,15 @@ module Bio
              yield block
            else
              # timed out
-
+             unless worker.alive?
+               LOG.debug "Worker has exited."
+               n_final_poll += 1
+             end
            end
            break if n_final_poll > 1
          end
          unless saw_eof
-           raise "worker exited unexpectedly!"
+           raise "worker exited unexpectedly from #{worker[:exception]}!"
          end
        end


@@ -1000,7 +1055,7 @@ module Bio

        def handle_logging_options(opts)
          opts.on("--logger filename", String,
-                 "Log to file (default
+                 "Log to file (default STDERR)") do |name|
            Bio::Log::CLI.logger(name)
          end
          opts.on("--trace options", String,
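
Taken together, the parser changes above add gzip input handling, a DEFAULT_OPTS hash, and several new parser options. A hedged sketch of how they might be used (not part of the diff; the MAF path is a placeholder, option behavior is as documented in the hunks above):

  require 'bio-maf'

  parser = Bio::MAF::Parser.new('alignments/example.maf.gz',  # .maf.gz is now read via Zlib::GzipReader
                                :upcase => true,       # fold sequence text to upper case
                                :retain_text => true,  # keep each block's original MAF text in Block#orig_text
                                :strict => false,      # warn on un-parseable lines instead of aborting
                                :threads => 4)         # parallel parsing; only useful under JRuby
  parser.each_block do |block|
    # block.orig_text is available here because :retain_text was set
  end
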
|