bio-maf 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +53 -0
- data/DEVELOPMENT.md +29 -0
- data/Gemfile +1 -0
- data/README.md +69 -1
- data/Rakefile +4 -3
- data/bin/find_overlaps +21 -0
- data/bin/maf_tile +103 -0
- data/bio-maf.gemspec +43 -0
- data/features/gap-filling.feature +158 -0
- data/features/gap-removal.feature +50 -0
- data/features/step_definitions/gap-filling_steps.rb +32 -0
- data/features/step_definitions/gap_removal_steps.rb +19 -0
- data/features/step_definitions/parse_steps.rb +2 -1
- data/lib/bio/maf/index.rb +15 -8
- data/lib/bio/maf/maf.rb +267 -0
- data/lib/bio/maf/parser.rb +115 -175
- data/lib/bio/maf/tiler.rb +167 -0
- data/lib/bio/maf.rb +2 -0
- data/man/maf_tile.1 +108 -0
- data/man/maf_tile.1.ronn +104 -0
- data/spec/bio/maf/index_spec.rb +1 -0
- data/spec/bio/maf/parser_spec.rb +103 -0
- data/spec/bio/maf/tiler_spec.rb +69 -0
- data/test/data/gap-sp1.fa +6 -0
- data/test/data/mm8_chr7_tiny.kct +0 -0
- metadata +58 -3
data/lib/bio/maf/parser.rb
CHANGED
@@ -9,142 +9,6 @@ module Bio
|
|
9
9
|
# @api public
|
10
10
|
class ParseError < Exception; end
|
11
11
|
|
12
|
-
# A MAF header, containing the variable-value pairs from the first
|
13
|
-
# line of the file as well as the alignment parameters.
|
14
|
-
# @api public
|
15
|
-
class Header
|
16
|
-
# Variable-value pairs from the ##maf line
|
17
|
-
# @return [Hash]
|
18
|
-
attr_accessor :vars
|
19
|
-
# Alignment parameters from the MAF header.
|
20
|
-
# @return [Hash]
|
21
|
-
attr_accessor :alignment_params
|
22
|
-
|
23
|
-
def initialize(vars, params)
|
24
|
-
@vars = vars
|
25
|
-
@alignment_params = params
|
26
|
-
end
|
27
|
-
|
28
|
-
# The required version parameter.
|
29
|
-
# @return [String]
|
30
|
-
def version
|
31
|
-
vars[:version]
|
32
|
-
end
|
33
|
-
|
34
|
-
# The optional scoring parameter, if present.
|
35
|
-
# @return [String]
|
36
|
-
def scoring
|
37
|
-
vars[:scoring]
|
38
|
-
end
|
39
|
-
|
40
|
-
end
|
41
|
-
|
42
|
-
# A MAF alignment block.
|
43
|
-
# @api public
|
44
|
-
class Block
|
45
|
-
# Parameters from the 'a' line starting the alignment block.
|
46
|
-
attr_reader :vars
|
47
|
-
# Sequences, one per 's' or 'e' line.
|
48
|
-
# @return [Array<Sequence>]
|
49
|
-
attr_reader :sequences
|
50
|
-
# Offset of the alignment block within the MAF file, in bytes.
|
51
|
-
# @return [Integer]
|
52
|
-
attr_reader :offset
|
53
|
-
# Size of the alignment block within the MAF file, in bytes.
|
54
|
-
# @return [Integer]
|
55
|
-
attr_reader :size
|
56
|
-
|
57
|
-
def initialize(*args)
|
58
|
-
@vars, @sequences, @offset, @size = args
|
59
|
-
end
|
60
|
-
|
61
|
-
def raw_seq(i)
|
62
|
-
sequences.fetch(i)
|
63
|
-
end
|
64
|
-
|
65
|
-
def each_raw_seq
|
66
|
-
sequences.each { |s| yield s }
|
67
|
-
end
|
68
|
-
|
69
|
-
# Text size of the alignment block. This is the number of text
|
70
|
-
# characters in each line of sequence data, including dashes and
|
71
|
-
# other gaps in the sequence.
|
72
|
-
def text_size
|
73
|
-
sequences.first.text.size
|
74
|
-
end
|
75
|
-
|
76
|
-
end
|
77
|
-
|
78
|
-
# A sequence within an alignment block.
|
79
|
-
# @api public
|
80
|
-
class Sequence
|
81
|
-
# @return [String] Source sequence name.
|
82
|
-
attr_reader :source
|
83
|
-
# @return [Integer] Zero-based start position.
|
84
|
-
attr_reader :start
|
85
|
-
# @return [Integer] Size of aligning region in source sequence.
|
86
|
-
attr_reader :size
|
87
|
-
# :+ or :-, indicating which strand the alignment is to.
|
88
|
-
# @return [Symbol]
|
89
|
-
attr_reader :strand
|
90
|
-
# Size of the entire source sequence, not just the aligning
|
91
|
-
# region.
|
92
|
-
# @return [Integer]
|
93
|
-
attr_reader :src_size
|
94
|
-
# Sequence data for the alignment, including insertions.
|
95
|
-
# @return [String]
|
96
|
-
attr_reader :text
|
97
|
-
# Array of raw synteny information from 'i' line.
|
98
|
-
# @return [Array<String>]
|
99
|
-
attr_accessor :i_data
|
100
|
-
# Quality string from 'q' line.
|
101
|
-
# @return [String]
|
102
|
-
attr_accessor :quality
|
103
|
-
alias_method :source_size, :src_size
|
104
|
-
|
105
|
-
def initialize(*args)
|
106
|
-
@source, @start, @size, @strand, @src_size, @text = args
|
107
|
-
end
|
108
|
-
|
109
|
-
# Whether this sequence is empty. Only true for {EmptySequence}
|
110
|
-
# instances from 'e' lines.
|
111
|
-
def empty?
|
112
|
-
false
|
113
|
-
end
|
114
|
-
|
115
|
-
def write_fasta(writer)
|
116
|
-
writer.write("#{source}:#{start}-#{start + size}",
|
117
|
-
text)
|
118
|
-
end
|
119
|
-
end
|
120
|
-
|
121
|
-
# An empty sequence record from an 'e' line.
|
122
|
-
#
|
123
|
-
# This indicates that "there isn't aligning DNA for a species but
|
124
|
-
# that the current block is bridged by a chain that connects
|
125
|
-
# blocks before and after this block" (MAF spec).
|
126
|
-
# @api public
|
127
|
-
class EmptySequence < Sequence
|
128
|
-
attr_reader :status
|
129
|
-
|
130
|
-
def initialize(*args)
|
131
|
-
super(*args[0..4])
|
132
|
-
@status = args[5]
|
133
|
-
end
|
134
|
-
|
135
|
-
def text
|
136
|
-
''
|
137
|
-
end
|
138
|
-
|
139
|
-
def empty?
|
140
|
-
true
|
141
|
-
end
|
142
|
-
|
143
|
-
def write_fasta(writer)
|
144
|
-
raise "empty sequence output not implemented!"
|
145
|
-
end
|
146
|
-
end
|
147
|
-
|
148
12
|
# Reads MAF files in chunks.
|
149
13
|
# @api private
|
150
14
|
class ChunkReader
|
@@ -399,16 +263,25 @@ module Bio
|
|
399
263
|
payload = s.rest
|
400
264
|
s.pos = s.string.size # jump to EOS
|
401
265
|
end
|
266
|
+
filtered = false
|
402
267
|
lines = payload.split("\n")
|
403
268
|
until lines.empty?
|
404
269
|
line = lines.shift
|
405
270
|
first = line.getbyte(0)
|
406
271
|
if first == S
|
407
272
|
seq = parse_seq_line(line, sequence_filter)
|
408
|
-
|
273
|
+
if seq
|
274
|
+
seqs << seq
|
275
|
+
else
|
276
|
+
filtered = true
|
277
|
+
end
|
409
278
|
elsif first == E && parse_empty
|
410
279
|
e_seq = parse_empty_line(line, sequence_filter)
|
411
|
-
|
280
|
+
if e_seq
|
281
|
+
seqs << e_seq
|
282
|
+
else
|
283
|
+
filtered = true
|
284
|
+
end
|
412
285
|
elsif first == I && parse_extended
|
413
286
|
parts = line.split
|
414
287
|
parse_error("wrong i source #{parts[1]}!") unless seqs.last.source == parts[1]
|
@@ -423,10 +296,19 @@ module Bio
|
|
423
296
|
parse_error "unexpected line: '#{line}'"
|
424
297
|
end
|
425
298
|
end
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
299
|
+
block = Block.new(block_vars,
|
300
|
+
seqs,
|
301
|
+
block_offset,
|
302
|
+
s.pos - block_start_pos,
|
303
|
+
filtered)
|
304
|
+
postprocess_block(block)
|
305
|
+
end
|
306
|
+
|
307
|
+
# Final step applied to every parsed block before it is returned.
#
# When the block had sequences filtered out and the `:remove_gaps`
# option is set, `remove_gaps!` is invoked on the block. The (possibly
# modified) block is always returned.
#
# @param block the block that was just parsed
# @return the same block object
def postprocess_block(block)
  block.tap do |parsed|
    parsed.remove_gaps! if parsed.filtered? && opts[:remove_gaps]
  end
end
|
431
313
|
|
432
314
|
# Parse an 's' line.
|
@@ -503,12 +385,13 @@ module Bio
|
|
503
385
|
# A MAF parsing context, used for random-access parsing.
|
504
386
|
class ParseContext
|
505
387
|
include MAFParsing
|
506
|
-
attr_accessor :f, :s, :cr, :parser
|
388
|
+
attr_accessor :f, :s, :cr, :parser, :opts
|
507
389
|
attr_accessor :chunk_start, :last_block_pos, :at_end
|
508
390
|
|
509
|
-
def initialize(fd, chunk_size, parser
|
391
|
+
def initialize(fd, chunk_size, parser)
|
510
392
|
@f = fd
|
511
393
|
@parser = parser
|
394
|
+
@opts = parser.opts
|
512
395
|
reader = opts[:chunk_reader] || ChunkReader
|
513
396
|
@cr = reader.new(@f, chunk_size)
|
514
397
|
@last_block_pos = -1
|
@@ -580,6 +463,7 @@ module Bio
|
|
580
463
|
#
|
581
464
|
# * `:parse_extended`: whether to parse 'i' and 'q' lines
|
582
465
|
# * `:parse_empty`: whether to parse 'e' lines
|
466
|
+
# * `:remove_gaps`: remove gaps left after filtering sequences
|
583
467
|
# * `:chunk_size`: read MAF file in chunks of this many bytes
|
584
468
|
# * `:random_chunk_size`: as above, but for random access ({#fetch_blocks})
|
585
469
|
# * `:merge_max`: merge up to this many bytes of blocks for
|
@@ -611,9 +495,6 @@ module Bio
|
|
611
495
|
attr_reader :chunk_start
|
612
496
|
# @return [Integer] offset of the last block start in this chunk.
|
613
497
|
attr_reader :last_block_pos
|
614
|
-
# Sequence filter to apply.
|
615
|
-
# @api public
|
616
|
-
attr_accessor :sequence_filter
|
617
498
|
|
618
499
|
# @api private
|
619
500
|
attr_accessor :parse_extended
|
@@ -630,6 +511,9 @@ module Bio
|
|
630
511
|
# @api public
|
631
512
|
def initialize(file_spec, opts={})
|
632
513
|
@opts = opts
|
514
|
+
if RUBY_PLATFORM == 'java'
|
515
|
+
opts[:threads] ||= java.lang.Runtime.runtime.availableProcessors
|
516
|
+
end
|
633
517
|
chunk_size = opts[:chunk_size] || SEQ_CHUNK_SIZE
|
634
518
|
@random_access_chunk_size = opts[:random_chunk_size] || RANDOM_CHUNK_SIZE
|
635
519
|
@merge_max = opts[:merge_max] || MERGE_MAX
|
@@ -654,7 +538,7 @@ module Bio
|
|
654
538
|
def context(chunk_size)
|
655
539
|
# IO#dup calls dup(2) internally, but seems broken on JRuby...
|
656
540
|
fd = File.open(file_spec)
|
657
|
-
ParseContext.new(fd, chunk_size, self
|
541
|
+
ParseContext.new(fd, chunk_size, self)
|
658
542
|
end
|
659
543
|
|
660
544
|
# Execute the given block with a {ParseContext} using the given
|
@@ -671,6 +555,20 @@ module Bio
|
|
671
555
|
end
|
672
556
|
end
|
673
557
|
|
558
|
+
# The sequence filter currently in effect. An empty Hash is created
# and stored the first time this is read without a filter being set.
# @api public
# @return [Hash]
def sequence_filter
  return @sequence_filter if @sequence_filter
  @sequence_filter = {}
end

# Replace the sequence filter.
# @api public
# @param [Hash] filter the new filter
def sequence_filter=(filter)
  @sequence_filter = filter
end
|
571
|
+
|
674
572
|
# Fetch and parse blocks given by `fetch_list`.
|
675
573
|
#
|
676
574
|
# `fetch_list` should be an array of `[offset, length]` tuples.
|
@@ -723,25 +621,22 @@ module Bio
|
|
723
621
|
# TODO: break entries up into longer runs for more
|
724
622
|
# sequential I/O
|
725
623
|
jobs = java.util.concurrent.ConcurrentLinkedQueue.new(fetch_list)
|
726
|
-
|
624
|
+
ct = CompletionTracker.new(fetch_list)
|
625
|
+
completed = ct.queue
|
727
626
|
threads = []
|
728
|
-
n_threads.times { threads << make_worker(jobs,
|
729
|
-
|
730
|
-
|
731
|
-
while
|
732
|
-
c = completed.poll(
|
733
|
-
|
734
|
-
if threads.find { |t| t.
|
735
|
-
|
736
|
-
else
|
737
|
-
raise "No threads alive, completed #{n_completed}/#{fetch_list.size} jobs!"
|
738
|
-
end
|
627
|
+
n_threads.times { threads << make_worker(jobs, ct) }
|
628
|
+
|
629
|
+
n_res = 0
|
630
|
+
while n_res < fetch_list.size
|
631
|
+
c = completed.poll(1, java.util.concurrent.TimeUnit::SECONDS)
|
632
|
+
unless c
|
633
|
+
raise "Worker failed!" if threads.find { |t| t.status.nil? }
|
634
|
+
next
|
739
635
|
end
|
740
|
-
raise "worker failed: #{c}" if c.is_a? Exception
|
741
636
|
c.each do |block|
|
742
637
|
y << block
|
743
638
|
end
|
744
|
-
|
639
|
+
n_res += 1
|
745
640
|
end
|
746
641
|
threads.each { |t| t.join }
|
747
642
|
elapsed = Time.now - start
|
@@ -758,26 +653,25 @@ module Bio
|
|
758
653
|
# Create a worker thread for parallel parsing.
|
759
654
|
#
|
760
655
|
# @see #fetch_blocks_merged_parallel
|
761
|
-
def make_worker(jobs,
|
656
|
+
def make_worker(jobs, ct)
|
762
657
|
Thread.new do
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
|
767
|
-
|
658
|
+
begin
|
659
|
+
with_context(@random_access_chunk_size) do |ctx|
|
660
|
+
while true
|
661
|
+
req = jobs.poll
|
662
|
+
break unless req
|
768
663
|
n_blocks = req[2].size
|
769
664
|
blocks = ctx.fetch_blocks(*req).to_a
|
770
665
|
if blocks.size != n_blocks
|
771
666
|
raise "expected #{n_blocks}, got #{blocks.size}: #{e.inspect}"
|
772
667
|
end
|
773
|
-
|
774
|
-
rescue Exception => e
|
775
|
-
completed.put(e)
|
776
|
-
$stderr.puts "Worker failing: #{e.class}: #{e}"
|
777
|
-
$stderr.puts e.backtrace.join("\n")
|
778
|
-
raise e
|
668
|
+
ct << blocks
|
779
669
|
end
|
780
670
|
end
|
671
|
+
rescue Exception => e
|
672
|
+
$stderr.puts "Worker failing: #{e.class}: #{e}"
|
673
|
+
$stderr.puts e.backtrace.join("\n")
|
674
|
+
raise e
|
781
675
|
end
|
782
676
|
end
|
783
677
|
end
|
@@ -860,14 +754,19 @@ module Bio
|
|
860
754
|
end
|
861
755
|
Enumerator.new do |y|
|
862
756
|
saw_eof = false
|
863
|
-
|
757
|
+
n_final_poll = 0
|
758
|
+
while true
|
864
759
|
block = queue.poll(1, java.util.concurrent.TimeUnit::SECONDS)
|
865
760
|
if block == :eof
|
866
761
|
saw_eof = true
|
867
762
|
break
|
868
763
|
elsif block
|
869
764
|
y << block
|
765
|
+
else
|
766
|
+
# timed out
|
767
|
+
n_final_poll += 1 unless worker.alive?
|
870
768
|
end
|
769
|
+
break if n_final_poll > 1
|
871
770
|
end
|
872
771
|
unless saw_eof
|
873
772
|
raise "worker exited unexpectedly!"
|
@@ -883,6 +782,47 @@ module Bio
|
|
883
782
|
|
884
783
|
end
|
885
784
|
|
785
|
+
# Re-orders results produced by parallel workers so that they are
# queued in the same order as the entries of the original fetch list.
#
# Each result is identified by the offset of its first block. A result
# matching the next expected offset is queued immediately; any other
# result is parked in `delayed` until its turn comes.
class CompletionTracker
  attr_reader :queue, :offsets, :delayed

  # @param fetch_list entries whose first element is the offset used
  #   to order the results
  def initialize(fetch_list)
    @sem = Mutex.new
    @delayed = {}
    @queue = java.util.concurrent.LinkedBlockingQueue.new(128)
    @offsets = fetch_list.map { |entry| entry[0] }
  end

  # Offset of the result that must be queued next, or nil when every
  # expected result has been queued.
  def next_expected
    offsets.first
  end

  # Hand over the blocks parsed for one fetch-list entry.
  #
  # @param blocks parsed blocks; the offset of the first one
  #   identifies which entry they belong to
  def <<(blocks)
    @sem.synchronize do
      first_offset = blocks.first.offset
      if first_offset != next_expected
        # out of order: hold on to it until its predecessors arrive
        delayed[first_offset] = blocks
      else
        offsets.shift
        queue.put(blocks)
        drain_delayed
      end
    end
  end

  # Queue every parked result that has now become the next expected one.
  def drain_delayed
    loop do
      ready = delayed.delete(next_expected)
      break unless ready
      offsets.shift
      queue.put(ready)
    end
  end
end
|
820
|
+
|
821
|
+
# Exposes parser internals for unit tests: a bare class that mixes in MAFParsing and adds nothing of its own.
|
822
|
+
class DummyParser
|
823
|
+
include MAFParsing
|
824
|
+
end
|
825
|
+
|
886
826
|
end
|
887
827
|
|
888
828
|
end
|
@@ -0,0 +1,167 @@
|
|
1
|
+
require 'zlib'
|
2
|
+
|
3
|
+
module Bio::MAF
|
4
|
+
|
5
|
+
# Tiles a given genomic interval: builds one line of alignment text per
# requested species covering the whole interval, using the alignment
# blocks the index returns for it.
# Inspired by: lib/bx/align/tools/tile.py in bx-python
class Tiler

  # Index queried via `find([interval], parser)` to obtain the blocks.
  attr_accessor :index
  # Parser passed to the index; {#tile} sets its `:only_species` filter.
  attr_accessor :parser
  # Optional reference data: either a String or an object responding
  # to `read_interval(start, end)`.
  attr_accessor :reference
  # GenomicInterval
  attr_accessor :interval
  # Species to output, in order. The first entry is the one filled
  # from the reference (or with 'N') where no block covers the interval.
  attr_accessor :species
  # Maps a species to the name written in the FASTA description line.
  attr_accessor :species_map

  def initialize
    @species_map = {}
  end

  # Fetch reference sequence data for a zero-based range.
  #
  # @param range [Range] positions to read
  # @return [String, nil] the reference data, or nil without a reference
  # @raise [RuntimeError] if the reference is of an unsupported kind
  def ref_data(range)
    return nil unless reference

    if reference.respond_to? :read_interval
      reference.read_interval(range.begin, range.end)
    elsif reference.is_a? String
      reference.slice(range)
    else
      raise "Unhandled reference data source: #{reference}"
    end
  end

  # Build the tiled alignment text.
  #
  # Positions covered by a block take that block's text ('*' for a
  # species absent from the block). Uncovered positions take reference
  # data (or 'N') for the first species and '*' for all others.
  #
  # @return [Array<String>] one string per entry of {#species}
  def tile
    parser.sequence_filter[:only_species] = @species
    # TODO: remove gaps
    blocks = index.find([interval], parser).sort_by { |b| b.vars[:score] }
    mask = Array.new(interval.length, :ref)
    i_start = interval.zero_start
    i_end = interval.zero_end
    ref_region = ref_data(i_start...i_end) if reference
    # Blocks are painted in ascending score order, so where blocks
    # overlap the one painted last (highest score) wins.
    blocks.each do |block|
      ref = block.ref_seq
      slice_start = [i_start, ref.start].max
      slice_end = [i_end, ref.end].min
      mask.fill(block, (slice_start - i_start)...(slice_end - i_start))
    end
    text = species.map { ''.dup }
    nonref_text = text.drop(1)
    runs(mask) do |range, block|
      if block == :ref
        # not covered by an alignment block
        # use the reference sequence if given, otherwise 'N'
        range_size = range.end - range.begin
        fill = ref_region ? ref_region.slice(range) : 'N' * range_size
        text[0] << fill
        stars = '*' * range_size
        nonref_text.each { |t| t << stars }
      else
        # covered by an alignment block
        g_range = (range.begin + i_start)...(range.end + i_start)
        t_range = block.ref_seq.text_range(g_range)
        species.each_with_index do |sp, i|
          seq = block.sequences.find { |s| s.source == sp || s.species == sp }
          text[i] << if seq
                       # got alignment text
                       seq.text.slice(t_range)
                     else
                       # no alignment for this one here, use '*'
                       '*' * (t_range.end - t_range.begin)
                     end
        end
      end
    end
    text
  end

  # Write the tiled alignment in FASTA format, one record per species,
  # using {#species_map} to rename species where a mapping exists.
  #
  # @param f output object responding to `puts`
  def write_fasta(f)
    species.zip(tile) do |sp, text|
      f.puts ">#{species_map[sp] || sp}"
      f.puts text
    end
  end

  # Yield each maximal run of identical (same-object) mask entries.
  #
  # @param mask [Array] per-position markers
  # @yield [range, obj] the index range of the run (end-exclusive) and
  #   the object filling it
  def runs(mask)
    # An empty mask has no runs; without this guard a bogus
    # (nil...0, nil) run would be yielded.
    return if mask.empty?

    cur = nil
    cur_start = nil
    mask.each_with_index do |obj, i|
      next if cur.equal?(obj)

      yield(cur_start...i, cur) if cur
      cur = obj
      cur_start = i
    end
    yield(cur_start...mask.size, cur)
  end

end
|
110
|
+
|
111
|
+
# Reads zero-based ranges of sequence data from a FASTA source holding
# a single record (a description line followed by sequence lines).
#
# Reading is sequential: `pos` is the sequence coordinate of the next
# unread character. Requests at or after `pos` continue from the
# current line; requests before `pos` rewind and start over.
class FASTARangeReader
  attr_reader :f, :pos

  # @param fspec an already-open IO-like object (anything responding
  #   to `seek`), or a path; paths ending in `.gz` are opened with
  #   Zlib::GzipReader, all others with File
  def initialize(fspec)
    if fspec.respond_to? :seek
      @f = fspec
    else
      reader_class = fspec.to_s.end_with?('.gz') ? Zlib::GzipReader : File
      @f = reader_class.open(fspec)
    end
    position_at_start
  end

  GT = '>'.getbyte(0)

  # Consume the description line and reset `pos` to 0. The stream must
  # be positioned at the description line when this is called.
  #
  # @raise [RuntimeError] if the line read is not a FASTA description
  def position_at_start
    first = f.readline
    raise "expected FASTA comment" unless first =~ /^>/
    @pos = 0
  end

  # Read the sequence data for the half-open range [z_start, z_end).
  #
  # @param z_start [Integer] zero-based start position
  # @param z_end [Integer] zero-based end position (exclusive)
  # @return [String] the sequence text; shorter than requested if the
  #   data ends before z_end
  # @raise [RuntimeError] if a second description line is encountered
  def read_interval(z_start, z_end)
    if z_start < pos
      # The requested data has already been consumed: go back to the
      # beginning of the stream before re-reading the header.
      f.rewind
      position_at_start
    end
    data = ''.dup
    region_size = z_end - z_start
    return data if region_size <= 0

    f.each_line do |line_raw|
      if line_raw.getbyte(0) == GT
        raise "unexpected description line: #{line_raw.inspect}"
      end
      line = line_raw.strip
      line_start = pos
      # The line has been consumed from the stream whether or not any
      # of it is used, so keep pos in step with the stream.
      @pos = line_start + line.size
      next if pos <= z_start # line lies entirely before the region

      from = [z_start - line_start, 0].max
      data << line.slice(from, region_size - data.size)
      break if data.size >= region_size
    end
    data
  end
end
|
167
|
+
end
|
data/lib/bio/maf.rb
CHANGED
data/man/maf_tile.1
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
.\" generated with Ronn/v0.7.3
|
2
|
+
.\" http://github.com/rtomayko/ronn/tree/0.7.3
|
3
|
+
.
|
4
|
+
.TH "MAF_TILE" "1" "June 2012" "Clayton Wheeler" "BioRuby Manual"
|
5
|
+
.
|
6
|
+
.SH "NAME"
|
7
|
+
\fBmaf_tile\fR \- synthesize an alignment for a given region
|
8
|
+
.
|
9
|
+
.SH "SYNOPSIS"
|
10
|
+
\fBmaf_tile\fR [\fIoptions\fR] \-i BEGIN:END [\-s SPECIES[:NAME] \.\.\.] \fImaf\fR \fIindex\fR
|
11
|
+
.
|
12
|
+
.P
|
13
|
+
\fBmaf_tile\fR [\fIoptions\fR] \-\-bed BED \-o BASE [\-s SPECIES[:NAME] \.\.\.] \fImaf\fR \fIindex\fR
|
14
|
+
.
|
15
|
+
.SH "DESCRIPTION"
|
16
|
+
\fBmaf_tile\fR takes a MAF file with index (generated by maf_index(1)), extracts alignment blocks overlapping the given genomic interval, and constructs a single alignment block covering the entire interval for the specified species\. Optionally, any gaps in coverage of the MAF file\'s reference sequence can be filled in from a FASTA sequence file\.
|
17
|
+
.
|
18
|
+
.P
|
19
|
+
If a single interval is specified, the output will be written to stdout in FASTA format\. If the \fB\-\-output\-base\fR option is specified, \fB_<start>:<end>\.fa\fR will be appended to the given parameter and used to construct the output path\. If a BED file is specified with \fB\-\-bed\fR, \fB\-\-output\-base\fR is also required\.
|
20
|
+
.
|
21
|
+
.P
|
22
|
+
Species can be renamed for output by specifying them as SPECIES:NAME; the first component will be used to select the species from the MAF file, and the second will be used in the FASTA description line for output\.
|
23
|
+
.
|
24
|
+
.SH "OPTIONS"
|
25
|
+
.
|
26
|
+
.TP
|
27
|
+
\fB\-r\fR, \fB\-\-reference SEQ\fR
|
28
|
+
The given FASTA reference sequence file, which may be gzipped, will be used to fill in any gaps between alignment blocks\.
|
29
|
+
.
|
30
|
+
.TP
|
31
|
+
\fB\-i\fR, \fB\-\-interval BEGIN:END\fR
|
32
|
+
The given zero\-based genomic interval will be used to select alignment blocks from the MAF file\.
|
33
|
+
.
|
34
|
+
.TP
|
35
|
+
\fB\-s\fR, \fB\-\-species SPECIES[:NAME]\fR
|
36
|
+
The given species will be selected for output\. If given as \fBspecies:name\fR, it will appear in the FASTA output as \fIname\fR\.
|
37
|
+
.
|
38
|
+
.TP
|
39
|
+
\fB\-b\fR, \fB\-\-bed BED\fR
|
40
|
+
The given BED file will be used to provide a list of intervals to process\. If present, \fB\-\-interval\fR will be ignored and \fB\-\-output\-base\fR must be given as well\.
|
41
|
+
.
|
42
|
+
.TP
|
43
|
+
\fB\-o\fR, \fB\-\-output\-base BASE\fR
|
44
|
+
The given path will be used as the base name for output files, as described above\.
|
45
|
+
.
|
46
|
+
.SH "EXAMPLES"
|
47
|
+
Generate an alignment of the \fBhg19\fR, \fBpetMar1\fR, and \fBornAna1\fR sequences from \fBchrY\.maf\fR over the interval 14400 to 15000 on the reference sequence of the MAF file\. Fills in gaps from \fBchrY\.refseq\.fa\.gz\fR\. Writes FASTA output to stdout\.
|
48
|
+
.
|
49
|
+
.IP "" 4
|
50
|
+
.
|
51
|
+
.nf
|
52
|
+
|
53
|
+
$ maf_tile \-\-reference ~/maf/chrY\.refseq\.fa\.gz \e
|
54
|
+
\-\-interval 14400:15000 \e
|
55
|
+
\-s hg19:human \-s petMar1 \-s ornAna1 \e
|
56
|
+
chrY\.maf chrY\.kct
|
57
|
+
>human
|
58
|
+
GGGTGACGAAAAGAGCCGA\-\-\-\-\-[\.\.\.]
|
59
|
+
>petMar1
|
60
|
+
gagtgccggggagtgccggggagt[\.\.\.]
|
61
|
+
>ornAna1
|
62
|
+
AGGGATCTGGGAATTCTGG\-\-\-\-\-[\.\.\.]
|
63
|
+
.
|
64
|
+
.fi
|
65
|
+
.
|
66
|
+
.IP "" 0
|
67
|
+
.
|
68
|
+
.P
|
69
|
+
Write out a FASTA file for each interval in the given BED file, prefixed with \fB/tmp/mm8\fR, and without filling in data from a reference sequence:
|
70
|
+
.
|
71
|
+
.IP "" 4
|
72
|
+
.
|
73
|
+
.nf
|
74
|
+
|
75
|
+
$ maf_tile \-\-bed /tmp/mm8\.bed \-\-output\-base /tmp/mm8 \e
|
76
|
+
\-s mm8:mouse \-s rn4:rat \-s hg18:human \e
|
77
|
+
mm8_chr7_tiny\.maf mm8_chr7_tiny\.kct
|
78
|
+
.
|
79
|
+
.fi
|
80
|
+
.
|
81
|
+
.IP "" 0
|
82
|
+
.
|
83
|
+
.SH "FILES"
|
84
|
+
The output is generated in FASTA format, with one sequence per species\.
|
85
|
+
.
|
86
|
+
.P
|
87
|
+
The input \fImaf\fR file must be a Multiple Alignment Format file\.
|
88
|
+
.
|
89
|
+
.P
|
90
|
+
The \fIindex\fR must be a MAF index built with maf_index(1)\.
|
91
|
+
.
|
92
|
+
.P
|
93
|
+
If \fB\-\-bed\fR \fIbed\fR is specified, its argument must be a BED file\. Only the second and third columns will be used, to specify the zero\-based start and end positions of intervals\.
|
94
|
+
.
|
95
|
+
.SH "ENVIRONMENT"
|
96
|
+
\fBmaf_tile\fR is a Ruby program and relies on ordinary Ruby environment variables\.
|
97
|
+
.
|
98
|
+
.SH "COPYRIGHT"
|
99
|
+
\fBmaf_tile\fR is copyright (C) 2012 Clayton Wheeler\.
|
100
|
+
.
|
101
|
+
.SH "SEE ALSO"
|
102
|
+
maf_index(1), ruby(1)
|
103
|
+
.
|
104
|
+
.IP "\(bu" 4
|
105
|
+
\fIhttps://github\.com/csw/bioruby\-maf/\fR
|
106
|
+
.
|
107
|
+
.IP "" 0
|
108
|
+
|