bio-maf 0.1.0-java → 0.2.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +53 -0
- data/DEVELOPMENT.md +29 -0
- data/Gemfile +1 -0
- data/README.md +69 -1
- data/Rakefile +4 -3
- data/bin/find_overlaps +21 -0
- data/bin/maf_tile +103 -0
- data/bio-maf.gemspec +43 -0
- data/features/gap-filling.feature +158 -0
- data/features/gap-removal.feature +50 -0
- data/features/step_definitions/gap-filling_steps.rb +32 -0
- data/features/step_definitions/gap_removal_steps.rb +19 -0
- data/features/step_definitions/parse_steps.rb +2 -1
- data/lib/bio/maf.rb +2 -0
- data/lib/bio/maf/index.rb +15 -8
- data/lib/bio/maf/maf.rb +267 -0
- data/lib/bio/maf/parser.rb +115 -175
- data/lib/bio/maf/tiler.rb +167 -0
- data/man/maf_tile.1 +108 -0
- data/man/maf_tile.1.ronn +104 -0
- data/spec/bio/maf/index_spec.rb +1 -0
- data/spec/bio/maf/parser_spec.rb +103 -0
- data/spec/bio/maf/tiler_spec.rb +69 -0
- data/test/data/gap-sp1.fa +6 -0
- data/test/data/mm8_chr7_tiny.kct +0 -0
- metadata +65 -7
data/lib/bio/maf/parser.rb
CHANGED
|
@@ -9,142 +9,6 @@ module Bio
|
|
|
9
9
|
# @api public
|
|
10
10
|
class ParseError < Exception; end
|
|
11
11
|
|
|
12
|
-
# A MAF header, containing the variable-value pairs from the first
|
|
13
|
-
# line of the file as well as the alignment parameters.
|
|
14
|
-
# @api public
|
|
15
|
-
class Header
|
|
16
|
-
# Variable-value pairs from the ##maf line
|
|
17
|
-
# @return [Hash]
|
|
18
|
-
attr_accessor :vars
|
|
19
|
-
# Alignment parameters from the MAF header.
|
|
20
|
-
# @return [Hash]
|
|
21
|
-
attr_accessor :alignment_params
|
|
22
|
-
|
|
23
|
-
def initialize(vars, params)
|
|
24
|
-
@vars = vars
|
|
25
|
-
@alignment_params = params
|
|
26
|
-
end
|
|
27
|
-
|
|
28
|
-
# The required version parameter.
|
|
29
|
-
# @return [String]
|
|
30
|
-
def version
|
|
31
|
-
vars[:version]
|
|
32
|
-
end
|
|
33
|
-
|
|
34
|
-
# The optional scoring parameter, if present.
|
|
35
|
-
# @return [String]
|
|
36
|
-
def scoring
|
|
37
|
-
vars[:scoring]
|
|
38
|
-
end
|
|
39
|
-
|
|
40
|
-
end
|
|
41
|
-
|
|
42
|
-
# A MAF alignment block.
|
|
43
|
-
# @api public
|
|
44
|
-
class Block
|
|
45
|
-
# Parameters from the 'a' line starting the alignment block.
|
|
46
|
-
attr_reader :vars
|
|
47
|
-
# Sequences, one per 's' or 'e' line.
|
|
48
|
-
# @return [Array<Sequence>]
|
|
49
|
-
attr_reader :sequences
|
|
50
|
-
# Offset of the alignment block within the MAF file, in bytes.
|
|
51
|
-
# @return [Integer]
|
|
52
|
-
attr_reader :offset
|
|
53
|
-
# Size of the alignment block within the MAF file, in bytes.
|
|
54
|
-
# @return [Integer]
|
|
55
|
-
attr_reader :size
|
|
56
|
-
|
|
57
|
-
def initialize(*args)
|
|
58
|
-
@vars, @sequences, @offset, @size = args
|
|
59
|
-
end
|
|
60
|
-
|
|
61
|
-
def raw_seq(i)
|
|
62
|
-
sequences.fetch(i)
|
|
63
|
-
end
|
|
64
|
-
|
|
65
|
-
def each_raw_seq
|
|
66
|
-
sequences.each { |s| yield s }
|
|
67
|
-
end
|
|
68
|
-
|
|
69
|
-
# Text size of the alignment block. This is the number of text
|
|
70
|
-
# characters in each line of sequence data, including dashes and
|
|
71
|
-
# other gaps in the sequence.
|
|
72
|
-
def text_size
|
|
73
|
-
sequences.first.text.size
|
|
74
|
-
end
|
|
75
|
-
|
|
76
|
-
end
|
|
77
|
-
|
|
78
|
-
# A sequence within an alignment block.
|
|
79
|
-
# @api public
|
|
80
|
-
class Sequence
|
|
81
|
-
# @return [String] Source sequence name.
|
|
82
|
-
attr_reader :source
|
|
83
|
-
# @return [Integer] Zero-based start position.
|
|
84
|
-
attr_reader :start
|
|
85
|
-
# @return [Integer] Size of aligning region in source sequence.
|
|
86
|
-
attr_reader :size
|
|
87
|
-
# :+ or :-, indicating which strand the alignment is to.
|
|
88
|
-
# @return [Symbol]
|
|
89
|
-
attr_reader :strand
|
|
90
|
-
# Size of the entire source sequence, not just the aligning
|
|
91
|
-
# region.
|
|
92
|
-
# @return [Integer]
|
|
93
|
-
attr_reader :src_size
|
|
94
|
-
# Sequence data for the alignment, including insertions.
|
|
95
|
-
# @return [String]
|
|
96
|
-
attr_reader :text
|
|
97
|
-
# Array of raw synteny information from 'i' line.
|
|
98
|
-
# @return [Array<String>]
|
|
99
|
-
attr_accessor :i_data
|
|
100
|
-
# Quality string from 'q' line.
|
|
101
|
-
# @return [String]
|
|
102
|
-
attr_accessor :quality
|
|
103
|
-
alias_method :source_size, :src_size
|
|
104
|
-
|
|
105
|
-
def initialize(*args)
|
|
106
|
-
@source, @start, @size, @strand, @src_size, @text = args
|
|
107
|
-
end
|
|
108
|
-
|
|
109
|
-
# Whether this sequence is empty. Only true for {EmptySequence}
|
|
110
|
-
# instances from 'e' lines.
|
|
111
|
-
def empty?
|
|
112
|
-
false
|
|
113
|
-
end
|
|
114
|
-
|
|
115
|
-
def write_fasta(writer)
|
|
116
|
-
writer.write("#{source}:#{start}-#{start + size}",
|
|
117
|
-
text)
|
|
118
|
-
end
|
|
119
|
-
end
|
|
120
|
-
|
|
121
|
-
# An empty sequence record from an 'e' line.
|
|
122
|
-
#
|
|
123
|
-
# This indicates that "there isn't aligning DNA for a species but
|
|
124
|
-
# that the current block is bridged by a chain that connects
|
|
125
|
-
# blocks before and after this block" (MAF spec).
|
|
126
|
-
# @api public
|
|
127
|
-
class EmptySequence < Sequence
|
|
128
|
-
attr_reader :status
|
|
129
|
-
|
|
130
|
-
def initialize(*args)
|
|
131
|
-
super(*args[0..4])
|
|
132
|
-
@status = args[5]
|
|
133
|
-
end
|
|
134
|
-
|
|
135
|
-
def text
|
|
136
|
-
''
|
|
137
|
-
end
|
|
138
|
-
|
|
139
|
-
def empty?
|
|
140
|
-
true
|
|
141
|
-
end
|
|
142
|
-
|
|
143
|
-
def write_fasta(writer)
|
|
144
|
-
raise "empty sequence output not implemented!"
|
|
145
|
-
end
|
|
146
|
-
end
|
|
147
|
-
|
|
148
12
|
# Reads MAF files in chunks.
|
|
149
13
|
# @api private
|
|
150
14
|
class ChunkReader
|
|
@@ -399,16 +263,25 @@ module Bio
|
|
|
399
263
|
payload = s.rest
|
|
400
264
|
s.pos = s.string.size # jump to EOS
|
|
401
265
|
end
|
|
266
|
+
filtered = false
|
|
402
267
|
lines = payload.split("\n")
|
|
403
268
|
until lines.empty?
|
|
404
269
|
line = lines.shift
|
|
405
270
|
first = line.getbyte(0)
|
|
406
271
|
if first == S
|
|
407
272
|
seq = parse_seq_line(line, sequence_filter)
|
|
408
|
-
|
|
273
|
+
if seq
|
|
274
|
+
seqs << seq
|
|
275
|
+
else
|
|
276
|
+
filtered = true
|
|
277
|
+
end
|
|
409
278
|
elsif first == E && parse_empty
|
|
410
279
|
e_seq = parse_empty_line(line, sequence_filter)
|
|
411
|
-
|
|
280
|
+
if e_seq
|
|
281
|
+
seqs << e_seq
|
|
282
|
+
else
|
|
283
|
+
filtered = true
|
|
284
|
+
end
|
|
412
285
|
elsif first == I && parse_extended
|
|
413
286
|
parts = line.split
|
|
414
287
|
parse_error("wrong i source #{parts[1]}!") unless seqs.last.source == parts[1]
|
|
@@ -423,10 +296,19 @@ module Bio
|
|
|
423
296
|
parse_error "unexpected line: '#{line}'"
|
|
424
297
|
end
|
|
425
298
|
end
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
299
|
+
block = Block.new(block_vars,
|
|
300
|
+
seqs,
|
|
301
|
+
block_offset,
|
|
302
|
+
s.pos - block_start_pos,
|
|
303
|
+
filtered)
|
|
304
|
+
postprocess_block(block)
|
|
305
|
+
end
|
|
306
|
+
|
|
307
|
+
def postprocess_block(block)
|
|
308
|
+
if block.filtered? && opts[:remove_gaps]
|
|
309
|
+
block.remove_gaps!
|
|
310
|
+
end
|
|
311
|
+
block
|
|
430
312
|
end
|
|
431
313
|
|
|
432
314
|
# Parse an 's' line.
|
|
@@ -503,12 +385,13 @@ module Bio
|
|
|
503
385
|
# A MAF parsing context, used for random-access parsing.
|
|
504
386
|
class ParseContext
|
|
505
387
|
include MAFParsing
|
|
506
|
-
attr_accessor :f, :s, :cr, :parser
|
|
388
|
+
attr_accessor :f, :s, :cr, :parser, :opts
|
|
507
389
|
attr_accessor :chunk_start, :last_block_pos, :at_end
|
|
508
390
|
|
|
509
|
-
def initialize(fd, chunk_size, parser
|
|
391
|
+
def initialize(fd, chunk_size, parser)
|
|
510
392
|
@f = fd
|
|
511
393
|
@parser = parser
|
|
394
|
+
@opts = parser.opts
|
|
512
395
|
reader = opts[:chunk_reader] || ChunkReader
|
|
513
396
|
@cr = reader.new(@f, chunk_size)
|
|
514
397
|
@last_block_pos = -1
|
|
@@ -580,6 +463,7 @@ module Bio
|
|
|
580
463
|
#
|
|
581
464
|
# * `:parse_extended`: whether to parse 'i' and 'q' lines
|
|
582
465
|
# * `:parse_empty`: whether to parse 'e' lines
|
|
466
|
+
# * `:remove_gaps`: remove gaps left after filtering sequences
|
|
583
467
|
# * `:chunk_size`: read MAF file in chunks of this many bytes
|
|
584
468
|
# * `:random_chunk_size`: as above, but for random access ({#fetch_blocks})
|
|
585
469
|
# * `:merge_max`: merge up to this many bytes of blocks for
|
|
@@ -611,9 +495,6 @@ module Bio
|
|
|
611
495
|
attr_reader :chunk_start
|
|
612
496
|
# @return [Integer] offset of the last block start in this chunk.
|
|
613
497
|
attr_reader :last_block_pos
|
|
614
|
-
# Sequence filter to apply.
|
|
615
|
-
# @api public
|
|
616
|
-
attr_accessor :sequence_filter
|
|
617
498
|
|
|
618
499
|
# @api private
|
|
619
500
|
attr_accessor :parse_extended
|
|
@@ -630,6 +511,9 @@ module Bio
|
|
|
630
511
|
# @api public
|
|
631
512
|
def initialize(file_spec, opts={})
|
|
632
513
|
@opts = opts
|
|
514
|
+
if RUBY_PLATFORM == 'java'
|
|
515
|
+
opts[:threads] ||= java.lang.Runtime.runtime.availableProcessors
|
|
516
|
+
end
|
|
633
517
|
chunk_size = opts[:chunk_size] || SEQ_CHUNK_SIZE
|
|
634
518
|
@random_access_chunk_size = opts[:random_chunk_size] || RANDOM_CHUNK_SIZE
|
|
635
519
|
@merge_max = opts[:merge_max] || MERGE_MAX
|
|
@@ -654,7 +538,7 @@ module Bio
|
|
|
654
538
|
def context(chunk_size)
|
|
655
539
|
# IO#dup calls dup(2) internally, but seems broken on JRuby...
|
|
656
540
|
fd = File.open(file_spec)
|
|
657
|
-
ParseContext.new(fd, chunk_size, self
|
|
541
|
+
ParseContext.new(fd, chunk_size, self)
|
|
658
542
|
end
|
|
659
543
|
|
|
660
544
|
# Execute the given block with a {ParseContext} using the given
|
|
@@ -671,6 +555,20 @@ module Bio
|
|
|
671
555
|
end
|
|
672
556
|
end
|
|
673
557
|
|
|
558
|
+
# Sequence filter to apply.
|
|
559
|
+
# @api public
|
|
560
|
+
# @return [Hash]
|
|
561
|
+
def sequence_filter
|
|
562
|
+
@sequence_filter ||= {}
|
|
563
|
+
end
|
|
564
|
+
|
|
565
|
+
# Set the sequence filter.
|
|
566
|
+
# @api public
|
|
567
|
+
# @param [Hash] filter the new filter
|
|
568
|
+
def sequence_filter=(filter)
|
|
569
|
+
@sequence_filter = filter
|
|
570
|
+
end
|
|
571
|
+
|
|
674
572
|
# Fetch and parse blocks given by `fetch_list`.
|
|
675
573
|
#
|
|
676
574
|
# `fetch_list` should be an array of `[offset, length]` tuples.
|
|
@@ -723,25 +621,22 @@ module Bio
|
|
|
723
621
|
# TODO: break entries up into longer runs for more
|
|
724
622
|
# sequential I/O
|
|
725
623
|
jobs = java.util.concurrent.ConcurrentLinkedQueue.new(fetch_list)
|
|
726
|
-
|
|
624
|
+
ct = CompletionTracker.new(fetch_list)
|
|
625
|
+
completed = ct.queue
|
|
727
626
|
threads = []
|
|
728
|
-
n_threads.times { threads << make_worker(jobs,
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
while
|
|
732
|
-
c = completed.poll(
|
|
733
|
-
|
|
734
|
-
if threads.find { |t| t.
|
|
735
|
-
|
|
736
|
-
else
|
|
737
|
-
raise "No threads alive, completed #{n_completed}/#{fetch_list.size} jobs!"
|
|
738
|
-
end
|
|
627
|
+
n_threads.times { threads << make_worker(jobs, ct) }
|
|
628
|
+
|
|
629
|
+
n_res = 0
|
|
630
|
+
while n_res < fetch_list.size
|
|
631
|
+
c = completed.poll(1, java.util.concurrent.TimeUnit::SECONDS)
|
|
632
|
+
unless c
|
|
633
|
+
raise "Worker failed!" if threads.find { |t| t.status.nil? }
|
|
634
|
+
next
|
|
739
635
|
end
|
|
740
|
-
raise "worker failed: #{c}" if c.is_a? Exception
|
|
741
636
|
c.each do |block|
|
|
742
637
|
y << block
|
|
743
638
|
end
|
|
744
|
-
|
|
639
|
+
n_res += 1
|
|
745
640
|
end
|
|
746
641
|
threads.each { |t| t.join }
|
|
747
642
|
elapsed = Time.now - start
|
|
@@ -758,26 +653,25 @@ module Bio
|
|
|
758
653
|
# Create a worker thread for parallel parsing.
|
|
759
654
|
#
|
|
760
655
|
# @see #fetch_blocks_merged_parallel
|
|
761
|
-
def make_worker(jobs,
|
|
656
|
+
# Start a worker thread that drains `jobs` and parses the requested blocks.
#
# The thread opens its own parse context via `with_context`, then polls
# `jobs` until it returns nil. Each request is splatted into
# `ctx.fetch_blocks`; the parsed batch is handed to `ct` with `<<`.
#
# @param jobs queue of fetch requests responding to `poll`; element 2 of
#   each request lists the blocks expected from that request.
# @param ct collector receiving each parsed batch via `<<`.
# @return [Thread] the started worker thread.
# @raise [RuntimeError] (inside the thread) when the number of parsed
#   blocks differs from the number requested.
def make_worker(jobs, ct)
  Thread.new do
    begin
      with_context(@random_access_chunk_size) do |ctx|
        while (req = jobs.poll)
          n_blocks = req[2].size
          blocks = ctx.fetch_blocks(*req).to_a
          if blocks.size != n_blocks
            # Report the offending request; no exception object exists at
            # this point, so interpolating one would itself fail.
            raise "expected #{n_blocks}, got #{blocks.size}: #{req.inspect}"
          end
          ct << blocks
        end
      end
    rescue Exception => e
      # Log every failure before the thread dies, then re-raise unchanged
      # so that Thread#join / Thread#status still reflect the error.
      $stderr.puts "Worker failing: #{e.class}: #{e}"
      $stderr.puts e.backtrace.join("\n")
      raise e
    end
  end
end
|
|
@@ -860,14 +754,19 @@ module Bio
|
|
|
860
754
|
end
|
|
861
755
|
Enumerator.new do |y|
|
|
862
756
|
saw_eof = false
|
|
863
|
-
|
|
757
|
+
n_final_poll = 0
|
|
758
|
+
while true
|
|
864
759
|
block = queue.poll(1, java.util.concurrent.TimeUnit::SECONDS)
|
|
865
760
|
if block == :eof
|
|
866
761
|
saw_eof = true
|
|
867
762
|
break
|
|
868
763
|
elsif block
|
|
869
764
|
y << block
|
|
765
|
+
else
|
|
766
|
+
# timed out
|
|
767
|
+
n_final_poll += 1 unless worker.alive?
|
|
870
768
|
end
|
|
769
|
+
break if n_final_poll > 1
|
|
871
770
|
end
|
|
872
771
|
unless saw_eof
|
|
873
772
|
raise "worker exited unexpectedly!"
|
|
@@ -883,6 +782,47 @@ module Bio
|
|
|
883
782
|
|
|
884
783
|
end
|
|
885
784
|
|
|
785
|
+
# Releases batches of parsed blocks in fetch-list order.
#
# Batches may be submitted in any order. A batch whose first block has the
# next expected offset is put on `queue` immediately, followed by any
# parked batches that have become next in line. Any other batch is parked
# in `delayed`, keyed by its first block's offset.
class CompletionTracker
  # `queue`:   bounded queue (capacity 128) of in-order batches.
  # `offsets`: offsets still awaited, in fetch-list order.
  # `delayed`: out-of-order batches keyed by first-block offset.
  attr_reader :queue, :offsets, :delayed

  # @param fetch_list [Array] fetch requests; element 0 of each request is
  #   the offset used to order the results.
  def initialize(fetch_list)
    @offsets = fetch_list.map { |req| req[0] }
    @queue = java.util.concurrent.LinkedBlockingQueue.new(128)
    @delayed = {}
    @sem = Mutex.new
  end

  # @return the offset of the batch that must be released next, or nil
  #   once every expected offset has been released.
  def next_expected
    offsets.first
  end

  # Submit one batch of blocks. Submissions are serialized with a mutex.
  #
  # @param blocks [Array] non-empty batch; `blocks.first.offset` identifies it.
  # @raise [ArgumentError] if the batch is empty, since it cannot be ordered.
  def <<(blocks)
    # Fail with a clear message instead of NoMethodError on nil below.
    raise ArgumentError, "cannot track an empty batch of blocks" if blocks.empty?

    @sem.synchronize do
      f_offset = blocks.first.offset
      if f_offset == next_expected
        offsets.shift
        queue.put(blocks)
        drain_delayed
      else
        # out of order: park until its predecessors have been released
        delayed[f_offset] = blocks
      end
    end
  end

  # Release parked batches for as long as the next expected one is present.
  # Called from `<<` while the mutex is held.
  def drain_delayed
    while (batch = delayed.delete(next_expected))
      offsets.shift
      queue.put(batch)
    end
  end
end
|
|
820
|
+
|
|
821
|
+
# Exposes parser internals for unit tests.
|
|
822
|
+
class DummyParser
|
|
823
|
+
include MAFParsing
|
|
824
|
+
end
|
|
825
|
+
|
|
886
826
|
end
|
|
887
827
|
|
|
888
828
|
end
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
require 'zlib'
|
|
2
|
+
|
|
3
|
+
module Bio::MAF
|
|
4
|
+
|
|
5
|
+
# Tiles a given genomic interval.
|
|
6
|
+
# Inspired by: lib/bx/align/tools/tile.py in bx-python
|
|
7
|
+
|
|
8
|
+
class Tiler

  # Index queried with `find([interval], parser)` to obtain the blocks.
  attr_accessor :index
  # Parser passed to the index; its sequence filter is set by {#tile}.
  attr_accessor :parser
  # Optional reference sequence: either a String or an object responding
  # to `read_interval(start, end)`.
  attr_accessor :reference
  # GenomicInterval
  attr_accessor :interval
  # Species to output, one text row each. The first row is the one filled
  # from the reference sequence (or 'N') where no block covers the interval.
  attr_accessor :species
  # Optional mapping from species to the name written by {#write_fasta}.
  attr_accessor :species_map

  def initialize
    @species_map = {}
  end

  # Fetch reference sequence data for a zero-based, end-exclusive range.
  #
  # @param range [Range] range to read from the reference.
  # @return [String, nil] the sequence data, or nil without a reference.
  # @raise [RuntimeError] if the reference is of an unsupported kind.
  def ref_data(range)
    return nil unless reference

    if reference.respond_to? :read_interval
      reference.read_interval(range.begin, range.end)
    elsif reference.is_a? String
      reference.slice(range)
    else
      raise "Unhandled reference data source: #{reference}"
    end
  end

  # Build one text row per species covering the whole interval.
  #
  # Positions covered by an alignment block take that block's text;
  # species absent from the block get '*'. Uncovered positions get the
  # reference sequence (or 'N') in the first row and '*' in the others.
  #
  # Side effect: sets `parser.sequence_filter[:only_species]`.
  #
  # @return [Array<String>] rows in the same order as `species`.
  def tile
    parser.sequence_filter[:only_species] = @species
    # TODO: remove gaps
    # Ascending score order: later (higher-scoring) blocks overwrite
    # earlier ones in the mask below.
    blocks = index.find([interval], parser).sort_by { |b| b.vars[:score] }
    mask = Array.new(interval.length, :ref)
    i_start = interval.zero_start
    i_end = interval.zero_end
    ref_region = ref_data(i_start...i_end)
    blocks.each do |block|
      ref = block.ref_seq
      slice_start = [i_start, ref.start].max
      slice_end = [i_end, ref.end].min
      mask.fill(block,
                (slice_start - i_start)...(slice_end - i_start))
    end
    text = species.map { ''.dup }
    nonref_text = text.drop(1)
    runs(mask) do |range, block|
      # mask positions are interval-relative; g_range is genomic
      g_range = (range.begin + i_start)...(range.end + i_start)
      if block == :ref
        # not covered by an alignment block
        # use the reference sequence if given, otherwise 'N'
        range_size = range.end - range.begin
        text[0] << if ref_region
                     ref_region.slice(range)
                   else
                     'N' * range_size
                   end
        stars = '*' * range_size
        nonref_text.each { |t| t << stars }
      else
        # covered by an alignment block
        t_range = block.ref_seq.text_range(g_range)
        species.each_with_index do |sp, i|
          seq = block.sequences.find { |s| s.source == sp || s.species == sp }
          text[i] << if seq
                       # got alignment text
                       seq.text.slice(t_range)
                     else
                       # no alignment for this one here, use '*'
                       '*' * (t_range.end - t_range.begin)
                     end
        end
      end
    end
    text
  end

  # Write the tiled rows as FASTA records, one per species.
  #
  # @param f output object responding to `puts`.
  def write_fasta(f)
    species.zip(tile) do |sp, sp_text|
      f.puts ">#{species_map[sp] || sp}"
      f.puts sp_text
    end
  end

  # Yield each maximal run of identical objects in `mask`.
  #
  # Objects are compared by identity (`equal?`). Nothing is yielded for
  # an empty mask.
  #
  # @param mask [Array] per-position objects.
  # @yield [range, obj] end-exclusive index range of the run and its object.
  def runs(mask)
    # An empty mask has no runs; without this guard the final yield
    # would pass a nil-based range and a nil object.
    return if mask.empty?

    cur = nil
    cur_start = nil
    mask.each_with_index do |obj, i|
      next if cur.equal?(obj)

      yield(cur_start...i, cur) if cur
      cur = obj
      cur_start = i
    end
    yield(cur_start...mask.size, cur)
  end

end
|
|
110
|
+
|
|
111
|
+
# Reads zero-based, end-exclusive intervals of sequence data from a
# single-record FASTA source.
#
# Reading is sequential: `pos` is the sequence offset of the next unread
# line. A request that starts before `pos` rewinds the stream to the
# beginning and scans forward again.
class FASTARangeReader
  # `f`:   the underlying stream.
  # `pos`: sequence offset (in bases) of the next unread line.
  attr_reader :f, :pos

  # @param fspec an already open, seekable stream (anything responding to
  #   `seek`), or a path; paths ending in ".gz" are opened with
  #   Zlib::GzipReader, all others with File.
  def initialize(fspec)
    @pos = nil
    if fspec.respond_to? :seek
      @f = fspec
    else
      reader_class = if fspec =~ /\.gz\z/
                       Zlib::GzipReader
                     else
                       File
                     end
      @f = reader_class.open(fspec)
    end
    position_at_start
  end

  GT = '>'.getbyte(0)

  # Consume the description line and reset `pos` to 0.
  #
  # When the stream has been positioned before, it is rewound first so
  # that the line read really is the description line.
  #
  # @raise [RuntimeError] if the first line does not start with '>'.
  def position_at_start
    f.rewind unless @pos.nil?
    first = f.readline
    raise "expected FASTA comment" unless first =~ /^>/
    @pos = 0
  end

  # Read the sequence data for the interval [z_start, z_end).
  #
  # @param z_start [Integer] zero-based start offset.
  # @param z_end [Integer] zero-based end offset, exclusive.
  # @return [String] the requested bases; shorter than requested if the
  #   sequence ends first, empty if the interval is empty.
  # @raise [RuntimeError] if another description line is encountered.
  def read_interval(z_start, z_end)
    region_size = z_end - z_start
    data = ''.dup
    return data if region_size <= 0

    position_at_start if z_start < pos
    while data.size < region_size && (line_raw = f.gets)
      if line_raw.getbyte(0) == GT
        raise "unexpected description line: #{line_raw.inspect}"
      end
      line = line_raw.strip
      end_pos = pos + line.size
      if end_pos > z_start
        # The first overlapping line is entered part-way through; later
        # lines are taken from their start. Never take more than is
        # still needed, so a region inside one line is not overshot.
        from = [z_start - pos, 0].max
        data << line.slice(from, region_size - data.size)
      end
      # Always account for the consumed line, including the last one,
      # so that `pos` stays in step with the stream.
      @pos = end_pos
    end
    data
  end
end
|
|
167
|
+
end
|