bio-maf 0.1.0-java → 0.2.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +53 -0
- data/DEVELOPMENT.md +29 -0
- data/Gemfile +1 -0
- data/README.md +69 -1
- data/Rakefile +4 -3
- data/bin/find_overlaps +21 -0
- data/bin/maf_tile +103 -0
- data/bio-maf.gemspec +43 -0
- data/features/gap-filling.feature +158 -0
- data/features/gap-removal.feature +50 -0
- data/features/step_definitions/gap-filling_steps.rb +32 -0
- data/features/step_definitions/gap_removal_steps.rb +19 -0
- data/features/step_definitions/parse_steps.rb +2 -1
- data/lib/bio/maf.rb +2 -0
- data/lib/bio/maf/index.rb +15 -8
- data/lib/bio/maf/maf.rb +267 -0
- data/lib/bio/maf/parser.rb +115 -175
- data/lib/bio/maf/tiler.rb +167 -0
- data/man/maf_tile.1 +108 -0
- data/man/maf_tile.1.ronn +104 -0
- data/spec/bio/maf/index_spec.rb +1 -0
- data/spec/bio/maf/parser_spec.rb +103 -0
- data/spec/bio/maf/tiler_spec.rb +69 -0
- data/test/data/gap-sp1.fa +6 -0
- data/test/data/mm8_chr7_tiny.kct +0 -0
- metadata +65 -7
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
Feature: Remove gaps from MAF files
|
|
2
|
+
In order to work with only the alignment data involving sequences
|
|
3
|
+
Which can be used by downstream software
|
|
4
|
+
We may want to filter out certain species
|
|
5
|
+
Which can leave gap regions where sequence data was only present
|
|
6
|
+
For removed species
|
|
7
|
+
So it is useful to be able to remove those gaps
|
|
8
|
+
|
|
9
|
+
Background:
|
|
10
|
+
Given MAF data:
|
|
11
|
+
"""
|
|
12
|
+
##maf version=1
|
|
13
|
+
a score=10542.0
|
|
14
|
+
s mm8.chr7 80082334 34 + 145134094 GGGCTGAGGGC--AGGGATGG---AGGGCGGTCC--------------CAGCA-
|
|
15
|
+
s rn4.chr1 136011785 34 + 267910886 GGGCTGAGGGC--AGGGACGG---AGGGCGGTCC--------------CAGCA-
|
|
16
|
+
s oryCun1.scaffold_199771 14021 43 - 75077 -----ATGGGC--AAGCGTGG---AGGGGAACCTCTCCTCCCCTCCGACAAAG-
|
|
17
|
+
s hg18.chr15 88557580 27 + 100338915 --------GGC--AAGTGTGGA--AGGGAAGCCC--------------CAGAA-
|
|
18
|
+
s panTro2.chr15 87959837 27 + 100063422 --------GGC--AAGTGTGGA--AGGGAAGCCC--------------CAGAA-
|
|
19
|
+
s rheMac2.chr7 69864714 28 + 169801366 -------GGGC--AAGTATGGA--AGGGAAGCCC--------------CAGAA-
|
|
20
|
+
s canFam2.chr3 56030570 39 + 94715083 AGGTTTAGGGCAGAGGGATGAAGGAGGAGAATCC--------------CTATG-
|
|
21
|
+
s dasNov1.scaffold_106893 7435 34 + 9831 GGAACGAGGGC--ATGTGTGG---AGGGGGCTGC--------------CCACA-
|
|
22
|
+
s loxAfr1.scaffold_8298 30264 38 + 78952 ATGATGAGGGG--AAGCGTGGAGGAGGGGAACCC--------------CTAGGA
|
|
23
|
+
s echTel1.scaffold_304651 594 37 - 10007 -TGCTATGGCT--TTGTGTCTAGGAGGGGAATCC--------------CCAGGA
|
|
24
|
+
"""
|
|
25
|
+
When I open it with a MAF reader
|
|
26
|
+
And filter for only the species
|
|
27
|
+
| mm8 |
|
|
28
|
+
| rn4 |
|
|
29
|
+
| hg18 |
|
|
30
|
+
| canFam2 |
|
|
31
|
+
| loxAfr1 |
|
|
32
|
+
|
|
33
|
+
Scenario: Detect filtered blocks
|
|
34
|
+
When an alignment block can be obtained
|
|
35
|
+
Then the alignment block is marked as filtered
|
|
36
|
+
And the alignment block has 5 sequences
|
|
37
|
+
|
|
38
|
+
Scenario: Detect gaps
|
|
39
|
+
When an alignment block can be obtained
|
|
40
|
+
Then 1 gap is found with length [14]
|
|
41
|
+
|
|
42
|
+
Scenario: Remove gaps
|
|
43
|
+
When an alignment block can be obtained
|
|
44
|
+
And gaps are removed
|
|
45
|
+
Then the text size of the block is 40
|
|
46
|
+
|
|
47
|
+
Scenario: Remove gaps in the parser
|
|
48
|
+
When I enable the :remove_gaps parser option
|
|
49
|
+
And an alignment block can be obtained
|
|
50
|
+
Then the text size of the block is 40
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# Cucumber step definitions for the gap-filling (tiling) feature.

# Wraps the supplied FASTA text in a StringIO and keeps a range reader
# over it as the reference sequence for later steps.
Given /^chromosome reference sequence:$/ do |fasta_text|
  @refseq = Bio::MAF::FASTARangeReader.new(StringIO.new(fasta_text))
end

# Sets up a tiler over the current index and parser for the requested
# interval; the reference is attached only when the optional trailing
# phrase is present in the step text.
When /^tile ([^:\s]+):(\d+)-(\d+)( with the chromosome reference)?$/ do |chrom, from, to, with_ref|
  @tiler = Bio::MAF::Tiler.new
  @tiler.index = @idx
  @tiler.parser = @parser
  if with_ref
    @tiler.reference = @refseq
  end
  @tiler.interval = Bio::GenomicInterval.zero_based(chrom, from.to_i, to.to_i)
end

# Species are given as a comma-separated list inside square brackets.
When /^tile with species \[(.+?)\]$/ do |species_list|
  names = species_list.split(/,\s*/)
  @tiler.species = names
end

When /^map species (\S+) as (\S+)$/ do |from_name, to_name|
  @tiler.species_map[from_name] = to_name
end

# Output goes to a temp file so that a later step can read it back.
When /^write the tiled data as FASTA$/ do
  @dst = Tempfile.new(["cuke", ".fa"])
  @tiler.write_fasta(@dst)
end

# Compares the written FASTA with the expected text, ignoring trailing
# whitespace on both sides.
Then /^the FASTA data obtained should be:$/ do |expected|
  @dst.seek(0)
  actual = @dst.read.rstrip
  actual.should == expected.rstrip
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Cucumber step definitions for the gap-removal feature. All steps
# operate on @block, which is expected to be set by an earlier step.

Then /^the alignment block is marked as filtered$/ do
  @block.filtered?.should be_true
end

# Checks both the number of gaps and their lengths. The bracketed list
# accepts one or more comma-separated integers (e.g. "[14]" or
# "[3, 5]"); the previous pattern only admitted a single integer even
# though the body splits the capture on commas.
Then /^(\d+) gaps? (?:is|are) found with length \[(\d+(?:,\s*\d+)*)\]$/ do |n_gaps, gap_sizes_s|
  gaps = @block.find_gaps
  gaps.size.should == n_gaps.to_i
  e_gap_sizes = gap_sizes_s.split(/,\s*/).collect { |n| n.to_i }
  # each gap is an [offset, length] pair; only the lengths are compared
  gap_sizes = gaps.collect { |gap| gap[1] }
  gap_sizes.should == e_gap_sizes
end

When /^gaps are removed$/ do
  @block.remove_gaps!
end

Then /^the text size of the block is (\d+)$/ do |e_text_size|
  @block.text_size.should == e_text_size.to_i
end
|
data/lib/bio/maf.rb
CHANGED
data/lib/bio/maf/index.rb
CHANGED
|
@@ -65,10 +65,11 @@ module Bio
|
|
|
65
65
|
include KVHelpers
|
|
66
66
|
|
|
67
67
|
attr_reader :db, :species, :species_max_id
|
|
68
|
-
attr_accessor :index_sequences
|
|
68
|
+
attr_accessor :index_sequences, :ref_seq
|
|
69
69
|
|
|
70
70
|
FORMAT_VERSION_KEY = 'bio-maf:index-format-version'
|
|
71
71
|
FORMAT_VERSION = 2
|
|
72
|
+
REF_SEQ_KEY = 'bio-maf:reference-sequence'
|
|
72
73
|
MAX_SPECIES = 64
|
|
73
74
|
|
|
74
75
|
## Key-value store index format
|
|
@@ -221,6 +222,7 @@ module Bio
|
|
|
221
222
|
raise "Could not open DB file!"
|
|
222
223
|
end
|
|
223
224
|
if mode == KyotoCabinet::DB::OREADER
|
|
225
|
+
self.ref_seq = db[REF_SEQ_KEY]
|
|
224
226
|
load_index_sequences
|
|
225
227
|
load_species
|
|
226
228
|
end
|
|
@@ -309,11 +311,12 @@ module Bio
|
|
|
309
311
|
end
|
|
310
312
|
ready = Time.now
|
|
311
313
|
$stderr.puts "bin intervals computed after #{ready - start} seconds."
|
|
312
|
-
if RUBY_PLATFORM == 'java'
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
314
|
+
matches = if RUBY_PLATFORM == 'java'
|
|
315
|
+
scan_bins_parallel(chrom_id, bin_intervals, filters)
|
|
316
|
+
else
|
|
317
|
+
scan_bins(chrom_id, bin_intervals, filters)
|
|
318
|
+
end
|
|
319
|
+
matches.sort_by! { |e| e[0] } # sort by offset in file
|
|
317
320
|
end # #fetch_list
|
|
318
321
|
|
|
319
322
|
# Scan the index for blocks matching the given bins and intervals.
|
|
@@ -344,7 +347,7 @@ module Bio
|
|
|
344
347
|
|
|
345
348
|
def scan_bins_parallel(chrom_id, bin_intervals, filters)
|
|
346
349
|
start = Time.now
|
|
347
|
-
n_threads = ENV['profile'] ? 1 :
|
|
350
|
+
n_threads = ENV['profile'] ? 1 : java.lang.Runtime.runtime.availableProcessors
|
|
348
351
|
jobs = java.util.concurrent.ConcurrentLinkedQueue.new(bin_intervals.to_a)
|
|
349
352
|
completed = java.util.concurrent.LinkedBlockingQueue.new(128)
|
|
350
353
|
threads = []
|
|
@@ -445,7 +448,8 @@ module Bio
|
|
|
445
448
|
|
|
446
449
|
def build_default(parser)
|
|
447
450
|
first_block = parser.parse_block
|
|
448
|
-
ref_seq = first_block.sequences.first.source
|
|
451
|
+
self.ref_seq = first_block.sequences.first.source
|
|
452
|
+
db[REF_SEQ_KEY] = ref_seq
|
|
449
453
|
db[FORMAT_VERSION_KEY] = FORMAT_VERSION
|
|
450
454
|
@index_sequences = { ref_seq => 0 }
|
|
451
455
|
store_index_sequences!
|
|
@@ -521,6 +525,9 @@ module Bio
|
|
|
521
525
|
end
|
|
522
526
|
|
|
523
527
|
def entries_for(block)
|
|
528
|
+
unless block.ref_seq.source == @ref_seq
|
|
529
|
+
raise "Inconsistent reference sequence: expected #{@ref_seq}, got #{block.ref_seq.source}"
|
|
530
|
+
end
|
|
524
531
|
h = {}
|
|
525
532
|
val = build_block_value(block)
|
|
526
533
|
block.sequences.each do |seq|
|
data/lib/bio/maf/maf.rb
ADDED
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
module Bio
|
|
2
|
+
module MAF
|
|
3
|
+
|
|
4
|
+
# Header of a MAF file: holds the variable-value pairs taken from the
# first (##maf) line together with the alignment parameters.
# @api public
class Header
  # vars: variable-value pairs from the ##maf line (Hash).
  # alignment_params: alignment parameters from the MAF header (Hash).
  attr_accessor :vars, :alignment_params

  # @param vars [Hash] variable-value pairs from the ##maf line
  # @param params [Hash] alignment parameters
  def initialize(vars, params)
    @vars, @alignment_params = vars, params
  end

  # Value stored under :version, the required version parameter.
  # @return [String]
  def version
    self.vars[:version]
  end

  # Value stored under :scoring, the optional scoring parameter;
  # whatever the vars lookup yields when it is absent.
  # @return [String]
  def scoring
    self.vars[:scoring]
  end
end
|
|
33
|
+
|
|
34
|
+
# A MAF alignment block.
# @api public
class Block
  # Parameters from the 'a' line starting the alignment block.
  attr_reader :vars
  # Sequences, one per 's' or 'e' line.
  # @return [Array<Sequence>]
  attr_reader :sequences
  # Offset of the alignment block within the MAF file, in bytes.
  # @return [Integer]
  attr_reader :offset
  # Size of the alignment block within the MAF file, in bytes.
  # @return [Integer]
  attr_reader :size

  # @param vars parameters from the 'a' line
  # @param sequences [Array<Sequence>] sequences of the block
  # @param offset [Integer] offset of the block within the file
  # @param size [Integer] size of the block within the file
  # @param filtered [Boolean] value reported by {#filtered?}
  def initialize(vars, sequences, offset, size, filtered)
    @vars = vars
    @sequences = sequences
    @offset = offset
    @size = size
    @filtered = filtered
  end

  # The first sequence of the block.
  def ref_seq
    sequences[0]
  end

  # Sequence at index i.
  # @raise [IndexError] if i is out of bounds (Array#fetch)
  def raw_seq(i)
    sequences.fetch(i)
  end

  # Yields each sequence of the block in order.
  def each_raw_seq
    sequences.each { |s| yield s }
  end

  # Text size of the alignment block. This is the number of text
  # characters in each line of sequence data, including dashes and
  # other gaps in the sequence.
  def text_size
    sequences.first.text.size
  end

  # Whether this block has been modified by a parser filter.
  # @return [Boolean]
  def filtered?
    @filtered
  end

  GAP = /-+/

  # Find gaps present in all sequences. These would generally
  # occur when some sequences have been filtered out.
  #
  # Sequences reporting empty? are ignored. A block whose first
  # sequence is the only non-empty one reports every gap run of that
  # sequence.
  #
  # @return [Array<Array(Integer, Integer)>] [offset, length] pairs
  #   into the sequence text, in ascending, non-overlapping order
  # @see #remove_gaps!
  # @see Parser#sequence_filter
  def find_gaps
    ref_s = StringScanner.new(sequences.first.text)
    others = sequences.slice(1, sequences.size - 1).
      reject { |s| s.empty? }.
      collect { |s| StringScanner.new(s.text) }
    gaps = []
    while ref_s.scan_until(GAP)
      run_end = ref_s.pos
      pos = run_end - ref_s.matched_size
      # Walk the whole reference gap run rather than testing only its
      # first column: another sequence may interrupt the shared gap
      # and resume it further along the same run.
      while pos < run_end
        others.each { |s| s.pos = pos }
        if others.all? { |s| s.scan(GAP) }
          # The shared gap is bounded by the shortest gap starting at
          # pos, including what is left of the reference run. Seeding
          # with the reference length keeps this well-defined when
          # there are no other sequences.
          len = others.map(&:matched_size).push(run_end - pos).min
          gaps << [pos, len]
          pos += len
        else
          pos += 1
        end
      end
    end
    gaps
  end

  # Remove gaps present in all sequences. These would generally
  # occur when some sequences have been filtered out.
  #
  # @return [Integer] number of gaps removed
  # @see #find_gaps
  # @see Parser#sequence_filter
  def remove_gaps!
    gaps = find_gaps()
    # delete from the end so earlier offsets remain valid
    gaps.reverse_each do |offset, len|
      sequences.each do |seq|
        seq.delete_text(offset, len)
      end
    end
    gaps.size
  end

end
|
|
120
|
+
|
|
121
|
+
# One sequence line of an alignment block.
# @api public
class Sequence
  # source:   [String] source sequence name.
  # start:    [Integer] zero-based start position.
  # size:     [Integer] size of aligning region in source sequence.
  # strand:   [Symbol] :+ or :-, the strand the alignment is to.
  # src_size: [Integer] size of the entire source sequence, not just
  #           the aligning region.
  # text:     [String] sequence data for the alignment, including
  #           insertions.
  attr_reader :source, :start, :size, :strand, :src_size, :text
  # i_data:  [Array<String>] raw synteny information from 'i' line.
  # quality: [String] quality string from 'q' line.
  attr_accessor :i_data, :quality
  alias_method :source_size, :src_size

  # Positional arguments, in order: source, start, size, strand,
  # src_size, text. Missing ones are left nil; extras are ignored.
  def initialize(*args)
    @source   = args[0]
    @start    = args[1]
    @size     = args[2]
    @strand   = args[3]
    @src_size = args[4]
    @text     = args[5]
  end

  # Position just past the aligning region (start plus size).
  def end
    start + size
  end

  # Whether this sequence is empty. Always false here; the
  # {EmptySequence} subclass used for 'e' lines overrides it.
  def empty?
    false
  end

  # True when the text length differs from the aligned size.
  def gapped?
    text.size != size
  end

  # Portion of the source name before the first '.', or nil when the
  # name has no '.'.
  def species
    prefix, remainder = source.split('.', 2)
    remainder.nil? ? nil : prefix
  end

  # Deletes len characters at offset from the text and, when a quality
  # string is present, from that as well. Does nothing for empty
  # sequences.
  def delete_text(offset, len)
    return if empty?
    text.slice!(offset, len)
    quality.slice!(offset, len) if quality
  end

  # Hands the writer a "source:start-end" label and the text.
  def write_fasta(writer)
    label = "#{source}:#{start}-#{start + size}"
    writer.write(label, text)
  end

  # Maps the given zero-based genomic range onto a range of string
  # offsets, suitable for extracting the text for the given range
  # from #text.
  #
  # @param range [Range] genomic range; inclusive or exclusive end
  # @return [Range] end-exclusive range of offsets into #text
  # @raise [RuntimeError] if the range lies outside the sequence or
  #   cannot be located in the text
  # @see String#slice
  def text_range(range)
    r_begin = range.begin
    r_end = range.exclude_end? ? range.end : range.end + 1

    # special case: the range covers the entire sequence
    return 0...text.size if r_begin == start && (r_end - r_begin) == size

    if r_begin < start || r_end > self.end
      raise "Range #{range} outside sequence bounds; start #{start}, size #{size}"
    end

    # without gaps, genomic and text offsets differ only by start
    return (r_begin - start)...(r_end - start) unless gapped?

    # With gaps: walk alternating runs of sequence text and dashes.
    # Only sequence text advances the genomic position; both kinds
    # advance the text position.
    g_pos = start
    t_pos = 0
    m_begin = nil
    text.scan(/(\w+|-+)/).each do |(chunk)|
      unless chunk[0] == '-'
        g_next = g_pos + chunk.size
        if r_begin >= g_pos && r_begin < g_next
          m_begin = t_pos + (r_begin - g_pos)
        end
        if r_end >= g_pos && r_end <= g_next
          raise "reached end before start!" unless m_begin
          return m_begin...(t_pos + (r_end - g_pos))
        end
        g_pos = g_next
      end
      t_pos += chunk.size
    end
    raise "no match found!"
  end
end
|
|
237
|
+
|
|
238
|
+
# Sequence record for an 'e' line, which carries no alignment text.
#
# Per the MAF spec, such a line indicates that "there isn't aligning
# DNA for a species but that the current block is bridged by a chain
# that connects blocks before and after this block".
# @api public
class EmptySequence < Sequence
  # Sixth positional constructor argument.
  attr_reader :status

  # The first five arguments go to {Sequence#initialize}; the sixth is
  # kept as the status instead of being treated as text.
  def initialize(*args)
    super(*args.first(5))
    @status = args[5]
  end

  # Empty sequences have no text; always an empty string.
  def text
    ''
  end

  # Always true for 'e' line records.
  def empty?
    true
  end

  # FASTA output is not supported for empty sequences.
  # @raise [RuntimeError] always
  def write_fasta(writer)
    raise "empty sequence output not implemented!"
  end
end
|
|
264
|
+
|
|
265
|
+
end
|
|
266
|
+
|
|
267
|
+
end
|