bio-maf 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +53 -0
- data/DEVELOPMENT.md +29 -0
- data/Gemfile +1 -0
- data/README.md +69 -1
- data/Rakefile +4 -3
- data/bin/find_overlaps +21 -0
- data/bin/maf_tile +103 -0
- data/bio-maf.gemspec +43 -0
- data/features/gap-filling.feature +158 -0
- data/features/gap-removal.feature +50 -0
- data/features/step_definitions/gap-filling_steps.rb +32 -0
- data/features/step_definitions/gap_removal_steps.rb +19 -0
- data/features/step_definitions/parse_steps.rb +2 -1
- data/lib/bio/maf/index.rb +15 -8
- data/lib/bio/maf/maf.rb +267 -0
- data/lib/bio/maf/parser.rb +115 -175
- data/lib/bio/maf/tiler.rb +167 -0
- data/lib/bio/maf.rb +2 -0
- data/man/maf_tile.1 +108 -0
- data/man/maf_tile.1.ronn +104 -0
- data/spec/bio/maf/index_spec.rb +1 -0
- data/spec/bio/maf/parser_spec.rb +103 -0
- data/spec/bio/maf/tiler_spec.rb +69 -0
- data/test/data/gap-sp1.fa +6 -0
- data/test/data/mm8_chr7_tiny.kct +0 -0
- metadata +58 -3
data/lib/bio/maf/index.rb
CHANGED
@@ -65,10 +65,11 @@ module Bio
|
|
65
65
|
include KVHelpers
|
66
66
|
|
67
67
|
attr_reader :db, :species, :species_max_id
|
68
|
-
attr_accessor :index_sequences
|
68
|
+
attr_accessor :index_sequences, :ref_seq
|
69
69
|
|
70
70
|
FORMAT_VERSION_KEY = 'bio-maf:index-format-version'
|
71
71
|
FORMAT_VERSION = 2
|
72
|
+
REF_SEQ_KEY = 'bio-maf:reference-sequence'
|
72
73
|
MAX_SPECIES = 64
|
73
74
|
|
74
75
|
## Key-value store index format
|
@@ -221,6 +222,7 @@ module Bio
|
|
221
222
|
raise "Could not open DB file!"
|
222
223
|
end
|
223
224
|
if mode == KyotoCabinet::DB::OREADER
|
225
|
+
self.ref_seq = db[REF_SEQ_KEY]
|
224
226
|
load_index_sequences
|
225
227
|
load_species
|
226
228
|
end
|
@@ -309,11 +311,12 @@ module Bio
|
|
309
311
|
end
|
310
312
|
ready = Time.now
|
311
313
|
$stderr.puts "bin intervals computed after #{ready - start} seconds."
|
312
|
-
if RUBY_PLATFORM == 'java'
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
314
|
+
matches = if RUBY_PLATFORM == 'java'
|
315
|
+
scan_bins_parallel(chrom_id, bin_intervals, filters)
|
316
|
+
else
|
317
|
+
scan_bins(chrom_id, bin_intervals, filters)
|
318
|
+
end
|
319
|
+
matches.sort_by! { |e| e[0] } # sort by offset in file
|
317
320
|
end # #fetch_list
|
318
321
|
|
319
322
|
# Scan the index for blocks matching the given bins and intervals.
|
@@ -344,7 +347,7 @@ module Bio
|
|
344
347
|
|
345
348
|
def scan_bins_parallel(chrom_id, bin_intervals, filters)
|
346
349
|
start = Time.now
|
347
|
-
n_threads = ENV['profile'] ? 1 :
|
350
|
+
n_threads = ENV['profile'] ? 1 : java.lang.Runtime.runtime.availableProcessors
|
348
351
|
jobs = java.util.concurrent.ConcurrentLinkedQueue.new(bin_intervals.to_a)
|
349
352
|
completed = java.util.concurrent.LinkedBlockingQueue.new(128)
|
350
353
|
threads = []
|
@@ -445,7 +448,8 @@ module Bio
|
|
445
448
|
|
446
449
|
def build_default(parser)
|
447
450
|
first_block = parser.parse_block
|
448
|
-
ref_seq = first_block.sequences.first.source
|
451
|
+
self.ref_seq = first_block.sequences.first.source
|
452
|
+
db[REF_SEQ_KEY] = ref_seq
|
449
453
|
db[FORMAT_VERSION_KEY] = FORMAT_VERSION
|
450
454
|
@index_sequences = { ref_seq => 0 }
|
451
455
|
store_index_sequences!
|
@@ -521,6 +525,9 @@ module Bio
|
|
521
525
|
end
|
522
526
|
|
523
527
|
def entries_for(block)
|
528
|
+
unless block.ref_seq.source == @ref_seq
|
529
|
+
raise "Inconsistent reference sequence: expected #{@ref_seq}, got #{block.ref_seq.source}"
|
530
|
+
end
|
524
531
|
h = {}
|
525
532
|
val = build_block_value(block)
|
526
533
|
block.sequences.each do |seq|
|
data/lib/bio/maf/maf.rb
ADDED
@@ -0,0 +1,267 @@
|
|
1
|
+
module Bio
|
2
|
+
module MAF
|
3
|
+
|
4
|
+
# Header of a MAF file. Bundles the variable-value pairs given on the
# leading ##maf line together with the alignment parameters.
# @api public
class Header
  # @return [Hash] variable-value pairs taken from the ##maf line
  attr_accessor :vars
  # @return [Hash] alignment parameters from the MAF header
  attr_accessor :alignment_params

  # @param vars [Hash] variable-value pairs; {#version} and {#scoring}
  #   look up the :version and :scoring keys in it
  # @param params [Hash] alignment parameters
  def initialize(vars, params)
    @vars, @alignment_params = vars, params
  end

  # @return [String] the required version parameter
  def version
    vars[:version]
  end

  # @return [String] the optional scoring parameter, if present
  def scoring
    vars[:scoring]
  end
end
|
33
|
+
|
34
|
+
# A MAF alignment block.
# @api public
class Block
  # Parameters from the 'a' line starting the alignment block.
  attr_reader :vars
  # Sequences, one per 's' or 'e' line.
  # @return [Array<Sequence>]
  attr_reader :sequences
  # Offset of the alignment block within the MAF file, in bytes.
  # @return [Integer]
  attr_reader :offset
  # Size of the alignment block within the MAF file, in bytes.
  # @return [Integer]
  attr_reader :size

  # @param vars parameters from the 'a' line
  # @param sequences [Array<Sequence>] sequences of the block; the first
  #   one is treated as the reference sequence
  # @param offset [Integer] byte offset of the block in the MAF file
  # @param size [Integer] byte size of the block in the MAF file
  # @param filtered [Boolean] whether a parser filter modified the block
  def initialize(vars, sequences, offset, size, filtered)
    @vars = vars
    @sequences = sequences
    @offset = offset
    @size = size
    @filtered = filtered
  end

  # The reference sequence, i.e. the first sequence of the block.
  def ref_seq
    sequences[0]
  end

  # The sequence at index i.
  # @raise [IndexError] if i is out of bounds
  def raw_seq(i)
    sequences.fetch(i)
  end

  # Yields each sequence of the block in order. Returns an Enumerator
  # when called without a block.
  def each_raw_seq(&block)
    sequences.each(&block)
  end

  # Text size of the alignment block. This is the number of text
  # characters in each line of sequence data, including dashes and
  # other gaps in the sequence.
  def text_size
    sequences.first.text.size
  end

  # Whether this block has been modified by a parser filter.
  # @return [Boolean]
  def filtered?
    @filtered
  end

  # A run of one or more gap characters.
  GAP = /-+/

  # Find gaps present in all sequences. These would generally occur
  # when some sequences have been filtered out.
  #
  # Every maximal run of columns in which the reference sequence and
  # all other non-empty sequences contain '-' is reported, including
  # runs that start part-way through a longer gap of the reference
  # sequence. Sequences for which #empty? is true are ignored. A block
  # without any other non-empty sequences reports every gap run of the
  # reference sequence.
  #
  # @return [Array<Array(Integer, Integer)>] [offset, length] pairs in
  #   text coordinates, in ascending offset order
  # @see #remove_gaps!
  # @see Parser#sequence_filter
  def find_gaps
    ref_text = sequences.first.text
    other_texts = sequences.drop(1).reject(&:empty?).map(&:text)
    gaps = []
    search_from = 0
    # Only columns inside a gap run of the reference can be common
    # gaps, so walk the reference's gap runs and check the others there.
    while (run_start = ref_text.index(GAP, search_from))
      run_end = Regexp.last_match.end(0)
      gap_start = nil
      (run_start...run_end).each do |col|
        if other_texts.all? { |t| t[col] == '-' }
          gap_start ||= col
        elsif gap_start
          gaps << [gap_start, col - gap_start]
          gap_start = nil
        end
      end
      gaps << [gap_start, run_end - gap_start] if gap_start
      search_from = run_end
    end
    gaps
  end

  # Remove gaps present in all sequences. These would generally
  # occur when some sequences have been filtered out.
  #
  # @return [Integer] number of gap runs removed
  # @see #find_gaps
  # @see Parser#sequence_filter
  def remove_gaps!
    gaps = find_gaps
    # Delete from the last gap to the first so that the offsets of the
    # gaps still to be removed stay valid.
    gaps.reverse_each do |offset, len|
      sequences.each do |seq|
        seq.delete_text(offset, len)
      end
    end
    gaps.size
  end
end
|
120
|
+
|
121
|
+
# One aligned sequence of an alignment block.
# @api public
class Sequence
  # @return [String] Source sequence name.
  attr_reader :source
  # @return [Integer] Zero-based start position.
  attr_reader :start
  # @return [Integer] Size of aligning region in source sequence.
  attr_reader :size
  # :+ or :-, indicating which strand the alignment is to.
  # @return [Symbol]
  attr_reader :strand
  # Size of the entire source sequence, not just the aligning
  # region.
  # @return [Integer]
  attr_reader :src_size
  # Sequence data for the alignment, including insertions.
  # @return [String]
  attr_reader :text
  # Array of raw synteny information from 'i' line.
  # @return [Array<String>]
  attr_accessor :i_data
  # Quality string from 'q' line.
  # @return [String]
  attr_accessor :quality
  alias_method :source_size, :src_size

  # Takes, in order: source, start, size, strand, src_size, text.
  # Missing trailing arguments leave the attribute nil.
  def initialize(*args)
    @source = args[0]
    @start = args[1]
    @size = args[2]
    @strand = args[3]
    @src_size = args[4]
    @text = args[5]
  end

  # Position just past the aligning region: start plus size.
  def end
    start + size
  end

  # Whether this sequence is empty. Only true for {EmptySequence}
  # instances from 'e' lines.
  def empty?
    false
  end

  # Whether the text length differs from the size of the aligning
  # region.
  def gapped?
    text.size != size
  end

  # The part of the source name before the first '.', or nil when
  # the source name contains no '.'.
  def species
    prefix, remainder = source.split('.', 2)
    remainder ? prefix : nil
  end

  # Deletes len characters at offset from the text and, when a
  # quality string is present, from the quality string as well.
  # Does nothing if the sequence is empty.
  def delete_text(offset, len)
    return if empty?
    text.slice!(offset, len)
    quality.slice!(offset, len) if quality
  end

  # Passes a "source:start-end" description and the text to
  # writer.write.
  def write_fasta(writer)
    description = "#{source}:#{start}-#{start + size}"
    writer.write(description, text)
  end

  # Maps the given zero-based genomic range onto a range of string
  # offsets, suitable for extracting the text for the given range
  # from #text.
  #
  # Raises a RuntimeError if the range lies outside the sequence or
  # cannot be located in the text.
  #
  # @see String#slice
  def text_range(range)
    r_begin = range.begin
    # normalize to an exclusive end
    r_end = range.exclude_end? ? range.end : range.end + 1

    # special case, entire text
    return 0...text.size if r_begin == start && r_end - r_begin == size

    if r_begin < start || r_end > self.end
      raise "Range #{range} outside sequence bounds; start #{start}, size #{size}"
    end

    # no gaps, can map indexes directly
    return (r_begin - start)...(r_end - start) unless gapped?

    # gaps present: walk alternating runs of sequence text and gaps,
    # tracking the genomic and the text position of each run
    genome_pos = start
    text_pos = 0
    from = nil
    text.scan(/(\w+|-+)/) do |captures|
      chunk = captures.first
      unless chunk[0] == '-'
        # sequence text; gap runs only advance the text position
        chunk_end = genome_pos + chunk.size
        if genome_pos <= r_begin && r_begin < chunk_end
          from = text_pos + (r_begin - genome_pos)
        end
        if genome_pos <= r_end && r_end <= chunk_end
          raise "reached end before start!" unless from
          return from...(text_pos + (r_end - genome_pos))
        end
        genome_pos = chunk_end
      end
      text_pos += chunk.size
    end
    raise "no match found!"
  end
end
|
237
|
+
|
238
|
+
# Sequence record built from an 'e' line, carrying no sequence text.
#
# This indicates that "there isn't aligning DNA for a species but
# that the current block is bridged by a chain that connects
# blocks before and after this block" (MAF spec).
# @api public
class EmptySequence < Sequence
  # The sixth constructor argument.
  attr_reader :status

  # The first five arguments are handed to Sequence#initialize; the
  # sixth is kept as the status.
  def initialize(*args)
    super(*args.first(5))
    @status = args[5]
  end

  # Always an empty string.
  def text
    ''
  end

  # Always true for this class.
  def empty?
    true
  end

  # Not supported for empty sequences; always raises.
  def write_fasta(writer)
    raise "empty sequence output not implemented!"
  end
end
|
264
|
+
|
265
|
+
end
|
266
|
+
|
267
|
+
end
|