bio-maf 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +53 -0
- data/DEVELOPMENT.md +29 -0
- data/Gemfile +1 -0
- data/README.md +69 -1
- data/Rakefile +4 -3
- data/bin/find_overlaps +21 -0
- data/bin/maf_tile +103 -0
- data/bio-maf.gemspec +43 -0
- data/features/gap-filling.feature +158 -0
- data/features/gap-removal.feature +50 -0
- data/features/step_definitions/gap-filling_steps.rb +32 -0
- data/features/step_definitions/gap_removal_steps.rb +19 -0
- data/features/step_definitions/parse_steps.rb +2 -1
- data/lib/bio/maf/index.rb +15 -8
- data/lib/bio/maf/maf.rb +267 -0
- data/lib/bio/maf/parser.rb +115 -175
- data/lib/bio/maf/tiler.rb +167 -0
- data/lib/bio/maf.rb +2 -0
- data/man/maf_tile.1 +108 -0
- data/man/maf_tile.1.ronn +104 -0
- data/spec/bio/maf/index_spec.rb +1 -0
- data/spec/bio/maf/parser_spec.rb +103 -0
- data/spec/bio/maf/tiler_spec.rb +69 -0
- data/test/data/gap-sp1.fa +6 -0
- data/test/data/mm8_chr7_tiny.kct +0 -0
- metadata +58 -3
data/lib/bio/maf/index.rb
CHANGED
@@ -65,10 +65,11 @@ module Bio
|
|
65
65
|
include KVHelpers
|
66
66
|
|
67
67
|
attr_reader :db, :species, :species_max_id
|
68
|
-
attr_accessor :index_sequences
|
68
|
+
attr_accessor :index_sequences, :ref_seq
|
69
69
|
|
70
70
|
FORMAT_VERSION_KEY = 'bio-maf:index-format-version'
|
71
71
|
FORMAT_VERSION = 2
|
72
|
+
REF_SEQ_KEY = 'bio-maf:reference-sequence'
|
72
73
|
MAX_SPECIES = 64
|
73
74
|
|
74
75
|
## Key-value store index format
|
@@ -221,6 +222,7 @@ module Bio
|
|
221
222
|
raise "Could not open DB file!"
|
222
223
|
end
|
223
224
|
if mode == KyotoCabinet::DB::OREADER
|
225
|
+
self.ref_seq = db[REF_SEQ_KEY]
|
224
226
|
load_index_sequences
|
225
227
|
load_species
|
226
228
|
end
|
@@ -309,11 +311,12 @@ module Bio
|
|
309
311
|
end
|
310
312
|
ready = Time.now
|
311
313
|
$stderr.puts "bin intervals computed after #{ready - start} seconds."
|
312
|
-
if RUBY_PLATFORM == 'java'
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
314
|
+
matches = if RUBY_PLATFORM == 'java'
|
315
|
+
scan_bins_parallel(chrom_id, bin_intervals, filters)
|
316
|
+
else
|
317
|
+
scan_bins(chrom_id, bin_intervals, filters)
|
318
|
+
end
|
319
|
+
matches.sort_by! { |e| e[0] } # sort by offset in file
|
317
320
|
end # #fetch_list
|
318
321
|
|
319
322
|
# Scan the index for blocks matching the given bins and intervals.
|
@@ -344,7 +347,7 @@ module Bio
|
|
344
347
|
|
345
348
|
def scan_bins_parallel(chrom_id, bin_intervals, filters)
|
346
349
|
start = Time.now
|
347
|
-
n_threads = ENV['profile'] ? 1 :
|
350
|
+
n_threads = ENV['profile'] ? 1 : java.lang.Runtime.runtime.availableProcessors
|
348
351
|
jobs = java.util.concurrent.ConcurrentLinkedQueue.new(bin_intervals.to_a)
|
349
352
|
completed = java.util.concurrent.LinkedBlockingQueue.new(128)
|
350
353
|
threads = []
|
@@ -445,7 +448,8 @@ module Bio
|
|
445
448
|
|
446
449
|
def build_default(parser)
|
447
450
|
first_block = parser.parse_block
|
448
|
-
ref_seq = first_block.sequences.first.source
|
451
|
+
self.ref_seq = first_block.sequences.first.source
|
452
|
+
db[REF_SEQ_KEY] = ref_seq
|
449
453
|
db[FORMAT_VERSION_KEY] = FORMAT_VERSION
|
450
454
|
@index_sequences = { ref_seq => 0 }
|
451
455
|
store_index_sequences!
|
@@ -521,6 +525,9 @@ module Bio
|
|
521
525
|
end
|
522
526
|
|
523
527
|
def entries_for(block)
|
528
|
+
unless block.ref_seq.source == @ref_seq
|
529
|
+
raise "Inconsistent reference sequence: expected #{@ref_seq}, got #{block.ref_seq.source}"
|
530
|
+
end
|
524
531
|
h = {}
|
525
532
|
val = build_block_value(block)
|
526
533
|
block.sequences.each do |seq|
|
data/lib/bio/maf/maf.rb
ADDED
@@ -0,0 +1,267 @@
|
|
1
|
+
module Bio
|
2
|
+
module MAF
|
3
|
+
|
4
|
+
# A MAF header, containing the variable-value pairs from the first
|
5
|
+
# line of the file as well as the alignment parameters.
|
6
|
+
# @api public
|
7
|
+
class Header
  # Variable-value pairs taken from the ##maf line.
  # @return [Hash]
  attr_accessor :vars

  # Alignment parameters taken from the MAF header.
  # @return [Hash]
  attr_accessor :alignment_params

  # @param vars [Hash] variable-value pairs from the ##maf line
  # @param params [Hash] alignment parameters
  def initialize(vars, params)
    @vars, @alignment_params = vars, params
  end

  # Looks up the required +:version+ entry in {#vars}.
  # @return [String]
  def version
    vars[:version]
  end

  # Looks up the optional +:scoring+ entry in {#vars}; nil when the
  # key is absent.
  # @return [String]
  def scoring
    vars[:scoring]
  end
end
|
33
|
+
|
34
|
+
# A MAF alignment block.
|
35
|
+
# @api public
|
36
|
+
class Block
  # Parameters from the 'a' line starting the alignment block.
  attr_reader :vars
  # Sequences, one per 's' or 'e' line.
  # @return [Array<Sequence>]
  attr_reader :sequences
  # Offset of the alignment block within the MAF file, in bytes.
  # @return [Integer]
  attr_reader :offset
  # Size of the alignment block within the MAF file, in bytes.
  # @return [Integer]
  attr_reader :size

  # @param vars parameters from the 'a' line
  # @param sequences [Array<Sequence>] sequences of the block; the
  #   first one is treated as the reference sequence
  # @param offset [Integer] byte offset of the block in the MAF file
  # @param size [Integer] byte size of the block in the MAF file
  # @param filtered [Boolean] whether a parser filter modified the block
  def initialize(vars, sequences, offset, size, filtered)
    @vars = vars
    @sequences = sequences
    @offset = offset
    @size = size
    @filtered = filtered
  end

  # The reference sequence, i.e. the first sequence of the block.
  def ref_seq
    sequences[0]
  end

  # The sequence at index +i+.
  # @raise [IndexError] if +i+ is out of bounds (Array#fetch)
  def raw_seq(i)
    sequences.fetch(i)
  end

  # Yields each sequence of the block in order.
  def each_raw_seq
    sequences.each { |s| yield s }
  end

  # Text size of the alignment block. This is the number of text
  # characters in each line of sequence data, including dashes and
  # other gaps in the sequence.
  def text_size
    sequences.first.text.size
  end

  # Whether this block has been modified by a parser filter.
  # @return [Boolean]
  def filtered?
    @filtered
  end

  GAP = /-+/

  # Find gaps present in all sequences. These would generally
  # occur when some sequences have been filtered out.
  #
  # Every gap run of the reference (first) sequence is compared with
  # the text of all other non-empty sequences at the same text
  # offset. When all of them have a gap starting there, the shortest
  # of those runs is reported.
  #
  # NOTE(review): only the run starting at the beginning of each
  # reference gap is examined; all-gap columns further inside a
  # longer reference gap are not reported.
  #
  # @return [Array<Array(Integer, Integer)>] [text offset, length] pairs
  # @see #remove_gaps!
  # @see Parser#sequence_filter
  def find_gaps
    ref_s = StringScanner.new(sequences.first.text)
    others = sequences.drop(1).reject { |s| s.empty? }.map { |s| StringScanner.new(s.text) }
    gaps = []
    while ref_s.scan_until(GAP)
      gap_start = ref_s.pos - ref_s.matched_size
      others.each { |s| s.pos = gap_start }
      next unless others.all? { |s| s.scan(GAP) }
      # Folding the reference run into the list keeps #min well-defined
      # when there are no other non-empty sequences to compare against.
      run_sizes = [ref_s.matched_size] + others.map { |s| s.matched_size }
      gaps << [gap_start, run_sizes.min]
    end
    gaps
  end

  # Remove gaps present in all sequences. These would generally
  # occur when some sequences have been filtered out.
  #
  # Gaps are deleted from last to first so that earlier text offsets
  # stay valid while the sequence text is being shortened.
  #
  # @return [Integer] number of gap runs removed
  # @see #find_gaps
  # @see Parser#sequence_filter
  def remove_gaps!
    gaps = find_gaps
    gaps.reverse_each do |gap_start, len|
      sequences.each do |seq|
        seq.delete_text(gap_start, len)
      end
    end
    gaps.size
  end
end
|
120
|
+
|
121
|
+
# A sequence within an alignment block.
|
122
|
+
# @api public
|
123
|
+
class Sequence
  # @return [String] Source sequence name.
  attr_reader :source
  # @return [Integer] Zero-based start position.
  attr_reader :start
  # @return [Integer] Size of aligning region in source sequence.
  attr_reader :size
  # :+ or :-, indicating which strand the alignment is to.
  # @return [Symbol]
  attr_reader :strand
  # Size of the entire source sequence, not just the aligning
  # region.
  # @return [Integer]
  attr_reader :src_size
  # Sequence data for the alignment, including insertions.
  # @return [String]
  attr_reader :text
  # Array of raw synteny information from 'i' line.
  # @return [Array<String>]
  attr_accessor :i_data
  # Quality string from 'q' line.
  # @return [String]
  attr_accessor :quality
  alias_method :source_size, :src_size

  # Positional arguments, in order: source, start, size, strand,
  # src_size, text. Missing trailing arguments are left nil.
  def initialize(*args)
    @source   = args[0]
    @start    = args[1]
    @size     = args[2]
    @strand   = args[3]
    @src_size = args[4]
    @text     = args[5]
  end

  # Position just past the aligning region (start plus size).
  def end
    start + size
  end

  # Whether this sequence is empty. Only true for {EmptySequence}
  # instances from 'e' lines.
  def empty?
    false
  end

  # True when the alignment text length differs from the size of the
  # aligning region.
  def gapped?
    text.size != size
  end

  # The part of the source name before the first '.', or nil when
  # the source name contains no '.'.
  def species
    name, remainder = source.split('.', 2)
    name if remainder
  end

  # Deletes +len+ characters at text offset +offset+ from the text
  # and, when present, from the quality string. Does nothing for
  # empty sequences.
  def delete_text(offset, len)
    return if empty?
    text.slice!(offset, len)
    quality.slice!(offset, len) if quality
  end

  # Passes a "source:start-end" header and the alignment text to
  # +writer.write+.
  def write_fasta(writer)
    header = "#{source}:#{start}-#{start + size}"
    writer.write(header, text)
  end

  # Maps the given zero-based genomic range onto a range of string
  # offsets, suitable for extracting the text for the given range
  # from #text.
  #
  # @see String#slice
  def text_range(range)
    first = range.begin
    stop = range.exclude_end? ? range.end : range.end + 1

    # Whole aligning region requested: the answer is the entire text.
    return 0...text.size if first == start && stop - first == size

    if first < start || stop > self.end
      raise "Range #{range} outside sequence bounds; start #{start}, size #{size}"
    end

    # Without gaps, genomic and text coordinates differ only by +start+.
    return (first - start)...(stop - start) unless gapped?

    # With gaps, walk the alternating runs of sequence text and dashes,
    # tracking the genomic position and the text position of each run.
    genome_pos = start
    text_pos = 0
    from = nil
    text.scan(/(\w+|-+)/) do |captures|
      run = captures.first
      unless run[0] == '-'
        run_end = genome_pos + run.size
        if genome_pos <= first && first < run_end
          from = (first - genome_pos) + text_pos
        end
        if genome_pos <= stop && stop <= run_end
          raise "reached end before start!" unless from
          return from...((stop - genome_pos) + text_pos)
        end
        genome_pos = run_end
      end
      text_pos += run.size
    end
    raise "no match found!"
  end
end
|
237
|
+
|
238
|
+
# An empty sequence record from an 'e' line.
|
239
|
+
#
|
240
|
+
# This indicates that "there isn't aligning DNA for a species but
|
241
|
+
# that the current block is bridged by a chain that connects
|
242
|
+
# blocks before and after this block" (MAF spec).
|
243
|
+
# @api public
|
244
|
+
class EmptySequence < Sequence
  # Sixth constructor argument, stored as given.
  attr_reader :status

  # The first five positional arguments are forwarded to
  # Sequence#initialize; the sixth is kept as the status.
  def initialize(*args)
    leading = args.first(5)
    super(*leading)
    @status = args[5]
  end

  # Empty sequences carry no alignment text.
  # @return [String] always an empty string
  def text
    ''
  end

  # Always true for empty sequences.
  def empty?
    true
  end

  # FASTA output is not supported for empty sequences.
  # @raise [RuntimeError] always
  def write_fasta(writer)
    raise "empty sequence output not implemented!"
  end
end
|
264
|
+
|
265
|
+
end
|
266
|
+
|
267
|
+
end
|