bio-maf 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.simplecov +1 -0
- data/.travis.yml +16 -0
- data/.yardopts +3 -0
- data/DEVELOPMENT.md +40 -0
- data/Gemfile +23 -0
- data/LICENSE.txt +20 -0
- data/README.md +209 -0
- data/Rakefile +76 -0
- data/VERSION +1 -0
- data/benchmarks/dispatch_bench +53 -0
- data/benchmarks/iter_bench +44 -0
- data/benchmarks/read_bench +40 -0
- data/benchmarks/sort_bench +33 -0
- data/benchmarks/split_bench +33 -0
- data/bin/maf_count +82 -0
- data/bin/maf_dump_blocks +27 -0
- data/bin/maf_extract_ranges_count +44 -0
- data/bin/maf_index +88 -0
- data/bin/maf_parse_bench +94 -0
- data/bin/maf_to_fasta +68 -0
- data/bin/maf_write +84 -0
- data/bin/random_ranges +35 -0
- data/features/maf-indexing.feature +31 -0
- data/features/maf-output.feature +29 -0
- data/features/maf-parsing.feature +44 -0
- data/features/maf-querying.feature +75 -0
- data/features/maf-to-fasta.feature +50 -0
- data/features/step_definitions/convert_steps.rb +45 -0
- data/features/step_definitions/index_steps.rb +20 -0
- data/features/step_definitions/output_steps.rb +27 -0
- data/features/step_definitions/parse_steps.rb +63 -0
- data/features/step_definitions/query_steps.rb +31 -0
- data/features/step_definitions/ucsc_bin_steps.rb +14 -0
- data/features/support/env.rb +16 -0
- data/features/ucsc-bins.feature +24 -0
- data/lib/bio/maf/index.rb +620 -0
- data/lib/bio/maf/parser.rb +888 -0
- data/lib/bio/maf/struct.rb +63 -0
- data/lib/bio/maf/writer.rb +63 -0
- data/lib/bio/maf.rb +4 -0
- data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
- data/lib/bio/ucsc/ucsc_bin.rb +117 -0
- data/lib/bio/ucsc.rb +2 -0
- data/lib/bio-maf/maf.rb +3 -0
- data/lib/bio-maf.rb +12 -0
- data/man/.gitignore +1 -0
- data/man/maf_index.1 +105 -0
- data/man/maf_index.1.markdown +97 -0
- data/man/maf_index.1.ronn +83 -0
- data/man/maf_to_fasta.1 +53 -0
- data/man/maf_to_fasta.1.ronn +51 -0
- data/spec/bio/maf/index_spec.rb +363 -0
- data/spec/bio/maf/parser_spec.rb +354 -0
- data/spec/bio/maf/struct_spec.rb +75 -0
- data/spec/spec_helper.rb +14 -0
- data/test/data/big-block.maf +15999 -0
- data/test/data/chr22_ieq.maf +11 -0
- data/test/data/chrY-1block.maf +6 -0
- data/test/data/empty +0 -0
- data/test/data/empty.db +0 -0
- data/test/data/mm8_chr7_tiny.kct +0 -0
- data/test/data/mm8_chr7_tiny.maf +76 -0
- data/test/data/mm8_mod_a.maf +7 -0
- data/test/data/mm8_single.maf +13 -0
- data/test/data/mm8_subset_a.maf +23 -0
- data/test/data/t1-bad1.maf +15 -0
- data/test/data/t1.fasta +12 -0
- data/test/data/t1.maf +15 -0
- data/test/data/t1a.maf +17 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-maf.rb +7 -0
- data/travis-ci/install_kc +13 -0
- data/travis-ci/install_kc_java +13 -0
- data/travis-ci/report_errors +4 -0
- metadata +181 -0
@@ -0,0 +1,620 @@
|
|
1
|
+
require 'kyotocabinet'
|
2
|
+
require 'jruby/profiler' if RUBY_PLATFORM == 'java'
|
3
|
+
|
4
|
+
#require 'bio-ucsc-api'
|
5
|
+
require 'bio-genomic-interval'
|
6
|
+
|
7
|
+
module Bio
|
8
|
+
|
9
|
+
module MAF
|
10
|
+
|
11
|
+
# Binary record packing and unpacking.
|
12
|
+
# @api private
|
13
|
+
# Helpers for packing and unpacking the binary index records.
# Index entries are handled as [key, value] pairs of packed strings.
# @api private
module KVHelpers

  # Field layout of an index key.
  # NOTE(review): `Struct` here presumably resolves to the project's
  # Bio::MAF::Struct (not ::Struct), which provides #fmt and
  # #extractor_fmt -- confirm against struct.rb.
  KEY = Struct.new([[:marker,     :uint8],
                    [:seq_id,     :uint8],
                    [:bin,        :uint16],
                    [:seq_start,  :uint32],
                    [:seq_end,    :uint32]])

  # Field layout of an index value.
  VAL = Struct.new([[:offset,      :uint64],
                    [:length,      :uint32],
                    [:text_size,   :uint32],
                    [:n_seq,       :uint8],
                    [:species_vec, :uint64]])

  # Formats used with String#unpack / Array#pack below.
  KEY_FMT = KEY.fmt
  KEY_SCAN_FMT = KEY.extractor_fmt(:seq_id, :bin, :seq_start, :seq_end)
  CHROM_BIN_PREFIX_FMT = KEY.extractor_fmt(:marker, :seq_id, :bin)

  VAL_FMT = VAL.fmt
  VAL_IDX_OFFSET_FMT = VAL.extractor_fmt(:offset, :length)
  VAL_TEXT_SIZE_FMT = VAL.extractor_fmt(:text_size)
  VAL_N_SEQ_FMT = VAL.extractor_fmt(:n_seq)
  VAL_SPECIES_FMT = VAL.extractor_fmt(:species_vec)

  module_function

  # Species bit vector stored in the value half of an index entry.
  def extract_species_vec(entry)
    entry[1].unpack(VAL_SPECIES_FMT).first
  end

  # Sequence count stored in the value half of an index entry.
  def extract_n_sequences(entry)
    entry[1].unpack(VAL_N_SEQ_FMT).first
  end

  # Unpacks the :offset and :length fields of the entry's value and
  # returns them as an array.
  def extract_index_offset(entry)
    packed_value = entry[1]
    packed_value.unpack(VAL_IDX_OFFSET_FMT)
  end

  # Text size stored in the value half of an index entry.
  def extract_text_size(entry)
    entry[1].unpack(VAL_TEXT_SIZE_FMT).first
  end

  # Unpacks every field of a packed key string.
  def unpack_key(ks)
    ks.unpack(KEY_FMT)
  end

  # Packed key prefix (0xFF marker, sequence ID, bin) used to position
  # a cursor at the first record of a bin.
  def bin_start_prefix(chrom_id, bin)
    prefix_fields = [0xFF, chrom_id, bin]
    prefix_fields.pack(CHROM_BIN_PREFIX_FMT)
  end
end
|
63
|
+
|
64
|
+
class KyotoIndex
|
65
|
+
include KVHelpers
|
66
|
+
|
67
|
+
attr_reader :db, :species, :species_max_id
|
68
|
+
attr_accessor :index_sequences
|
69
|
+
|
70
|
+
FORMAT_VERSION_KEY = 'bio-maf:index-format-version'
|
71
|
+
FORMAT_VERSION = 2
|
72
|
+
MAX_SPECIES = 64
|
73
|
+
|
74
|
+
## Key-value store index format
|
75
|
+
##
|
76
|
+
## This format is designed for Kyoto Cabinet but should work on
|
77
|
+
## other key-value databases allowing binary data.
|
78
|
+
##
|
79
|
+
## Index metadata is stored as ASCII text, but index data is
|
80
|
+
## stored as packed binary values.
|
81
|
+
##
|
82
|
+
## Index metadata:
|
83
|
+
##
|
84
|
+
## Sequence IDs:
|
85
|
+
## sequence:<name> => <id>
|
86
|
+
##
|
87
|
+
## Each indexed sequence has a corresponding entry of this
|
88
|
+
## kind. The <name> parameter is the sequence or chromosome
|
89
|
+
## name as found in the MAF file, e.g. mm8.chr7. The <id>
|
90
|
+
## parameter is assigned when the sequence is indexed, and
|
91
|
+
## can be from 0 to 255.
|
92
|
+
##
|
93
|
+
## Species IDs:
|
94
|
+
## species:<name> => <id>
|
95
|
+
##
|
96
|
+
## Each indexed species has a corresponding entry of this
|
97
|
+
## kind. The <name> parameter is the species part of the
|
98
|
+
## sequence name as found in the MAF file, e.g. 'mm8' for
|
99
|
+
## 'mm8.chr7'. The <id> parameter is assigned when the
|
100
|
+
## species is indexed, and can be from 0 to 63 (MAX_SPECIES is 64).
|
101
|
+
##
|
102
|
+
## Index data:
|
103
|
+
##
|
104
|
+
## For each sequence upon which an index is built, one index
|
105
|
+
## entry is generated per MAF alignment block. The key
|
106
|
+
## identifies the sequence, the UCSC index bin, and the
|
107
|
+
## zero-based start and end positions of the sequence. The
|
108
|
+
## value gives the offset and size of the alignment block
|
109
|
+
## within the MAF file.
|
110
|
+
##
|
111
|
+
## All values are stored as big-endian, unsigned packed binary
|
112
|
+
## data.
|
113
|
+
##
|
114
|
+
## Keys: (12 bytes) [CCS>L>L>]
|
115
|
+
##
|
116
|
+
## 0xFF (1 byte):
|
117
|
+
## index entry prefix
|
118
|
+
## Sequence chromosome ID (1 byte):
|
119
|
+
## corresponds to sequence:<name> entries
|
120
|
+
## UCSC bin (16 bits)
|
121
|
+
## Sequence start, zero-based, inclusive (32 bits)
|
122
|
+
## Sequence end, zero-based, exclusive (32 bits)
|
123
|
+
##
|
124
|
+
## Values (25 bytes) [Q>L>L>CQ>]
|
125
|
+
##
|
126
|
+
## MAF file offset (64 bits)
|
127
|
+
## MAF alignment block length (32 bits)
|
128
|
+
## Block text size (32 bits)
|
129
|
+
## Number of sequences in block (8 bits)
|
130
|
+
## Species bit vector (64 bits)
|
131
|
+
##
|
132
|
+
## Example:
|
133
|
+
##
|
134
|
+
## For a block with sequence 0, bin 1195, start 80082334, end
|
135
|
+
## 80082368, MAF offset 16, and MAF block length 1087:
|
136
|
+
##
|
137
|
+
## | |id| bin | seq_start | seq_end |
|
138
|
+
## key: FF 00 04 AB 04 C5 F5 9E 04 C5 F5 C0
|
139
|
+
##
|
140
|
+
## | offset | length | ts |ns| species_vec |
|
141
|
+
## val: 00 00 00 00 00 00 00 10 00 00 04 3F [TODO]
|
142
|
+
|
143
|
+
#### Public API
|
144
|
+
|
145
|
+
# Open an existing index for reading.
|
146
|
+
# @param [String] path path to existing Kyoto Cabinet index
|
147
|
+
# @return [KyotoIndex]
|
148
|
+
def self.open(path)
  # Instantiate through the receiver, as .build does, so that calling
  # .open on a subclass yields an instance of that subclass.
  new(path)
end
|
151
|
+
|
152
|
+
# Build a new index from the MAF file being parsed by `parser`,
|
153
|
+
# and store it in `path`.
|
154
|
+
# @param [Parser] parser MAF parser for file to index
|
155
|
+
# @param [String] path path to index file to create
|
156
|
+
# @return [KyotoIndex]
|
157
|
+
def self.build(parser, path)
  # Create the index object, populate it from the parser, and hand the
  # populated object back to the caller.
  self.new(path).tap { |index| index.build_default(parser) }
end
|
162
|
+
|
163
|
+
# Find all alignment blocks in the genomic regions in the list
|
164
|
+
# of Bio::GenomicInterval objects, and parse them with the given
|
165
|
+
# parser.
|
166
|
+
#
|
167
|
+
# An optional Hash of filters may be passed in. The following
|
168
|
+
# keys are used:
|
169
|
+
#
|
170
|
+
# * `:with_all_species => ["sp1", "sp2", ...]`
|
171
|
+
#
|
172
|
+
# Only match alignment blocks containing all given species.
|
173
|
+
#
|
174
|
+
# * `:at_least_n_sequences => n`
|
175
|
+
#
|
176
|
+
# Only match alignment blocks with at least N sequences.
|
177
|
+
#
|
178
|
+
# * `:min_size => n`
|
179
|
+
#
|
180
|
+
# Only match alignment blocks with text size at least N.
|
181
|
+
#
|
182
|
+
# * `:max_size => n`
|
183
|
+
#
|
184
|
+
# Only match alignment blocks with text size at most N.
|
185
|
+
#
|
186
|
+
# @param [Enumerable<Bio::GenomicInterval>] intervals genomic
|
187
|
+
# intervals to parse.
|
188
|
+
# @param [Parser] parser MAF parser for file to fetch blocks
|
189
|
+
# from.
|
190
|
+
# @param [Hash] filter Block filter expression.
|
191
|
+
# @return [Array<Block>]
|
192
|
+
# @api public
|
193
|
+
def find(intervals, parser, filter={})
  started_at = Time.now
  fetch_entries = fetch_list(intervals, filter)
  # Timing goes to stderr only; the return value is whatever the
  # parser produces for the fetch list.
  $stderr.printf("Built fetch list of %d items in %.3fs.\n",
                 fetch_entries.size,
                 Time.now - started_at)
  parser.fetch_blocks(fetch_entries)
end
|
201
|
+
|
202
|
+
# Close the underlying Kyoto Cabinet database handle.
|
203
|
+
def close
  # Delegates to the database handle exposed by the `db` reader and
  # returns that call's result.
  handle = db
  handle.close
end
|
206
|
+
|
207
|
+
#### KyotoIndex Internals
|
208
|
+
# @api private
|
209
|
+
|
210
|
+
# Open (or create) the Kyoto Cabinet database backing this index.
#
# The database is opened read-only when an already-open handle is
# supplied via `db_arg`, or when `path` is longer than one character
# and names an existing file; in that case the stored sequence and
# species tables are loaded. Otherwise it is opened for writing and
# created if necessary.
#
# @param [#to_s] path path (or database name) of the index
# @param [KyotoCabinet::DB, nil] db_arg already-open handle to reuse
#   instead of opening `path`
# @raise [RuntimeError] if the database cannot be opened
def initialize(path, db_arg=nil)
  @species = {}
  @species_max_id = -1
  # Work on the string form throughout: for a Pathname, #size is the
  # size of the file (and raises when the file is missing), not the
  # length of the name, while #open below already used path.to_s.
  path_s = path.to_s
  read_only = db_arg || (path_s.size > 1 && File.exist?(path_s))
  mode = if read_only
           KyotoCabinet::DB::OREADER
         else
           KyotoCabinet::DB::OWRITER | KyotoCabinet::DB::OCREATE
         end
  @db = db_arg || KyotoCabinet::DB.new
  @path = path
  # A handle passed in by the caller is assumed to be open already.
  unless db_arg || db.open(path_s, mode)
    raise "Could not open DB file!"
  end
  if mode == KyotoCabinet::DB::OREADER
    load_index_sequences
    load_species
  end
end
|
228
|
+
|
229
|
+
# Reopen the same DB handle read-only. Only useful for unit tests.
|
230
|
+
def reopen
  # Passing the existing handle makes the new instance skip opening
  # the file and treat the database as read-only.
  path, handle = @path, @db
  KyotoIndex.new(path, handle)
end
|
233
|
+
|
234
|
+
# Write a human-readable listing of the index to `stream`: first the
# metadata records, then every index record with its unpacked key and
# value fields.
#
# @param [IO] stream destination responding to #puts and #printf
# @raise [RuntimeError] if the records end before any index record
#   (first key byte 0xFF) has been reached
def dump(stream=$stdout)
  stream.puts "KyotoIndex dump: #{@path}"
  stream.puts
  if db.count == 0
    stream.puts "Empty database!"
    return
  end
  db.cursor_process do |cur|
    stream.puts "== Metadata =="
    cur.jump('')
    loop do
      k, v = cur.get(false)
      raise "unexpected end of records!" unless k
      # Index keys start with the marker byte 0xFF. Compare the raw
      # byte: a string comparison against "\xff" depends on the
      # encodings of both the key and the literal and can never match
      # when they are incompatible.
      break if k.getbyte(0) == 0xFF
      stream.puts "#{k}: #{v}"
      raise "could not advance cursor!" unless cur.step
    end
    stream.puts "== Index records =="
    # get(true) returns the current record and advances the cursor.
    while (pair = cur.get(true))
      _, chr, bin, s_start, s_end = pair[0].unpack(KEY_FMT)
      offset, len, text_size, n_seq, species_vec = pair[1].unpack(VAL_FMT)
      stream.puts "#{chr} [bin #{bin}] #{s_start}:#{s_end}"
      stream.puts " offset #{offset}, length #{len}"
      stream.puts " text size: #{text_size}"
      stream.puts " sequences in block: #{n_seq}"
      stream.printf(" species vector: %016x\n", species_vec)
    end
  end
end
|
265
|
+
|
266
|
+
## Retrieval:
|
267
|
+
## 1. merge the intervals of interest
|
268
|
+
## 2. for each interval, compute the bins with #bin_all
|
269
|
+
## 3. for each bin to search, make a list of intervals of
|
270
|
+
## interest
|
271
|
+
## 4. compute the spanning interval for that bin
|
272
|
+
## 5. start at the beginning of the bin
|
273
|
+
## 6. if a record intersects the spanning interval:
|
274
|
+
## A. #find an interval it intersects
|
275
|
+
## B. if found, add to the fetch list
|
276
|
+
## 7. if a record starts past the end of the spanning interval,
|
277
|
+
## we are done scanning this bin.
|
278
|
+
##
|
279
|
+
## Optimizations:
|
280
|
+
## * once we reach the start of the spanning interval,
|
281
|
+
## all records start in it until we see a record starting
|
282
|
+
## past it.
|
283
|
+
## * as record starts pass the start of intervals of interest,
|
284
|
+
## pull those intervals off the list
|
285
|
+
|
286
|
+
# Build a fetch list of alignment blocks to read, given an array
|
287
|
+
# of Bio::GenomicInterval objects
|
288
|
+
def fetch_list(intervals, filter_spec={})
  start = Time.now
  filter_spec ||= {}
  filters = Filters.build(filter_spec, self)
  # Nothing to look up; without this guard the #chrom call below
  # would fail on nil.
  first_interval = intervals.first
  return [] if first_interval.nil?
  chrom = first_interval.chrom
  chrom_id = index_sequences[chrom]
  raise "chromosome #{chrom} not indexed!" unless chrom_id
  if intervals.any? { |i| i.chrom != chrom }
    raise "all intervals must be for the same chromosome!"
  end
  # for each bin, build a list of the intervals to look for there,
  # as half-open zero-based ranges
  bin_intervals = Hash.new { |h, k| h[k] = [] }
  intervals.each do |i|
    i.bin_all.each do |bin|
      bin_intervals[bin] << (i.zero_start...i.zero_end)
    end
  end
  # scan_bin expects each bin's ranges ordered by start position
  bin_intervals.each_value do |ranges|
    ranges.sort_by! { |r| r.begin }
  end
  ready = Time.now
  $stderr.puts "bin intervals computed after #{ready - start} seconds."
  if RUBY_PLATFORM == 'java'
    scan_bins_parallel(chrom_id, bin_intervals, filters)
  else
    scan_bins(chrom_id, bin_intervals, filters)
  end
end # #fetch_list
|
318
|
+
|
319
|
+
# Scan the index for blocks matching the given bins and intervals.
|
320
|
+
def scan_bins(chrom_id, bin_intervals, filters)
  found = []
  # One cursor is shared by all bins; each bin's matches are appended
  # in the order the bins are iterated.
  db.cursor_process do |cursor|
    bin_intervals.each_pair do |bin, ranges|
      found.concat(scan_bin(cursor, chrom_id, bin, ranges, filters))
    end
  end
  found
end
|
330
|
+
|
331
|
+
# Run the given block and return its value. On JRuby with the
# `profile` environment variable set, the block runs under
# JRuby::Profiler and a flat profile is printed to STDERR.
def with_profiling
  return yield unless RUBY_PLATFORM == 'java' && ENV['profile']

  result = nil
  profile_data = JRuby::Profiler.profile { result = yield }
  JRuby::Profiler::FlatProfilePrinter.new(profile_data).printProfile(STDERR)
  result
end
|
344
|
+
|
345
|
+
# JRuby variant of #scan_bins: the (bin, intervals) pairs are placed
# on a java.util.concurrent queue and scanned by worker threads; the
# per-bin results are gathered here into a single fetch list.
#
# @raise [RuntimeError] if a worker reports an exception, or if every
#   worker has exited before all jobs were completed
def scan_bins_parallel(chrom_id, bin_intervals, filters)
  started_at = Time.now
  # a single worker when profiling, four otherwise
  worker_count = ENV['profile'] ? 1 : 4
  pending = java.util.concurrent.ConcurrentLinkedQueue.new(bin_intervals.to_a)
  finished = java.util.concurrent.LinkedBlockingQueue.new(128)
  workers = Array.new(worker_count) do
    make_scan_worker(pending, finished) do |cur, req|
      bin, intervals = req
      scan_bin(cur, chrom_id, bin, intervals, filters)
    end
  end
  done = 0
  to_fetch = []
  while done < bin_intervals.size
    outcome = finished.poll(5, java.util.concurrent.TimeUnit::SECONDS)
    if outcome.nil?
      # Timed out: keep waiting only while some worker is still alive.
      next if workers.any? { |t| t.alive? }
      raise "No threads alive, completed #{done}/#{bin_intervals.size} jobs!"
    end
    # Workers put the exception itself on the queue when a job fails.
    raise "worker failed: #{outcome}" if outcome.is_a? Exception
    to_fetch.concat(outcome)
    done += 1
  end
  workers.each { |t| t.join }
  $stderr.printf("Matched %d index records with %d threads in %.3f seconds.\n",
                 to_fetch.size, worker_count, Time.now - started_at)
  to_fetch
end
|
377
|
+
|
378
|
+
# Start a thread that takes requests from `jobs` (via #poll) until it
# returns nothing, yields each one together with a database cursor to
# the caller's block, and puts the block's result on `completed`.
#
# If the block raises, the exception object is put on `completed`,
# logged to stderr, and re-raised inside the thread.
#
# @return [Thread] the started worker thread
def make_scan_worker(jobs, completed)
  Thread.new do
    with_profiling do
      db.cursor_process do |cursor|
        while (job = jobs.poll)
          begin
            completed.put(yield(cursor, job))
          rescue Exception => err
            # Exception (not StandardError) is rescued on purpose so
            # the consumer is told about any failure.
            completed.put(err)
            $stderr.puts "Worker failing: #{err.class}: #{err}"
            $stderr.puts err.backtrace.join("\n")
            raise err
          end
        end
      end
    end
  end
end
|
399
|
+
|
400
|
+
# Scan one bin of the index with cursor `cur` and collect the
# offset/length data of every record that overlaps one of
# `bin_intervals` and passes `filters`.
#
# `bin_intervals` must be sorted by start position and non-empty. It
# is modified in place: ranges ending before the current record's
# start are removed as the scan advances.
#
# @return [Array] one #extract_index_offset result per matching record
def scan_bin(cur, chrom_id, bin, bin_intervals, filters)
  # extent covered by all intervals of interest
  span_lo = bin_intervals.first.begin
  span_hi = bin_intervals.map { |iv| iv.end }.max
  hits = []
  # scan from the start of the bin
  cur.jump(bin_start_prefix(chrom_id, bin))
  while (rec = cur.get(true))
    rec_chr, rec_bin, rec_start, rec_end = rec[0].unpack(KEY_SCAN_FMT)
    # Stop once we reach the next bin or chromosome, or a record
    # starting at or past the end of the spanning interval.
    break unless rec_chr == chrom_id && rec_bin == bin && rec_start < span_hi
    # record ends before the spanning interval begins: no overlap
    next if rec_end < span_lo
    # intervals ending before this record starts are no longer relevant
    bin_intervals.shift while bin_intervals.first.end < rec_start
    bin_intervals.each do |iv|
      iv_start = iv.begin
      break if iv_start > rec_end
      overlapping = (rec_start <= iv_start && iv_start < rec_end) \
        || iv.include?(rec_start)
      next unless overlapping && filters.match(rec)
      # record each matching index entry once, then move on
      hits << extract_index_offset(rec)
      break
    end
  end
  hits
end
|
438
|
+
|
439
|
+
# True when `gi` begins inside the half-open span [i_start, i_end),
# or when `gi` includes `i_start`.
def overlaps?(gi, i_start, i_end)
  gi_start = gi.begin
  begins_within = i_start <= gi_start && gi_start < i_end
  begins_within || gi.include?(i_start)
end
|
445
|
+
|
446
|
+
# Build the index from `parser`, indexing on the source of the first
# sequence of the first block, which is assigned sequence ID 0.
#
# Stores the format version and sequence table, indexes the first
# block, then indexes the remaining blocks in batches of 1000 and
# finally calls db.synchronize(true), whose result is returned.
def build_default(parser)
  first_block = parser.parse_block
  reference = first_block.sequences.first.source
  db[FORMAT_VERSION_KEY] = FORMAT_VERSION
  @index_sequences = { reference => 0 }
  store_index_sequences!
  index_blocks([first_block])
  parser.enum_for(:each_block).each_slice(1000) do |batch|
    index_blocks(batch)
  end
  db.synchronize(true)
end
|
458
|
+
|
459
|
+
# Compute the index entries for every block in `blocks` and write
# them to the database with a single set_bulk call.
#
# Entries are merged into a fresh hash, so an empty batch yields an
# empty hash (rather than nil) and no per-block hash is mutated.
#
# @param [Enumerable] blocks alignment blocks to index
# @return the result of db.set_bulk
def index_blocks(blocks)
  entries = blocks.each_with_object({}) do |block, merged|
    merged.merge!(entries_for(block))
  end
  db.set_bulk(entries, false)
end
|
463
|
+
|
464
|
+
# Read every "sequence:<name>" record from the database and set
# @index_sequences to a Hash of name => integer ID.
def load_index_sequences
  @index_sequences = db.match_prefix("sequence:").each_with_object({}) do |key, table|
    # split once only: the name itself may contain ':'
    name = key.split(':', 2)[1]
    table[name] = db[key].to_i
  end
end
|
473
|
+
|
474
|
+
# Write one "sequence:<name>" => "<id>" record to the database for
# each entry of index_sequences.
def store_index_sequences!
  index_sequences.each_pair do |seq_name, seq_id|
    db.set("sequence:#{seq_name}", seq_id.to_s)
  end
end
|
479
|
+
|
480
|
+
# Read every "species:<name>" record from the database into @species
# (name => integer ID) and set @species_max_id to the highest ID
# found, or -1 when there are none.
def load_species
  db.match_prefix("species:").each do |key|
    species_name = key.split(':', 2)[1]
    @species[species_name] = db[key].to_i
  end
  @species_max_id = @species.values.max || -1
end
|
488
|
+
|
489
|
+
# Return the species ID for a sequence source name, assigning and
# storing a new ID the first time a species is seen.
#
# The species is the text before the first '.'; the remainder may
# contain further dots (e.g. otoGar1.scaffold_104707.1-93001).
#
# @param [String] seq sequence source name
# @return [Integer, nil] the species ID, or nil when `seq` has no '.'
# @raise [RuntimeError] if a new ID would reach MAX_SPECIES
def species_id_for_seq(seq)
  species_name, remainder = seq.split('.', 2)
  # not in species.sequence format, apparently
  return nil if remainder.nil?
  return species[species_name] if species.has_key?(species_name)

  new_id = @species_max_id + 1
  if new_id >= MAX_SPECIES
    raise "cannot index MAF file with more than #{MAX_SPECIES} species"
  end
  species[species_name] = new_id
  db["species:#{species_name}"] = new_id
  @species_max_id = new_id
  new_id
end
|
512
|
+
|
513
|
+
# Pack the index value for an alignment block: offset, size, text
# size, number of sequences, and a bit vector with one bit set per
# species ID present in the block.
#
# Sequences for which #species_id_for_seq returns nil (source names
# without a species prefix) contribute no bit instead of aborting
# with a TypeError on `1 << nil`.
#
# @param block alignment block responding to #offset, #size,
#   #text_size and #sequences
# @return [String] value packed with VAL_FMT
def build_block_value(block)
  species_ids = block.sequences.map { |s| species_id_for_seq(s.source) }.compact
  vec = species_ids.reduce(0) { |bits, id| bits | (1 << id) }
  [block.offset,
   block.size,
   block.text_size,
   block.sequences.size,
   vec].pack(VAL_FMT)
end
|
522
|
+
|
523
|
+
# Build the index entries for one alignment block: a Hash with one
# packed key per sequence whose source is in index_sequences, every
# key mapping to the same packed block value.
#
# @return [Hash{String => String}] packed key => packed value
def entries_for(block)
  packed_value = build_block_value(block)
  block.sequences.each_with_object({}) do |seq, entries|
    seq_id = index_sequences[seq.source]
    # only sequences chosen for indexing get an entry
    next unless seq_id
    seq_start = seq.start
    seq_end = seq_start + seq.size
    bin = Bio::Ucsc::UcscBin.bin_from_range(seq_start, seq_end)
    # 255 is the marker byte that prefixes every index key
    packed_key = [255, seq_id, bin, seq_start, seq_end].pack(KEY_FMT)
    entries[packed_key] = packed_value
  end
end
|
536
|
+
end # class KyotoIndex
|
537
|
+
|
538
|
+
# Base class for index entry filters. Subclasses implement
# #match(entry); #call forwards to it.
class Filter
  include KVHelpers

  # @param entry index entry to test
  # @return the result of #match for the entry
  def call(entry)
    match(entry)
  end
end
|
545
|
+
|
546
|
+
# Matches index entries whose species bit vector has every requested
# species bit set.
class AllSpeciesFilter < Filter
  # NOTE(review): nothing in this class assigns @bs, so this reader
  # returns nil; the mask actually used is kept in @mask.
  attr_reader :bs

  # @param [Enumerable<String>] species species names that must all be
  #   present; each is looked up with idx.species.fetch, which raises
  #   for a name that is not in the index
  # @param idx index object providing the species => ID table
  def initialize(species, idx)
    bits = species.map { |name| 1 << idx.species.fetch(name) }
    @mask = bits.inject(0) { |acc, bit| acc | bit }
  end

  # @return [Boolean] true when all mask bits are set in the entry
  def match(entry)
    present = extract_species_vec(entry)
    (@mask & present) == @mask
  end
end
|
558
|
+
|
559
|
+
# Matches index entries whose sequence count is at least n.
class AtLeastNSequencesFilter < Filter
  attr_reader :n

  # @param n minimum number of sequences
  # @param idx index object (not used by this filter)
  def initialize(n, idx)
    @n = n
  end

  # @return [Boolean] true when the entry has at least n sequences
  def match(entry)
    sequence_count = extract_n_sequences(entry)
    sequence_count >= @n
  end
end
|
569
|
+
|
570
|
+
# Matches index entries whose text size is at most n.
class MaxSizeFilter < Filter
  # @param n maximum text size
  # @param idx index object (not used by this filter)
  def initialize(n, idx)
    @n = n
  end

  # @return [Boolean] true when the entry's text size does not exceed n
  def match(entry)
    text_size = extract_text_size(entry)
    text_size <= @n
  end
end
|
578
|
+
|
579
|
+
# Matches index entries whose text size is at least n.
class MinSizeFilter < Filter
  # @param n minimum text size
  # @param idx index object (not used by this filter)
  def initialize(n, idx)
    @n = n
  end

  # @return [Boolean] true when the entry's text size is n or more
  def match(entry)
    text_size = extract_text_size(entry)
    text_size >= @n
  end
end
|
587
|
+
|
588
|
+
# A conjunction of filters: an entry matches only when every
# contained filter accepts it.
class Filters
  include KVHelpers

  # Filter specification key => implementing class.
  FILTER_CLASSES = {
    :with_all_species => MAF::AllSpeciesFilter,
    :at_least_n_sequences => MAF::AtLeastNSequencesFilter,
    :min_size => MAF::MinSizeFilter,
    :max_size => MAF::MaxSizeFilter
  }

  # Build a Filters object from a specification Hash.
  #
  # @param [Hash] spec filter key => filter argument
  # @param idx index object handed to each filter's constructor
  # @return [Filters]
  # @raise [RuntimeError] for a key not present in FILTER_CLASSES
  def self.build(spec, idx)
    members = spec.map do |key, val|
      filter_class = FILTER_CLASSES.fetch(key) do
        raise "Unsupported filter key #{key}!"
      end
      filter_class.new(val, idx)
    end
    Filters.new(members)
  end

  # @param [Array] l filter objects responding to #call
  def initialize(l)
    @l = l
  end

  # @return [Boolean] true when no filter rejects the entry (always
  #   true for an empty filter list)
  def match(entry)
    @l.all? { |f| f.call(entry) }
  end
end
|
617
|
+
|
618
|
+
end # module MAF
|
619
|
+
|
620
|
+
end
|