bio-maf 0.1.0

Files changed (76)
  1. data/.document +5 -0
  2. data/.simplecov +1 -0
  3. data/.travis.yml +16 -0
  4. data/.yardopts +3 -0
  5. data/DEVELOPMENT.md +40 -0
  6. data/Gemfile +23 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +209 -0
  9. data/Rakefile +76 -0
  10. data/VERSION +1 -0
  11. data/benchmarks/dispatch_bench +53 -0
  12. data/benchmarks/iter_bench +44 -0
  13. data/benchmarks/read_bench +40 -0
  14. data/benchmarks/sort_bench +33 -0
  15. data/benchmarks/split_bench +33 -0
  16. data/bin/maf_count +82 -0
  17. data/bin/maf_dump_blocks +27 -0
  18. data/bin/maf_extract_ranges_count +44 -0
  19. data/bin/maf_index +88 -0
  20. data/bin/maf_parse_bench +94 -0
  21. data/bin/maf_to_fasta +68 -0
  22. data/bin/maf_write +84 -0
  23. data/bin/random_ranges +35 -0
  24. data/features/maf-indexing.feature +31 -0
  25. data/features/maf-output.feature +29 -0
  26. data/features/maf-parsing.feature +44 -0
  27. data/features/maf-querying.feature +75 -0
  28. data/features/maf-to-fasta.feature +50 -0
  29. data/features/step_definitions/convert_steps.rb +45 -0
  30. data/features/step_definitions/index_steps.rb +20 -0
  31. data/features/step_definitions/output_steps.rb +27 -0
  32. data/features/step_definitions/parse_steps.rb +63 -0
  33. data/features/step_definitions/query_steps.rb +31 -0
  34. data/features/step_definitions/ucsc_bin_steps.rb +14 -0
  35. data/features/support/env.rb +16 -0
  36. data/features/ucsc-bins.feature +24 -0
  37. data/lib/bio/maf/index.rb +620 -0
  38. data/lib/bio/maf/parser.rb +888 -0
  39. data/lib/bio/maf/struct.rb +63 -0
  40. data/lib/bio/maf/writer.rb +63 -0
  41. data/lib/bio/maf.rb +4 -0
  42. data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
  43. data/lib/bio/ucsc/ucsc_bin.rb +117 -0
  44. data/lib/bio/ucsc.rb +2 -0
  45. data/lib/bio-maf/maf.rb +3 -0
  46. data/lib/bio-maf.rb +12 -0
  47. data/man/.gitignore +1 -0
  48. data/man/maf_index.1 +105 -0
  49. data/man/maf_index.1.markdown +97 -0
  50. data/man/maf_index.1.ronn +83 -0
  51. data/man/maf_to_fasta.1 +53 -0
  52. data/man/maf_to_fasta.1.ronn +51 -0
  53. data/spec/bio/maf/index_spec.rb +363 -0
  54. data/spec/bio/maf/parser_spec.rb +354 -0
  55. data/spec/bio/maf/struct_spec.rb +75 -0
  56. data/spec/spec_helper.rb +14 -0
  57. data/test/data/big-block.maf +15999 -0
  58. data/test/data/chr22_ieq.maf +11 -0
  59. data/test/data/chrY-1block.maf +6 -0
  60. data/test/data/empty +0 -0
  61. data/test/data/empty.db +0 -0
  62. data/test/data/mm8_chr7_tiny.kct +0 -0
  63. data/test/data/mm8_chr7_tiny.maf +76 -0
  64. data/test/data/mm8_mod_a.maf +7 -0
  65. data/test/data/mm8_single.maf +13 -0
  66. data/test/data/mm8_subset_a.maf +23 -0
  67. data/test/data/t1-bad1.maf +15 -0
  68. data/test/data/t1.fasta +12 -0
  69. data/test/data/t1.maf +15 -0
  70. data/test/data/t1a.maf +17 -0
  71. data/test/helper.rb +18 -0
  72. data/test/test_bio-maf.rb +7 -0
  73. data/travis-ci/install_kc +13 -0
  74. data/travis-ci/install_kc_java +13 -0
  75. data/travis-ci/report_errors +4 -0
  76. metadata +181 -0
data/lib/bio/maf/index.rb
@@ -0,0 +1,620 @@
+ require 'kyotocabinet'
+ require 'jruby/profiler' if RUBY_PLATFORM == 'java'
+
+ #require 'bio-ucsc-api'
+ require 'bio-genomic-interval'
+
+ module Bio
+
+   module MAF
+
+     # Binary record packing and unpacking.
+     # @api private
+     module KVHelpers
+
+       KEY = Struct.new([[:marker, :uint8],
+                         [:seq_id, :uint8],
+                         [:bin, :uint16],
+                         [:seq_start, :uint32],
+                         [:seq_end, :uint32]])
+
+       VAL = Struct.new([[:offset, :uint64],
+                         [:length, :uint32],
+                         [:text_size, :uint32],
+                         [:n_seq, :uint8],
+                         [:species_vec, :uint64]])
+
+       KEY_FMT = KEY.fmt
+       KEY_SCAN_FMT = KEY.extractor_fmt(:seq_id, :bin, :seq_start, :seq_end)
+       CHROM_BIN_PREFIX_FMT = KEY.extractor_fmt(:marker, :seq_id, :bin)
+
+       VAL_FMT = VAL.fmt
+       VAL_IDX_OFFSET_FMT = VAL.extractor_fmt(:offset, :length)
+       VAL_TEXT_SIZE_FMT = VAL.extractor_fmt(:text_size)
+       VAL_N_SEQ_FMT = VAL.extractor_fmt(:n_seq)
+       VAL_SPECIES_FMT = VAL.extractor_fmt(:species_vec)
+
+       module_function
+
+       def extract_species_vec(entry)
+         entry[1].unpack(VAL_SPECIES_FMT)[0]
+       end
+
+       def extract_n_sequences(entry)
+         entry[1].unpack(VAL_N_SEQ_FMT)[0]
+       end
+
+       def extract_index_offset(entry)
+         entry[1].unpack(VAL_IDX_OFFSET_FMT)
+       end
+
+       def extract_text_size(entry)
+         entry[1].unpack(VAL_TEXT_SIZE_FMT)[0]
+       end
+
+       def unpack_key(ks)
+         ks.unpack(KEY_FMT)
+       end
+
+       def bin_start_prefix(chrom_id, bin)
+         [0xFF, chrom_id, bin].pack(CHROM_BIN_PREFIX_FMT)
+       end
+     end
+
+     class KyotoIndex
+       include KVHelpers
+
+       attr_reader :db, :species, :species_max_id
+       attr_accessor :index_sequences
+
+       FORMAT_VERSION_KEY = 'bio-maf:index-format-version'
+       FORMAT_VERSION = 2
+       MAX_SPECIES = 64
+
+       ## Key-value store index format
+       ##
+       ## This format is designed for Kyoto Cabinet but should work on
+       ## other key-value databases allowing binary data.
+       ##
+       ## Index metadata is stored as ASCII text, but index data is
+       ## stored as packed binary values.
+       ##
+       ## Index metadata:
+       ##
+       ##   Sequence IDs:
+       ##     sequence:<name> => <id>
+       ##
+       ##     Each indexed sequence has a corresponding entry of this
+       ##     kind. The <name> parameter is the sequence or chromosome
+       ##     name as found in the MAF file, e.g. mm8.chr7. The <id>
+       ##     parameter is assigned when the sequence is indexed, and
+       ##     can be from 0 to 255.
+       ##
+       ##   Species IDs:
+       ##     species:<name> => <id>
+       ##
+       ##     Each indexed species has a corresponding entry of this
+       ##     kind. The <name> parameter is the species part of the
+       ##     sequence name as found in the MAF file, e.g. 'mm8' for
+       ##     'mm8.chr7'. The <id> parameter is assigned when the
+       ##     species is indexed, and can be from 0 to 255.
+       ##
+       ## Index data:
+       ##
+       ##   For each sequence upon which an index is built, one index
+       ##   entry is generated per MAF alignment block. The key
+       ##   identifies the sequence, the UCSC index bin, and the
+       ##   zero-based start and end positions of the sequence. The
+       ##   value gives the offset and size of the alignment block
+       ##   within the MAF file.
+       ##
+       ##   All values are stored as big-endian, unsigned packed binary
+       ##   data.
+       ##
+       ##   Keys: (12 bytes) [CCS>L>L>]
+       ##
+       ##     0xFF (1 byte):
+       ##       index entry prefix
+       ##     Sequence chromosome ID (1 byte):
+       ##       corresponds to sequence:<name> entries
+       ##     UCSC bin (16 bits)
+       ##     Sequence start, zero-based, inclusive (32 bits)
+       ##     Sequence end, zero-based, exclusive (32 bits)
+       ##
+       ##   Values (25 bytes) [Q>L>L>CQ>]
+       ##
+       ##     MAF file offset (64 bits)
+       ##     MAF alignment block length (32 bits)
+       ##     Block text size (32 bits)
+       ##     Number of sequences in block (8 bits)
+       ##     Species bit vector (64 bits)
+       ##
+       ##   Example:
+       ##
+       ##   For a block with sequence 0, bin 1195, start 80082334, end
+       ##   80082368, MAF offset 16, and MAF block length 1087:
+       ##
+       ##        | |id| bin | seq_start | seq_end  |
+       ##   key: FF 00 04 AB 04 C5 F5 9E 04 C5 F5 C0
+       ##
+       ##        |        offset         |  length   | ts |ns| species_vec |
+       ##   val: 00 00 00 00 00 00 00 10 00 00 04 3F [TODO]
+
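The packed layouts documented above can be sanity-checked directly with Ruby's pack/unpack. The following is an editor's sketch (not part of the gem) that reproduces the 12-byte example key using the documented key format string:

    # Pack (marker, seq_id, bin, seq_start, seq_end) with the documented
    # key format "CCS>L>L>" (big-endian, unsigned).
    key = [0xFF, 0, 1195, 80_082_334, 80_082_368].pack("CCS>L>L>")
    key.bytesize            # => 12
    key.unpack("H*").first  # => "ff0004ab04c5f59e04c5f5c0"
    key.unpack("CCS>L>L>")  # => [255, 0, 1195, 80082334, 80082368]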
+       #### Public API
+
+       # Open an existing index for reading.
+       # @param [String] path path to existing Kyoto Cabinet index
+       # @return [KyotoIndex]
+       def self.open(path)
+         return KyotoIndex.new(path)
+       end
+
+       # Build a new index from the MAF file being parsed by `parser`,
+       # and store it in `path`.
+       # @param [Parser] parser MAF parser for file to index
+       # @param [String] path path to index file to create
+       # @return [KyotoIndex]
+       def self.build(parser, path)
+         idx = self.new(path)
+         idx.build_default(parser)
+         return idx
+       end
+
+       # Find all alignment blocks in the genomic regions in the list
+       # of Bio::GenomicInterval objects, and parse them with the given
+       # parser.
+       #
+       # An optional Hash of filters may be passed in. The following
+       # keys are used:
+       #
+       # * `:with_all_species => ["sp1", "sp2", ...]`
+       #
+       #   Only match alignment blocks containing all given species.
+       #
+       # * `:at_least_n_sequences => n`
+       #
+       #   Only match alignment blocks with at least N sequences.
+       #
+       # * `:min_size => n`
+       #
+       #   Only match alignment blocks with text size at least N.
+       #
+       # * `:max_size => n`
+       #
+       #   Only match alignment blocks with text size at most N.
+       #
+       # @param [Enumerable<Bio::GenomicInterval>] intervals genomic
+       #   intervals to parse.
+       # @param [Parser] parser MAF parser for file to fetch blocks
+       #   from.
+       # @param [Hash] filter Block filter expression.
+       # @return [Array<Block>]
+       # @api public
+       def find(intervals, parser, filter={})
+         start = Time.now
+         fl = fetch_list(intervals, filter)
+         $stderr.printf("Built fetch list of %d items in %.3fs.\n",
+                        fl.size,
+                        Time.now - start)
+         parser.fetch_blocks(fl)
+       end
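As a usage illustration, an index is typically built once and then queried with a list of intervals and a filter hash. This is an editor's sketch, not gem documentation: the file names follow the gem's test data, and the Parser and Bio::GenomicInterval constructors shown are assumptions about those APIs.

    require 'bio-maf'

    # Build an index over a MAF file, then query it.
    # Assumes Bio::MAF::Parser.new(path); see data/lib/bio/maf/parser.rb.
    parser = Bio::MAF::Parser.new('mm8_chr7_tiny.maf')
    index  = Bio::MAF::KyotoIndex.build(parser, 'mm8_chr7_tiny.kct')

    # Assumes a zero-based constructor in bio-genomic-interval.
    intervals = [Bio::GenomicInterval.zero_based('mm8.chr7', 80_082_334, 80_082_368)]

    blocks = index.find(intervals, parser,
                        :with_all_species => ['mm8', 'rn4'],
                        :min_size => 100)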
+
+       # Close the underlying Kyoto Cabinet database handle.
+       def close
+         db.close
+       end
+
+       #### KyotoIndex Internals
+       # @api private
+
+       def initialize(path, db_arg=nil)
+         @species = {}
+         @species_max_id = -1
+         if db_arg || ((path.size > 1) and File.exist?(path))
+           mode = KyotoCabinet::DB::OREADER
+         else
+           mode = KyotoCabinet::DB::OWRITER | KyotoCabinet::DB::OCREATE
+         end
+         @db = db_arg || KyotoCabinet::DB.new
+         @path = path
+         unless db_arg || db.open(path.to_s, mode)
+           raise "Could not open DB file!"
+         end
+         if mode == KyotoCabinet::DB::OREADER
+           load_index_sequences
+           load_species
+         end
+       end
+
+       # Reopen the same DB handle read-only. Only useful for unit tests.
+       def reopen
+         KyotoIndex.new(@path, @db)
+       end
+
+       def dump(stream=$stdout)
+         stream.puts "KyotoIndex dump: #{@path}"
+         stream.puts
+         if db.count == 0
+           stream.puts "Empty database!"
+           return
+         end
+         db.cursor_process do |cur|
+           stream.puts "== Metadata =="
+           cur.jump('')
+           while true
+             k, v = cur.get(false)
+             raise "unexpected end of records!" unless k
+             break if k[0] == "\xff"
+             stream.puts "#{k}: #{v}"
+             unless cur.step
+               raise "could not advance cursor!"
+             end
+           end
+           stream.puts "== Index records =="
+           while pair = cur.get(true)
+             _, chr, bin, s_start, s_end = pair[0].unpack(KEY_FMT)
+             offset, len, text_size, n_seq, species_vec = pair[1].unpack(VAL_FMT)
+             stream.puts "#{chr} [bin #{bin}] #{s_start}:#{s_end}"
+             stream.puts " offset #{offset}, length #{len}"
+             stream.puts " text size: #{text_size}"
+             stream.puts " sequences in block: #{n_seq}"
+             stream.printf(" species vector: %016x\n", species_vec)
+           end
+         end
+       end
+
+       ## Retrieval:
+       ##  1. merge the intervals of interest
+       ##  2. for each interval, compute the bins with #bin_all
+       ##  3. for each bin to search, make a list of intervals of
+       ##     interest
+       ##  4. compute the spanning interval for that bin
+       ##  5. start at the beginning of the bin
+       ##  6. if a record intersects the spanning interval:
+       ##     A. #find an interval it intersects
+       ##     B. if found, add to the fetch list
+       ##  7. if a record starts past the end of the spanning interval,
+       ##     we are done scanning this bin.
+       ##
+       ## Optimizations:
+       ##  * once we reach the start of the spanning interval,
+       ##    all records start in it until we see a record starting
+       ##    past it.
+       ##  * as record starts pass the start of intervals of interest,
+       ##    pull those intervals off the list
+
+       # Build a fetch list of alignment blocks to read, given an array
+       # of Bio::GenomicInterval objects
+       def fetch_list(intervals, filter_spec={})
+         start = Time.now
+         filter_spec ||= {}
+         filters = Filters.build(filter_spec, self)
+         chrom = intervals.first.chrom
+         chrom_id = index_sequences[chrom]
+         unless chrom_id
+           raise "chromosome #{chrom} not indexed!"
+         end
+         if intervals.find { |i| i.chrom != chrom }
+           raise "all intervals must be for the same chromosome!"
+         end
+         # for each bin, build a list of the intervals to look for there
+         bin_intervals = Hash.new { |h, k| h[k] = [] }
+         intervals.each do |i|
+           i.bin_all.each do |bin|
+             bin_intervals[bin] << (i.zero_start...i.zero_end)
+           end
+         end
+         bin_intervals.values.each do |intervals|
+           intervals.sort_by! {|i| i.begin}
+         end
+         ready = Time.now
+         $stderr.puts "bin intervals computed after #{ready - start} seconds."
+         if RUBY_PLATFORM == 'java'
+           scan_bins_parallel(chrom_id, bin_intervals, filters)
+         else
+           scan_bins(chrom_id, bin_intervals, filters)
+         end
+       end # #fetch_list
+
+       # Scan the index for blocks matching the given bins and intervals.
+       def scan_bins(chrom_id, bin_intervals, filters)
+         to_fetch = []
+         db.cursor_process do |cur|
+           bin_intervals.each do |bin, bin_intervals_raw|
+             matches = scan_bin(cur, chrom_id, bin, bin_intervals_raw, filters)
+             to_fetch.concat(matches)
+           end
+         end
+         to_fetch
+       end
+
+       def with_profiling
+         if RUBY_PLATFORM == 'java' && ENV['profile']
+           rv = nil
+           pdata = JRuby::Profiler.profile do
+             rv = yield
+           end
+           printer = JRuby::Profiler::FlatProfilePrinter.new(pdata)
+           printer.printProfile(STDERR)
+           return rv
+         else
+           yield
+         end
+       end
+
+       def scan_bins_parallel(chrom_id, bin_intervals, filters)
+         start = Time.now
+         n_threads = ENV['profile'] ? 1 : 4
+         jobs = java.util.concurrent.ConcurrentLinkedQueue.new(bin_intervals.to_a)
+         completed = java.util.concurrent.LinkedBlockingQueue.new(128)
+         threads = []
+         n_threads.times do
+           threads << make_scan_worker(jobs, completed) do |cur, req|
+             bin, intervals = req
+             scan_bin(cur, chrom_id, bin, intervals, filters)
+           end
+         end
+         n_completed = 0
+         to_fetch = []
+         while (n_completed < bin_intervals.size)
+           c = completed.poll(5, java.util.concurrent.TimeUnit::SECONDS)
+           if c.nil?
+             if threads.find { |t| t.alive? }
+               next
+             else
+               raise "No threads alive, completed #{n_completed}/#{bin_intervals.size} jobs!"
+             end
+           end
+           raise "worker failed: #{c}" if c.is_a? Exception
+           to_fetch.concat(c)
+           n_completed += 1
+         end
+         threads.each { |t| t.join }
+         $stderr.printf("Matched %d index records with %d threads in %.3f seconds.\n",
+                        to_fetch.size, n_threads, Time.now - start)
+         to_fetch
+       end
+
+       def make_scan_worker(jobs, completed)
+         Thread.new do
+           with_profiling do
+             db.cursor_process do |cur|
+               while true
+                 req = jobs.poll
+                 break unless req
+                 begin
+                   result = yield(cur, req)
+                   completed.put(result)
+                 rescue Exception => e
+                   completed.put(e)
+                   $stderr.puts "Worker failing: #{e.class}: #{e}"
+                   $stderr.puts e.backtrace.join("\n")
+                   raise e
+                 end
+               end
+             end
+           end
+         end
+       end
+
+       def scan_bin(cur, chrom_id, bin, bin_intervals, filters)
+         # bin_intervals is sorted by zero_start
+         # compute the start and end of all intervals of interest
+         spanning_start = bin_intervals.first.begin
+         spanning_end = bin_intervals.map {|i| i.end}.max
+         # scan from the start of the bin
+         cur.jump(bin_start_prefix(chrom_id, bin))
+         matches = []
+         while pair = cur.get(true)
+           c_chr, c_bin, c_start, c_end = pair[0].unpack(KEY_SCAN_FMT)
+           if (c_chr != chrom_id) \
+             || (c_bin != bin) \
+             || c_start >= spanning_end
+             # we've hit the next bin, or chromosome, or gone past
+             # the spanning interval, so we're done with this bin
+             break
+           end
+           if c_end >= spanning_start # possible overlap
+             # any intervals that end before the start of the current
+             # block are no longer relevant
+             while bin_intervals.first.end < c_start
+               bin_intervals.shift
+             end
+             bin_intervals.each do |i|
+               i_start = i.begin
+               break if i_start > c_end
+               if ((c_start <= i_start && i_start < c_end) \
+                   || i.include?(c_start)) \
+                  && filters.match(pair)
+                 # match
+                 matches << extract_index_offset(pair)
+                 break
+               end
+             end
+           end
+         end
+         matches
+       end
+
+       def overlaps?(gi, i_start, i_end)
+         g_start = gi.begin
+
+         (i_start <= g_start && g_start < i_end) \
+           || gi.include?(i_start)
+       end
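Because index coordinates are zero-based with an exclusive end, a block that merely touches an interval's end does not overlap it. A quick illustration of the predicate above (editor's example, not gem code):

    gi = 100...200            # interval of interest, zero-based, half-open
    overlaps?(gi, 150, 250)   # => true  (the block starts inside gi)
    overlaps?(gi, 50, 120)    # => true  (gi starts inside the block)
    overlaps?(gi, 200, 300)   # => false (the block starts where gi ends)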
+
+       def build_default(parser)
+         first_block = parser.parse_block
+         ref_seq = first_block.sequences.first.source
+         db[FORMAT_VERSION_KEY] = FORMAT_VERSION
+         @index_sequences = { ref_seq => 0 }
+         store_index_sequences!
+         index_blocks([first_block])
+         parser.enum_for(:each_block).each_slice(1000).each do |blocks|
+           index_blocks(blocks)
+         end
+         db.synchronize(true)
+       end
+
+       def index_blocks(blocks)
+         h = blocks.map { |b| entries_for(b) }.reduce(:merge!)
+         db.set_bulk(h, false)
+       end
+
+       def load_index_sequences
+         h = {}
+         db.match_prefix("sequence:").each do |key|
+           _, name = key.split(':', 2)
+           id = db[key].to_i
+           h[name] = id
+         end
+         @index_sequences = h
+       end
+
+       def store_index_sequences!
+         index_sequences.each do |name, id|
+           db.set("sequence:#{name}", id.to_s)
+         end
+       end
+
+       def load_species
+         db.match_prefix("species:").each do |key|
+           _, name = key.split(':', 2)
+           id = db[key].to_i
+           @species[name] = id
+         end
+         @species_max_id = @species.values.sort.last || -1
+       end
+
+       def species_id_for_seq(seq)
+         # NB can have multiple dots
+         # example: otoGar1.scaffold_104707.1-93001
+         parts = seq.split('.', 2)
+         if parts.size == 2
+           species_name = parts[0]
+           if species.has_key? species_name
+             return species[species_name]
+           else
+             species_id = @species_max_id + 1
+             if species_id >= MAX_SPECIES
+               raise "cannot index MAF file with more than #{MAX_SPECIES} species"
+             end
+             species[species_name] = species_id
+             db["species:#{species_name}"] = species_id
+             @species_max_id = species_id
+             return species_id
+           end
+         else
+           # not in species.sequence format, apparently
+           return nil
+         end
+       end
+
+       def build_block_value(block)
+         bits = block.sequences.collect {|s| 1 << species_id_for_seq(s.source) }
+         vec = bits.reduce(0, :|)
+         return [block.offset,
+                 block.size,
+                 block.text_size,
+                 block.sequences.size,
+                 vec].pack(VAL_FMT)
+       end
+
+       def entries_for(block)
+         h = {}
+         val = build_block_value(block)
+         block.sequences.each do |seq|
+           seq_id = index_sequences[seq.source]
+           next unless seq_id
+           seq_end = seq.start + seq.size
+           bin = Bio::Ucsc::UcscBin.bin_from_range(seq.start, seq_end)
+           key = [255, seq_id, bin, seq.start, seq_end].pack(KEY_FMT)
+           h[key] = val
+         end
+         return h
+       end
+     end # class KyotoIndex
+
+     class Filter
+       include KVHelpers
+
+       def call(e)
+         match(e)
+       end
+     end
+
+     class AllSpeciesFilter < Filter
+       attr_reader :bs
+       def initialize(species, idx)
+         ids = species.collect {|s| 1 << idx.species.fetch(s) }
+         @mask = ids.reduce(0, :|)
+       end
+
+       def match(entry)
+         vec = extract_species_vec(entry)
+         (@mask & vec) == @mask
+       end
+     end
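The species filter relies on the 64-bit species vector stored in each index value: bit n is set when the species with ID n appears in the block, and a block matches only if every bit of the requested mask is set. A small worked example (editor's illustration; the species IDs are made up):

    mask = (1 << 0) | (1 << 3)    # request species with IDs 0 and 3
    vec  = 0b0101_1001            # block contains species 0, 3, 4 and 6
    (mask & vec) == mask          # => true, all requested species present
    vec  = 0b0101_0001            # block lacks species 3
    (mask & vec) == mask          # => false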
+
+     class AtLeastNSequencesFilter < Filter
+       attr_reader :n
+       def initialize(n, idx)
+         @n = n
+       end
+
+       def match(entry)
+         extract_n_sequences(entry) >= @n
+       end
+     end
+
+     class MaxSizeFilter < Filter
+       def initialize(n, idx)
+         @n = n
+       end
+       def match(entry)
+         extract_text_size(entry) <= @n
+       end
+     end
+
+     class MinSizeFilter < Filter
+       def initialize(n, idx)
+         @n = n
+       end
+       def match(entry)
+         extract_text_size(entry) >= @n
+       end
+     end
+
+     class Filters
+       include KVHelpers
+
+       FILTER_CLASSES = {
+         :with_all_species => MAF::AllSpeciesFilter,
+         :at_least_n_sequences => MAF::AtLeastNSequencesFilter,
+         :min_size => MAF::MinSizeFilter,
+         :max_size => MAF::MaxSizeFilter
+       }
+
+       def self.build(spec, idx)
+         l = spec.collect do |key, val|
+           if FILTER_CLASSES.has_key? key
+             FILTER_CLASSES[key].new(val, idx)
+           else
+             raise "Unsupported filter key #{key}!"
+           end
+         end
+         return Filters.new(l)
+       end
+
+       def initialize(l)
+         @l = l
+       end
+
+       def match(entry)
+         return ! @l.find { |f| ! f.call(entry) }
+       end
+     end
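Filters.build is what KyotoIndex#fetch_list uses to turn the filter hash accepted by #find into a composite filter; an entry passes only if every individual filter matches. A brief sketch (editor's example; `idx` and `entry` stand for an open KyotoIndex and a raw [key, value] pair read from it):

    filters = Bio::MAF::Filters.build({ :min_size => 100,
                                        :at_least_n_sequences => 3 },
                                      idx)
    filters.match(entry)   # => true only if both filters accept the entry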
+
+   end # module MAF
+
+ end