bio-maf 0.1.0-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/.simplecov +1 -0
  3. data/.travis.yml +16 -0
  4. data/.yardopts +3 -0
  5. data/DEVELOPMENT.md +40 -0
  6. data/Gemfile +23 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +209 -0
  9. data/Rakefile +76 -0
  10. data/VERSION +1 -0
  11. data/benchmarks/dispatch_bench +53 -0
  12. data/benchmarks/iter_bench +44 -0
  13. data/benchmarks/read_bench +40 -0
  14. data/benchmarks/sort_bench +33 -0
  15. data/benchmarks/split_bench +33 -0
  16. data/bin/maf_count +82 -0
  17. data/bin/maf_dump_blocks +27 -0
  18. data/bin/maf_extract_ranges_count +44 -0
  19. data/bin/maf_index +88 -0
  20. data/bin/maf_parse_bench +94 -0
  21. data/bin/maf_to_fasta +68 -0
  22. data/bin/maf_write +84 -0
  23. data/bin/random_ranges +35 -0
  24. data/features/maf-indexing.feature +31 -0
  25. data/features/maf-output.feature +29 -0
  26. data/features/maf-parsing.feature +44 -0
  27. data/features/maf-querying.feature +75 -0
  28. data/features/maf-to-fasta.feature +50 -0
  29. data/features/step_definitions/convert_steps.rb +45 -0
  30. data/features/step_definitions/index_steps.rb +20 -0
  31. data/features/step_definitions/output_steps.rb +27 -0
  32. data/features/step_definitions/parse_steps.rb +63 -0
  33. data/features/step_definitions/query_steps.rb +31 -0
  34. data/features/step_definitions/ucsc_bin_steps.rb +14 -0
  35. data/features/support/env.rb +16 -0
  36. data/features/ucsc-bins.feature +24 -0
  37. data/lib/bio-maf.rb +12 -0
  38. data/lib/bio-maf/maf.rb +3 -0
  39. data/lib/bio/maf.rb +4 -0
  40. data/lib/bio/maf/index.rb +620 -0
  41. data/lib/bio/maf/parser.rb +888 -0
  42. data/lib/bio/maf/struct.rb +63 -0
  43. data/lib/bio/maf/writer.rb +63 -0
  44. data/lib/bio/ucsc.rb +2 -0
  45. data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
  46. data/lib/bio/ucsc/ucsc_bin.rb +117 -0
  47. data/man/.gitignore +1 -0
  48. data/man/maf_index.1 +105 -0
  49. data/man/maf_index.1.markdown +97 -0
  50. data/man/maf_index.1.ronn +83 -0
  51. data/man/maf_to_fasta.1 +53 -0
  52. data/man/maf_to_fasta.1.ronn +51 -0
  53. data/spec/bio/maf/index_spec.rb +363 -0
  54. data/spec/bio/maf/parser_spec.rb +354 -0
  55. data/spec/bio/maf/struct_spec.rb +75 -0
  56. data/spec/spec_helper.rb +14 -0
  57. data/test/data/big-block.maf +15999 -0
  58. data/test/data/chr22_ieq.maf +11 -0
  59. data/test/data/chrY-1block.maf +6 -0
  60. data/test/data/empty +0 -0
  61. data/test/data/empty.db +0 -0
  62. data/test/data/mm8_chr7_tiny.kct +0 -0
  63. data/test/data/mm8_chr7_tiny.maf +76 -0
  64. data/test/data/mm8_mod_a.maf +7 -0
  65. data/test/data/mm8_single.maf +13 -0
  66. data/test/data/mm8_subset_a.maf +23 -0
  67. data/test/data/t1-bad1.maf +15 -0
  68. data/test/data/t1.fasta +12 -0
  69. data/test/data/t1.maf +15 -0
  70. data/test/data/t1a.maf +17 -0
  71. data/test/helper.rb +18 -0
  72. data/test/test_bio-maf.rb +7 -0
  73. data/travis-ci/install_kc +13 -0
  74. data/travis-ci/install_kc_java +13 -0
  75. data/travis-ci/report_errors +4 -0
  76. metadata +182 -0
@@ -0,0 +1,20 @@
1
# Cucumber steps for building, opening and querying Kyoto Cabinet indexes.

# Builds an index from the current parser; '%' is the database path passed
# to KyotoIndex.build.
When(/^build an index on the reference sequence$/) do
  @idx = Bio::MAF::KyotoIndex.build(@parser, '%')
end

# Opens an index file located under the shared test-data directory.
Given(/^a Kyoto Cabinet index file "(.*?)"$/) do |file_name|
  @idx = Bio::MAF::KyotoIndex.open($test_data + file_name)
end

Then(/^the index has at least (\d+) entries$/) do |minimum|
  @idx.db.count.should be >= minimum.to_i
end

# Looks up blocks for a single zero-based interval, applying whatever block
# filter an earlier step stored in @block_filter (nil if none).
When(/^search for blocks between positions (\d+) and (\d+) of (\S+)$/) do |from, to, chrom|
  interval = Bio::GenomicInterval.zero_based(chrom, from.to_i, to.to_i)
  @blocks = @idx.find([interval], @parser, @block_filter).to_a
end

Then(/^(\d+) blocks? (?:is|are) obtained$/) do |expected_count|
  @blocks.size.should == expected_count.to_i
end
@@ -0,0 +1,27 @@
1
# Cucumber steps for writing MAF output and comparing it with a reference.

# Creates a temporary .maf file and a writer that targets it.
When(/^open a new MAF writer$/) do
  @dst = Tempfile.new(["cuke", ".maf"])
  @writer = Bio::MAF::Writer.new(@dst)
end

When(/^write the header from the original MAF file$/) do
  @writer.write_header(@parser.header)
end

When(/^write all the parsed blocks$/) do
  @writer.write_blocks(@parser.parse_blocks)
end

# Matcher comparing two files by path with the external `diff` tool,
# ignoring changes in the amount of whitespace.
RSpec::Matchers.define :match_except_ws do |expected|
  # `system` returns true only when diff exits successfully (no differences).
  match do |actual|
    system("diff --ignore-space-change --brief #{expected} #{actual} >/dev/null 2>&1")
  end

  # On failure, include the unified diff in the message.
  failure_message_for_should do |actual|
    diff_text = `diff --unified --ignore-space-change #{expected} #{actual}`
    "File contents did not match. Diff:\n" + diff_text
  end
end

Then(/^the output should match, except whitespace, "(.+)"$/) do |reference|
  @dst.path.should match_except_ws($test_data + reference)
end
@@ -0,0 +1,63 @@
1
# Cucumber steps for parsing MAF files and inspecting headers and sequences.

# Turns the words of a step ("text size") into the symbol of the accessor
# to call (:text_size).
to_accessor = lambda { |words| words.tr(' ', '_').to_sym }

When(/^I open it with a MAF reader$/) do
  @parser = Bio::MAF::Parser.new(@src_f, @opts || {})
end

# Parser options accumulate in @opts until the reader is opened.
When(/^I enable the :(\S+) parser option$/) do |option_name|
  (@opts ||= {})[option_name.to_sym] = true
end

Then(/^the MAF version should be "(.*?)"$/) do |expected_version|
  @parser.header.version.to_s.should == expected_version
end

Then(/^the scoring scheme should be "(.*?)"$/) do |expected_scoring|
  @parser.header.scoring.should == expected_scoring
end

Then(/^the alignment parameters should be "(.*?)"$/) do |expected_params|
  @parser.header.alignment_params.should == expected_params
end

Then(/^an alignment block can be obtained$/) do
  @block = @parser.parse_block
  @block.should_not be_nil
end

Then(/^the alignment block has (\d+) sequences$/) do |expected_count|
  @block.sequences.size.should == expected_count.to_i
end

# The next three steps check an attribute of a sequence of the current
# block, addressed by position, against a string, an integer or a symbol.
Then(/^sequence (\d+) has (\w.*?) "(.*?)"$/) do |index, attribute, expected|
  @block.raw_seq(index.to_i).send(to_accessor.call(attribute)).should == expected
end

Then(/^sequence (\d+) has (\w.*?) (\d+)\s*$/) do |index, attribute, expected|
  @block.raw_seq(index.to_i).send(to_accessor.call(attribute)).should == expected.to_i
end

Then(/^sequence (\d+) has (\w.*?) :(\S+)\s*$/) do |index, attribute, expected|
  expected_sym = expected.to_sym
  @block.raw_seq(index.to_i).send(to_accessor.call(attribute)).should == expected_sym
end

# The last three steps do the same for a sequence looked up by source name
# within one of the blocks returned by an index search.
Then(/^sequence (\S+) of block (\d+) has (\w.*?) "(.*?)"$/) do |chr, index, attribute, expected|
  found = @blocks[index.to_i].sequences.find { |candidate| candidate.source == chr }
  found.send(to_accessor.call(attribute)).should == expected
end

Then(/^sequence (\S+) of block (\d+) has (\w.*?) (\d+)$/) do |chr, index, attribute, expected|
  found = @blocks[index.to_i].sequences.find { |candidate| candidate.source == chr }
  found.send(to_accessor.call(attribute)).should == expected.to_i
end

Then(/^sequence (\S+) of block (\d+) has (\w.*?) :(\S+)$/) do |chr, index, attribute, expected|
  found = @blocks[index.to_i].sequences.find { |candidate| candidate.source == chr }
  found.send(to_accessor.call(attribute)).should == expected.to_sym
end
@@ -0,0 +1,31 @@
1
# Cucumber steps that configure sequence and block filters for queries.

# Restricts parsed sequences to the species in the first table column.
When(/^filter for only the species$/) do |table|
  # table is a Cucumber::Ast::Table
  wanted = table.raw.map(&:first)
  @parser.sequence_filter = { :only_species => wanted }
end

# Keeps only blocks containing every species in the first table column.
When(/^filter for blocks with the species$/) do |table|
  # table is a Cucumber::Ast::Table
  required = table.raw.map(&:first)
  @block_filter = { :with_all_species => required }
end

When(/^filter for blocks with at least (\d+) sequences$/) do |count|
  @block_filter = { :at_least_n_sequences => count.to_i }
end

# "least" maps to a minimum text size, "most" to a maximum.
When(/^filter for blocks with text size at (least|most) (\d+)$/) do |op, len|
  keys_by_word = { 'least' => :min_size, 'most' => :max_size }
  constraint = keys_by_word.fetch(op) { raise "bad operator #{op}!" }
  @block_filter = { constraint => len.to_i }
end

When(/^filter for blocks with text size between (\d+) and (\d+)$/) do |lower, upper|
  @block_filter = { :min_size => lower.to_i, :max_size => upper.to_i }
end
@@ -0,0 +1,14 @@
1
+ #require 'bio-ucsc-api'
2
+
3
# Cucumber steps exercising the UCSC bin computation.

# Remembers the region bounds as integers for the following steps.
Given(/^I have a region with start (\d+) and end (\d+)$/) do |first, last|
  @r_start, @r_end = first.to_i, last.to_i
end

When(/^I compute the smallest containing bin$/) do
  @bin = Bio::Ucsc::UcscBin.bin_from_range(@r_start, @r_end)
end

Then(/^the bin should be (\d+)$/) do |expected|
  @bin.should == expected.to_i
end
@@ -0,0 +1,16 @@
1
# Cucumber support environment: optional coverage, load path, test data.

# simplecov is only attempted when not running on Travis and not on JRuby;
# a missing gem produces a warning instead of a failure.
coverage_wanted = !(ENV.has_key?('TRAVIS') || RUBY_PLATFORM == 'java')
if coverage_wanted
  begin
    require 'simplecov'
  rescue LoadError => e
    $stderr.puts "WARNING: could not require 'simplecov': #{e}"
  end
end

require 'pathname'
require 'tempfile'

# Make the gem's lib/ directory (three levels up from this file) loadable.
$LOAD_PATH.push(File.expand_path('../../../lib', __FILE__))

require 'bio-maf'

# Directory holding the MAF fixtures, relative to the working directory.
$test_data = Pathname.new('test/data')
@@ -0,0 +1,24 @@
1
+ Feature: Computation of UCSC bins
2
+ In order to efficiently use indexes
3
+ We will use the UCSC bin indexing system
4
+ Per http://genomewiki.ucsc.edu/index.php/Bin_indexing_system
5
+
6
+ Scenario Outline: Compute smallest containing bin
7
+ Given I have a region with start <Start> and end <End>
8
+ When I compute the smallest containing bin
9
+ Then the bin should be <Bin>
10
+
11
+ Examples:
12
+ | Start | End | Bin |
13
+ | 25079603 | 25079787 | 776 |
14
+ | 25128173 | 25128248 | 776 |
15
+ | 50312474 | 50312703 | 968 |
16
+ | 41905591 | 41906101 | 904 |
17
+ | 16670899 | 16673060 | 712 |
18
+ | 75495356 | 75495494 | 1160 |
19
+ | 92259501 | 92261053 | 1288 |
20
+ | 83834063 | 83838132 | 1224 |
21
+ | 7309597 | 7310411 | 640 |
22
+ | 6190410 | 6190999 | 632 |
23
+ # from https://github.com/polyatail/biopython/blob/af34c033d78c4c72dffbb500e513e568a2ba5e29/Tests/test_MafIO_index.py#L48
24
+
@@ -0,0 +1,12 @@
1
+ # Please require your code below, respecting the naming conventions in the
2
+ # bioruby directory tree.
3
+ #
4
+ # For example, say you have a plugin named bio-plugin, the only uncommented
5
+ # line in this file would be
6
+ #
7
+ # require 'bio/bio-plugin/plugin'
8
+ #
9
+ # In this file only require other files. Avoid other source code.
10
+
11
+ require 'bio/ucsc'
12
+ require 'bio/maf'
@@ -0,0 +1,3 @@
1
# Empty namespace module; nothing is defined inside it in this file.
module BioMaf; end
@@ -0,0 +1,4 @@
1
+ require 'bio/maf/struct'
2
+ require 'bio/maf/index'
3
+ require 'bio/maf/parser'
4
+ require 'bio/maf/writer'
@@ -0,0 +1,620 @@
1
+ require 'kyotocabinet'
2
+ require 'jruby/profiler' if RUBY_PLATFORM == 'java'
3
+
4
+ #require 'bio-ucsc-api'
5
+ require 'bio-genomic-interval'
6
+
7
+ module Bio
8
+
9
+ module MAF
10
+
11
# Packing and unpacking of the binary index records.
#
# Record layouts are declared with the project's Struct helper; the pack
# format strings used by the methods below are derived from them.
# @api private
module KVHelpers

  # Index key layout, as (field, type) pairs.
  KEY = Struct.new([[:marker,    :uint8],
                    [:seq_id,    :uint8],
                    [:bin,       :uint16],
                    [:seq_start, :uint32],
                    [:seq_end,   :uint32]])

  # Index value layout, as (field, type) pairs.
  VAL = Struct.new([[:offset,      :uint64],
                    [:length,      :uint32],
                    [:text_size,   :uint32],
                    [:n_seq,       :uint8],
                    [:species_vec, :uint64]])

  KEY_FMT = KEY.fmt
  KEY_SCAN_FMT = KEY.extractor_fmt(:seq_id, :bin, :seq_start, :seq_end)
  CHROM_BIN_PREFIX_FMT = KEY.extractor_fmt(:marker, :seq_id, :bin)

  VAL_FMT = VAL.fmt
  VAL_IDX_OFFSET_FMT = VAL.extractor_fmt(:offset, :length)
  VAL_TEXT_SIZE_FMT = VAL.extractor_fmt(:text_size)
  VAL_N_SEQ_FMT = VAL.extractor_fmt(:n_seq)
  VAL_SPECIES_FMT = VAL.extractor_fmt(:species_vec)

  module_function

  # In the extract_* methods, `entry` is a [key, value] pair of packed
  # strings; only the value (element 1) is read.

  # @return the species_vec field of the entry's value
  def extract_species_vec(entry)
    entry[1].unpack(VAL_SPECIES_FMT).first
  end

  # @return the n_seq field of the entry's value
  def extract_n_sequences(entry)
    entry[1].unpack(VAL_N_SEQ_FMT).first
  end

  # @return [Array] the offset and length fields of the entry's value
  def extract_index_offset(entry)
    entry[1].unpack(VAL_IDX_OFFSET_FMT)
  end

  # @return the text_size field of the entry's value
  def extract_text_size(entry)
    entry[1].unpack(VAL_TEXT_SIZE_FMT).first
  end

  # @param [String] ks packed key
  # @return [Array] all key fields, in KEY order
  def unpack_key(ks)
    ks.unpack(KEY_FMT)
  end

  # Packed key prefix (0xFF marker, sequence ID, bin) used to position a
  # cursor at the start of a bin.
  def bin_start_prefix(chrom_id, bin)
    prefix_fields = [0xFF, chrom_id, bin]
    prefix_fields.pack(CHROM_BIN_PREFIX_FMT)
  end
end
63
+
64
+ class KyotoIndex
65
+ include KVHelpers
66
+
67
+ attr_reader :db, :species, :species_max_id
68
+ attr_accessor :index_sequences
69
+
70
+ FORMAT_VERSION_KEY = 'bio-maf:index-format-version'
71
+ FORMAT_VERSION = 2
72
+ MAX_SPECIES = 64
73
+
74
+ ## Key-value store index format
75
+ ##
76
+ ## This format is designed for Kyoto Cabinet but should work on
77
+ ## other key-value databases allowing binary data.
78
+ ##
79
+ ## Index metadata is stored as ASCII text, but index data is
80
+ ## stored as packed binary values.
81
+ ##
82
+ ## Index metadata:
83
+ ##
84
+ ## Sequence IDs:
85
+ ## sequence:<name> => <id>
86
+ ##
87
+ ## Each indexed sequence has a corresponding entry of this
88
+ ## kind. The <name> parameter is the sequence or chromosome
89
+ ## name as found in the MAF file, e.g. mm8.chr7. The <id>
90
+ ## parameter is assigned when the sequence is indexed, and
91
+ ## can be from 0 to 255.
92
+ ##
93
+ ## Species IDs:
94
+ ## species:<name> => <id>
95
+ ##
96
+ ## Each indexed species has a corresponding entry of this
97
+ ## kind. The <name> parameter is the species part of the
98
+ ## sequence name as found in the MAF file, e.g. 'mm8' for
99
+ ## 'mm8.chr7'. The <id> parameter is assigned when the
100
+ ## species is indexed, and can be from 0 to 63 (MAX_SPECIES - 1).
101
+ ##
102
+ ## Index data:
103
+ ##
104
+ ## For each sequence upon which an index is built, one index
105
+ ## entry is generated per MAF alignment block. The key
106
+ ## identifies the sequence, the UCSC index bin, and the
107
+ ## zero-based start and end positions of the sequence. The
108
+ ## value gives the offset and size of the alignment block
109
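+ ## within the MAF file, followed by the block's text size, its number of sequences, and a bit vector of the species present.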
110
+ ##
111
+ ## All values are stored as big-endian, unsigned packed binary
112
+ ## data.
113
+ ##
114
+ ## Keys: (12 bytes) [CCS>L>L>]
115
+ ##
116
+ ## 0xFF (1 byte):
117
+ ## index entry prefix
118
+ ## Sequence chromosome ID (1 byte):
119
+ ## corresponds to sequence:<name> entries
120
+ ## UCSC bin (16 bits)
121
+ ## Sequence start, zero-based, inclusive (32 bits)
122
+ ## Sequence end, zero-based, exclusive (32 bits)
123
+ ##
124
+ ## Values (25 bytes) [Q>L>L>CQ>]
125
+ ##
126
+ ## MAF file offset (64 bits)
127
+ ## MAF alignment block length (32 bits)
128
+ ## Block text size (32 bits)
129
+ ## Number of sequences in block (8 bits)
130
+ ## Species bit vector (64 bits)
131
+ ##
132
+ ## Example:
133
+ ##
134
+ ## For a block with sequence 0, bin 1195, start 80082334, end
135
+ ## 80082368, MAF offset 16, and MAF block length 1087:
136
+ ##
137
+ ## | |id| bin | seq_start | seq_end |
138
+ ## key: FF 00 04 AB 04 C5 F5 9E 04 C5 F5 C0
139
+ ##
140
+ ## | offset | length | ts |ns| species_vec |
141
+ ## val: 00 00 00 00 00 00 00 10 00 00 04 3F [TODO]
142
+
143
+ #### Public API
144
+
145
# Open the index stored at `path`.
# @param [String] path path to existing Kyoto Cabinet index
# @return [KyotoIndex]
def self.open(path)
  KyotoIndex.new(path)
end
151
+
152
# Create an index at `path` and fill it from the MAF file being read by
# `parser`.
# @param [Parser] parser MAF parser for file to index
# @param [String] path path to index file to create
# @return [KyotoIndex] the newly built index
def self.build(parser, path)
  new(path).tap { |index| index.build_default(parser) }
end
162
+
163
# Look up the alignment blocks overlapping the given genomic intervals
# and have `parser` read them.
#
# A fetch list is built with #fetch_list, its size and build time are
# reported on $stderr, and the list is handed to `parser.fetch_blocks`.
#
# `filter` is an optional Hash; the recognised keys are:
#
#  * `:with_all_species => ["sp1", "sp2", ...]`
#    Only match alignment blocks containing all given species.
#  * `:at_least_n_sequences => n`
#    Only match alignment blocks with at least N sequences.
#  * `:min_size => n`
#    Only match alignment blocks with text size at least N.
#  * `:max_size => n`
#    Only match alignment blocks with text size at most N.
#
# @param [Enumerable<Bio::GenomicInterval>] intervals genomic
#  intervals to search.
# @param [Parser] parser MAF parser for file to fetch blocks
#  from.
# @param [Hash] filter Block filter expression.
# @return whatever `parser.fetch_blocks` returns for the fetch list
#  (presumably the matching blocks; the exact collection type is not
#  visible here)
# @api public
def find(intervals, parser, filter={})
  started_at = Time.now
  entries = fetch_list(intervals, filter)
  elapsed = Time.now - started_at
  $stderr.printf("Built fetch list of %d items in %.3fs.\n", entries.size, elapsed)
  parser.fetch_blocks(entries)
end
201
+
202
# Release the Kyoto Cabinet database handle held by this index.
# @return the result of closing the handle
def close
  handle = db
  handle.close
end
206
+
207
+ #### KyotoIndex Internals
208
+ # @api private
209
+
210
# Set up an index over the database at `path`, or over an already open
# handle.
#
# The database is opened read-only when a handle is supplied, or when
# `path` is longer than one character and names an existing file.
# Otherwise it is opened for writing and created if missing. In the
# read-only case the stored sequence and species tables are loaded.
#
# NOTE(review): `path.size` is a character count only for Strings; for
# other path objects it may mean something else -- confirm what callers
# pass.
#
# @param path location of the database
# @param db_arg an already open database handle to reuse (optional)
# @raise [RuntimeError] if the database cannot be opened
def initialize(path, db_arg=nil)
  @species = {}
  @species_max_id = -1
  read_only = db_arg || ((path.size > 1) && File.exist?(path))
  mode = if read_only
           KyotoCabinet::DB::OREADER
         else
           KyotoCabinet::DB::OWRITER | KyotoCabinet::DB::OCREATE
         end
  @db = db_arg || KyotoCabinet::DB.new
  @path = path
  # A supplied handle is assumed to be open already.
  raise "Could not open DB file!" if !db_arg && !db.open(path.to_s, mode)
  return unless mode == KyotoCabinet::DB::OREADER
  load_index_sequences
  load_species
end
228
+
229
# Wrap the current database handle in a fresh KyotoIndex, which treats a
# supplied handle as read-only. Only useful for unit tests.
# @return [KyotoIndex]
def reopen
  path, handle = @path, @db
  KyotoIndex.new(path, handle)
end
233
+
234
# Write a human-readable listing of the index to `stream`: the metadata
# records first, then every index record with its unpacked fields.
#
# @param stream destination responding to #puts and #printf
#  (defaults to $stdout)
# @raise [RuntimeError] if the records run out, or the cursor cannot
#  advance, before the first index record is reached
def dump(stream=$stdout)
  stream.puts "KyotoIndex dump: #{@path}"
  stream.puts
  if db.count == 0
    stream.puts "Empty database!"
    return
  end
  db.cursor_process do |cur|
    stream.puts "== Metadata =="
    cur.jump('')
    while true
      k, v = cur.get(false)
      raise "unexpected end of records!" unless k
      # Index keys begin with the 0xFF marker byte. Compare the raw byte
      # rather than one-character strings: a binary key and a "\xff"
      # literal in a UTF-8 source file never compare equal.
      break if k.getbyte(0) == 0xFF
      stream.puts "#{k}: #{v}"
      unless cur.step
        raise "could not advance cursor!"
      end
    end
    stream.puts "== Index records =="
    # get(true) returns the current record and advances the cursor.
    while pair = cur.get(true)
      _, chr, bin, s_start, s_end = pair[0].unpack(KEY_FMT)
      offset, len, text_size, n_seq, species_vec = pair[1].unpack(VAL_FMT)
      stream.puts "#{chr} [bin #{bin}] #{s_start}:#{s_end}"
      stream.puts " offset #{offset}, length #{len}"
      stream.puts " text size: #{text_size}"
      stream.puts " sequences in block: #{n_seq}"
      stream.printf(" species vector: %016x\n", species_vec)
    end
  end
end
265
+
266
+ ## Retrieval:
267
+ ## 1. merge the intervals of interest
268
+ ## 2. for each interval, compute the bins with #bin_all
269
+ ## 3. for each bin to search, make a list of intervals of
270
+ ## interest
271
+ ## 4. compute the spanning interval for that bin
272
+ ## 5. start at the beginning of the bin
273
+ ## 6. if a record intersects the spanning interval:
274
+ ## A. #find an interval it intersects
275
+ ## B. if found, add to the fetch list
276
+ ## 7. if a record starts past the end of the spanning interval,
277
+ ## we are done scanning this bin.
278
+ ##
279
+ ## Optimizations:
280
+ ## * once we reach the start of the spanning interval,
281
+ ## all records start in it until we see a record starting
282
+ ## past it.
283
+ ## * as record starts pass the start of intervals of interest,
284
+ ## pull those intervals off the list
285
+
286
# Build a fetch list of alignment blocks to read, given a list of
# Bio::GenomicInterval objects that all lie on the same chromosome.
#
# The intervals are grouped by each bin they report via #bin_all, sorted
# by start within each bin, and the bins are then scanned (in parallel on
# JRuby, sequentially otherwise). Timing information goes to $stderr.
#
# @param [Enumerable<Bio::GenomicInterval>] intervals intervals to search
# @param [Hash, nil] filter_spec block filter expression; nil means none
# @return [Array] the matches collected by the bin scan; empty when no
#  intervals are given
# @raise [RuntimeError] if the chromosome is not indexed, the intervals
#  span more than one chromosome, or a filter key is unsupported
def fetch_list(intervals, filter_spec={})
  start = Time.now
  filter_spec ||= {}
  # Built first so that an invalid filter is reported even for an empty
  # interval list.
  filters = Filters.build(filter_spec, self)
  first_interval = intervals.first
  # Nothing to search for: there is no chromosome to resolve.
  return [] unless first_interval
  chrom = first_interval.chrom
  chrom_id = index_sequences[chrom]
  raise "chromosome #{chrom} not indexed!" unless chrom_id
  if intervals.any? { |i| i.chrom != chrom }
    raise "all intervals must be for the same chromosome!"
  end
  # for each bin, build a list of the intervals to look for there
  bin_intervals = Hash.new { |h, k| h[k] = [] }
  intervals.each do |i|
    i.bin_all.each do |bin|
      bin_intervals[bin] << (i.zero_start...i.zero_end)
    end
  end
  # scan_bin expects each bin's ranges ordered by start position
  bin_intervals.each_value do |ranges|
    ranges.sort_by! { |r| r.begin }
  end
  ready = Time.now
  $stderr.puts "bin intervals computed after #{ready - start} seconds."
  if RUBY_PLATFORM == 'java'
    scan_bins_parallel(chrom_id, bin_intervals, filters)
  else
    scan_bins(chrom_id, bin_intervals, filters)
  end
end # #fetch_list
318
+
319
# Scan every requested bin with a single cursor and gather all matches.
#
# @param chrom_id ID of the indexed sequence being searched
# @param bin_intervals bin => ranges of interest within that bin
# @param filters filter set applied to candidate records
# @return [Array] concatenated results of #scan_bin, in iteration order
def scan_bins(chrom_id, bin_intervals, filters)
  collected = []
  db.cursor_process do |cursor|
    bin_intervals.each do |bin, ranges|
      collected.concat(scan_bin(cursor, chrom_id, bin, ranges, filters))
    end
  end
  collected
end
330
+
331
# Run the given block, under the JRuby profiler when running on JRuby
# with the `profile` environment variable set; the flat profile is then
# printed to STDERR. Otherwise the block is simply run.
#
# @return the block's result
def with_profiling
  return yield unless RUBY_PLATFORM == 'java' && ENV['profile']

  result = nil
  profile_data = JRuby::Profiler.profile { result = yield }
  JRuby::Profiler::FlatProfilePrinter.new(profile_data).printProfile(STDERR)
  result
end
344
+
345
# Scan the requested bins with several worker threads (JRuby only; uses
# java.util.concurrent queues).
#
# Each (bin, ranges) pair is one job. Workers publish either an array of
# matches or the exception that stopped them. This method drains the
# results until every job is accounted for, then joins the workers and
# reports the timing on $stderr.
#
# @param chrom_id ID of the indexed sequence being searched
# @param bin_intervals bin => ranges of interest within that bin
# @param filters filter set applied to candidate records
# @return [Array] all matches from all bins
# @raise [RuntimeError] if a worker reports an exception, or all workers
#  have exited while jobs are still outstanding
def scan_bins_parallel(chrom_id, bin_intervals, filters)
  began = Time.now
  # a single worker when profiling, four otherwise
  n_threads = ENV['profile'] ? 1 : 4
  jobs = java.util.concurrent.ConcurrentLinkedQueue.new(bin_intervals.to_a)
  completed = java.util.concurrent.LinkedBlockingQueue.new(128)
  workers = Array.new(n_threads) do
    make_scan_worker(jobs, completed) do |cur, req|
      bin, intervals = req
      scan_bin(cur, chrom_id, bin, intervals, filters)
    end
  end
  n_completed = 0
  to_fetch = []
  while n_completed < bin_intervals.size
    outcome = completed.poll(5, java.util.concurrent.TimeUnit::SECONDS)
    if outcome.nil?
      # Timed out: keep waiting only while some worker can still deliver.
      next if workers.any? { |t| t.alive? }
      raise "No threads alive, completed #{n_completed}/#{bin_intervals.size} jobs!"
    end
    raise "worker failed: #{outcome}" if outcome.is_a? Exception
    to_fetch.concat(outcome)
    n_completed += 1
  end
  workers.each { |t| t.join }
  $stderr.printf("Matched %d index records with %d threads in %.3f seconds.\n",
                 to_fetch.size, n_threads, Time.now - began)
  to_fetch
end
377
+
378
# Start a worker thread that takes requests from `jobs` until none are
# left, runs the given block on each with its own cursor, and puts each
# result on `completed`.
#
# If the block (or the put) raises, the exception object itself is put on
# `completed`, details are written to $stderr, and the exception is
# re-raised, which ends the worker.
#
# @param jobs queue responding to #poll (a falsy value means "no more work")
# @param completed queue responding to #put
# @yield [cursor, request] performs one job and returns its result
# @return [Thread] the started worker
def make_scan_worker(jobs, completed)
  Thread.new do
    with_profiling do
      db.cursor_process do |cur|
        while (req = jobs.poll)
          begin
            completed.put(yield(cur, req))
          rescue Exception => e
            completed.put(e)
            $stderr.puts "Worker failing: #{e.class}: #{e}"
            $stderr.puts e.backtrace.join("\n")
            raise e
          end
        end
      end
    end
  end
end
399
+
400
# Scan one bin of the index and collect the records overlapping any of
# the ranges of interest that also pass the filters.
#
# `bin_intervals` must be sorted by start. It is modified in place:
# ranges ending before the current record's start are dropped as the
# scan advances.
#
# @param cur database cursor (uses #jump and #get)
# @param chrom_id ID of the indexed sequence being searched
# @param bin bin number to scan
# @param [Array<Range>] bin_intervals ranges of interest in this bin
# @param filters object whose #match decides whether a record is kept
# @return [Array] one #extract_index_offset result per matching record
def scan_bin(cur, chrom_id, bin, bin_intervals, filters)
  # bounds of the region covered by all ranges of interest
  span_lo = bin_intervals.first.begin
  span_hi = bin_intervals.map(&:end).max
  # start reading at the first record of the bin
  cur.jump(bin_start_prefix(chrom_id, bin))
  hits = []
  while (record = cur.get(true))
    rec_chr, rec_bin, rec_start, rec_end = record[0].unpack(KEY_SCAN_FMT)
    # stop at the next bin or chromosome, or once records start past the
    # end of the covered region
    still_relevant = rec_chr == chrom_id && rec_bin == bin && rec_start < span_hi
    break unless still_relevant
    # records ending before the covered region cannot overlap anything
    next if rec_end < span_lo
    # ranges that end before this record starts are no longer relevant
    bin_intervals.shift while bin_intervals.first.end < rec_start
    bin_intervals.each do |wanted|
      wanted_start = wanted.begin
      break if wanted_start > rec_end
      overlapping = (rec_start <= wanted_start && wanted_start < rec_end) ||
                    wanted.include?(rec_start)
      next unless overlapping && filters.match(record)
      hits << extract_index_offset(record)
      break
    end
  end
  hits
end
438
+
439
# Tell whether the range `gi` overlaps the half-open interval
# [i_start, i_end): either the interval starts inside `gi`, or `gi`
# starts inside the interval.
#
# @param [Range] gi range of interest
# @param [Integer] i_start interval start, inclusive
# @param [Integer] i_end interval end, exclusive
# @return [Boolean]
def overlaps?(gi, i_start, i_end)
  return true if gi.include?(i_start)

  range_start = gi.begin
  i_start <= range_start && range_start < i_end
end
445
+
446
# Index the MAF file read by `parser`, using the first sequence of the
# first block as the only indexed (reference) sequence, with ID 0.
#
# Stores the format version and the sequence table, indexes the first
# block on its own and the remaining blocks in batches of 1000, then
# synchronizes the database.
#
# @param parser MAF parser positioned before the first block
# @return the result of the final database synchronization
def build_default(parser)
  leading_block = parser.parse_block
  reference_name = leading_block.sequences.first.source
  db[FORMAT_VERSION_KEY] = FORMAT_VERSION
  @index_sequences = { reference_name => 0 }
  store_index_sequences!
  index_blocks([leading_block])
  parser.enum_for(:each_block).each_slice(1000) { |batch| index_blocks(batch) }
  db.synchronize(true)
end
458
+
459
# Store the index entries for a batch of blocks with one bulk write.
#
# The per-block entries are merged into a fresh hash, so an empty batch
# yields an empty hash (rather than nil) and no hash returned by
# #entries_for is modified.
#
# @param [Enumerable] blocks alignment blocks to index
# @return the result of the bulk write
def index_blocks(blocks)
  entries = blocks.each_with_object({}) do |block, merged|
    merged.merge!(entries_for(block))
  end
  db.set_bulk(entries, false)
end
463
+
464
# Read the "sequence:<name>" metadata records and rebuild the
# name => integer ID table in @index_sequences.
#
# @return [Hash] the loaded table
def load_index_sequences
  @index_sequences = db.match_prefix("sequence:").each_with_object({}) do |key, table|
    # everything after the first colon is the sequence name
    table[key.split(':', 2)[1]] = db[key].to_i
  end
end
473
+
474
# Write one "sequence:<name>" metadata record per indexed sequence, with
# the ID stored in its string form.
def store_index_sequences!
  index_sequences.each do |seq_name, seq_id|
    record_key = "sequence:#{seq_name}"
    db.set(record_key, seq_id.to_s)
  end
end
479
+
480
# Read the "species:<name>" metadata records into @species and set
# @species_max_id to the highest ID found, or -1 when there are none.
def load_species
  db.match_prefix("species:").each do |key|
    species_name = key.split(':', 2)[1]
    @species[species_name] = db[key].to_i
  end
  @species_max_id = @species.values.max || -1
end
488
+
489
# Return the species ID for a sequence name of the form
# "<species>.<rest>", assigning and storing the next free ID the first
# time a species is seen.
#
# Only the first dot separates the species; the rest may contain further
# dots (example: otoGar1.scaffold_104707.1-93001).
#
# @param [String] seq sequence name as found in the MAF file
# @return [Integer, nil] the species ID, or nil when the name contains
#  no dot
# @raise [RuntimeError] if a new species would exceed MAX_SPECIES
def species_id_for_seq(seq)
  parts = seq.split('.', 2)
  # not in species.sequence format, apparently
  return nil unless parts.size == 2

  species_name = parts.first
  return species[species_name] if species.has_key?(species_name)

  next_id = @species_max_id + 1
  if next_id >= MAX_SPECIES
    raise "cannot index MAF file with more than #{MAX_SPECIES} species"
  end
  species[species_name] = next_id
  db["species:#{species_name}"] = next_id
  @species_max_id = next_id
  next_id
end
512
+
513
# Build the packed index value for a block: offset, size, text size,
# number of sequences, and a bit vector with one bit per species present.
#
# Sequences whose name yields no species ID (#species_id_for_seq returns
# nil for names without a dot) contribute no bit, instead of aborting
# with a TypeError on `1 << nil`. They are still counted in the number
# of sequences.
#
# @param block alignment block responding to #offset, #size, #text_size
#  and #sequences
# @return [String] the value packed with VAL_FMT
def build_block_value(block)
  vec = block.sequences.reduce(0) do |bits, seq|
    species_id = species_id_for_seq(seq.source)
    species_id ? bits | (1 << species_id) : bits
  end
  [block.offset,
   block.size,
   block.text_size,
   block.sequences.size,
   vec].pack(VAL_FMT)
end
522
+
523
# Build the index entries for one block: one key per sequence of the
# block that is being indexed, all mapping to the same packed value.
#
# @param block alignment block
# @return [Hash] packed key => packed value; empty when the block has no
#  indexed sequence
def entries_for(block)
  packed_value = build_block_value(block)
  block.sequences.each_with_object({}) do |seq, entries|
    seq_id = index_sequences[seq.source]
    # sequences without an ID are not being indexed
    next unless seq_id
    seq_end = seq.start + seq.size
    bin = Bio::Ucsc::UcscBin.bin_from_range(seq.start, seq_end)
    # 255 is the marker byte that prefixes every index key
    packed_key = [255, seq_id, bin, seq.start, seq_end].pack(KEY_FMT)
    entries[packed_key] = packed_value
  end
end
536
+ end # class KyotoIndex
537
+
538
# Common base for index-entry filters. Subclasses provide #match(entry);
# #call forwards to it so a filter can be invoked like a callable.
class Filter
  include KVHelpers

  # @param e index entry to test
  # @return the result of #match for the entry
  def call(e)
    match(e)
  end
end
545
+
546
# Accepts entries whose species vector has the bit of every requested
# species set.
class AllSpeciesFilter < Filter
  # NOTE(review): nothing in this class assigns @bs, so this reader
  # appears to always return nil -- confirm before relying on it.
  attr_reader :bs

  # @param species names of the required species
  # @param idx index whose #species table maps names to IDs; an unknown
  #  name makes the lookup (Hash#fetch) fail
  def initialize(species, idx)
    @mask = species.reduce(0) do |mask, name|
      mask | (1 << idx.species.fetch(name))
    end
  end

  # @return [Boolean] true when all required species bits are present
  def match(entry)
    (extract_species_vec(entry) & @mask) == @mask
  end
end
558
+
559
# Accepts entries for blocks containing at least `n` sequences.
class AtLeastNSequencesFilter < Filter
  attr_reader :n

  # @param n minimum number of sequences
  # @param idx index being queried (not used by this filter)
  def initialize(n, idx)
    @n = n
  end

  # @return [Boolean] true when the entry's sequence count is >= n
  def match(entry)
    sequence_count = extract_n_sequences(entry)
    sequence_count >= @n
  end
end
569
+
570
# Accepts entries for blocks whose text size is at most `n`.
class MaxSizeFilter < Filter
  # @param n maximum text size
  # @param idx index being queried (not used by this filter)
  def initialize(n, idx)
    @n = n
  end

  # @return [Boolean] true when the entry's text size is <= n
  def match(entry)
    text_size = extract_text_size(entry)
    text_size <= @n
  end
end
578
+
579
# Accepts entries for blocks whose text size is at least `n`.
class MinSizeFilter < Filter
  # @param n minimum text size
  # @param idx index being queried (not used by this filter)
  def initialize(n, idx)
    @n = n
  end

  # @return [Boolean] true when the entry's text size is >= n
  def match(entry)
    text_size = extract_text_size(entry)
    text_size >= @n
  end
end
587
+
588
# A set of filters that must all accept an index entry.
class Filters
  include KVHelpers

  # Filter-spec key => class implementing that filter.
  FILTER_CLASSES = {
    :with_all_species => MAF::AllSpeciesFilter,
    :at_least_n_sequences => MAF::AtLeastNSequencesFilter,
    :min_size => MAF::MinSizeFilter,
    :max_size => MAF::MaxSizeFilter
  }

  # Build a filter set from a spec Hash.
  # @param [Hash] spec filter key => filter argument
  # @param idx index handed to each filter's constructor
  # @return [Filters]
  # @raise [RuntimeError] for a key not present in FILTER_CLASSES
  def self.build(spec, idx)
    built = spec.map do |key, val|
      filter_class = FILTER_CLASSES[key]
      raise "Unsupported filter key #{key}!" unless filter_class
      filter_class.new(val, idx)
    end
    Filters.new(built)
  end

  # @param l list of objects responding to #call(entry)
  def initialize(l)
    @l = l
  end

  # @return [Boolean] true when no filter rejects the entry (always true
  #  for an empty set)
  def match(entry)
    @l.all? { |filter| filter.call(entry) }
  end
end
617
+
618
+ end # module MAF
619
+
620
+ end