bio-maf 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/.simplecov +1 -0
  3. data/.travis.yml +16 -0
  4. data/.yardopts +3 -0
  5. data/DEVELOPMENT.md +40 -0
  6. data/Gemfile +23 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +209 -0
  9. data/Rakefile +76 -0
  10. data/VERSION +1 -0
  11. data/benchmarks/dispatch_bench +53 -0
  12. data/benchmarks/iter_bench +44 -0
  13. data/benchmarks/read_bench +40 -0
  14. data/benchmarks/sort_bench +33 -0
  15. data/benchmarks/split_bench +33 -0
  16. data/bin/maf_count +82 -0
  17. data/bin/maf_dump_blocks +27 -0
  18. data/bin/maf_extract_ranges_count +44 -0
  19. data/bin/maf_index +88 -0
  20. data/bin/maf_parse_bench +94 -0
  21. data/bin/maf_to_fasta +68 -0
  22. data/bin/maf_write +84 -0
  23. data/bin/random_ranges +35 -0
  24. data/features/maf-indexing.feature +31 -0
  25. data/features/maf-output.feature +29 -0
  26. data/features/maf-parsing.feature +44 -0
  27. data/features/maf-querying.feature +75 -0
  28. data/features/maf-to-fasta.feature +50 -0
  29. data/features/step_definitions/convert_steps.rb +45 -0
  30. data/features/step_definitions/index_steps.rb +20 -0
  31. data/features/step_definitions/output_steps.rb +27 -0
  32. data/features/step_definitions/parse_steps.rb +63 -0
  33. data/features/step_definitions/query_steps.rb +31 -0
  34. data/features/step_definitions/ucsc_bin_steps.rb +14 -0
  35. data/features/support/env.rb +16 -0
  36. data/features/ucsc-bins.feature +24 -0
  37. data/lib/bio-maf.rb +12 -0
  38. data/lib/bio-maf/maf.rb +3 -0
  39. data/lib/bio/maf.rb +4 -0
  40. data/lib/bio/maf/index.rb +620 -0
  41. data/lib/bio/maf/parser.rb +888 -0
  42. data/lib/bio/maf/struct.rb +63 -0
  43. data/lib/bio/maf/writer.rb +63 -0
  44. data/lib/bio/ucsc.rb +2 -0
  45. data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
  46. data/lib/bio/ucsc/ucsc_bin.rb +117 -0
  47. data/man/.gitignore +1 -0
  48. data/man/maf_index.1 +105 -0
  49. data/man/maf_index.1.markdown +97 -0
  50. data/man/maf_index.1.ronn +83 -0
  51. data/man/maf_to_fasta.1 +53 -0
  52. data/man/maf_to_fasta.1.ronn +51 -0
  53. data/spec/bio/maf/index_spec.rb +363 -0
  54. data/spec/bio/maf/parser_spec.rb +354 -0
  55. data/spec/bio/maf/struct_spec.rb +75 -0
  56. data/spec/spec_helper.rb +14 -0
  57. data/test/data/big-block.maf +15999 -0
  58. data/test/data/chr22_ieq.maf +11 -0
  59. data/test/data/chrY-1block.maf +6 -0
  60. data/test/data/empty +0 -0
  61. data/test/data/empty.db +0 -0
  62. data/test/data/mm8_chr7_tiny.kct +0 -0
  63. data/test/data/mm8_chr7_tiny.maf +76 -0
  64. data/test/data/mm8_mod_a.maf +7 -0
  65. data/test/data/mm8_single.maf +13 -0
  66. data/test/data/mm8_subset_a.maf +23 -0
  67. data/test/data/t1-bad1.maf +15 -0
  68. data/test/data/t1.fasta +12 -0
  69. data/test/data/t1.maf +15 -0
  70. data/test/data/t1a.maf +17 -0
  71. data/test/helper.rb +18 -0
  72. data/test/test_bio-maf.rb +7 -0
  73. data/travis-ci/install_kc +13 -0
  74. data/travis-ci/install_kc_java +13 -0
  75. data/travis-ci/report_errors +4 -0
  76. metadata +182 -0
@@ -0,0 +1,20 @@
1
# Cucumber steps for building, opening and querying MAF indexes.

# Builds an index for the file held by @parser. '%' is the path handed to
# KyotoIndex.build (presumably a Kyoto Cabinet in-memory pseudo-path -- confirm).
When /^build an index on the reference sequence$/ do
  @idx = Bio::MAF::KyotoIndex.build(@parser, '%')
end

# Opens an index file located under the shared $test_data directory.
Given /^a Kyoto Cabinet index file "(.*?)"$/ do |file_name|
  index_path = $test_data + file_name
  @idx = Bio::MAF::KyotoIndex.open(index_path)
end

Then /^the index has at least (\d+) entries$/ do |minimum|
  @idx.db.count.should be >= minimum.to_i
end

# Runs a single-interval query; @block_filter may be nil when no filter step ran.
When /^search for blocks between positions (\d+) and (\d+) of (\S+)$/ do |from, to, chrom|
  interval = Bio::GenomicInterval.zero_based(chrom, from.to_i, to.to_i)
  @blocks = @idx.find([interval], @parser, @block_filter).to_a
end

Then /^(\d+) blocks? (?:is|are) obtained$/ do |expected_count|
  @blocks.size.should == expected_count.to_i
end
@@ -0,0 +1,27 @@
1
# Cucumber steps for writing MAF output and comparing it with reference files.

# Output goes to a temporary ".maf" file kept in @dst.
When /^open a new MAF writer$/ do
  @dst = Tempfile.new(["cuke", ".maf"])
  @writer = Bio::MAF::Writer.new(@dst)
end

When /^write the header from the original MAF file$/ do
  @writer.write_header(@parser.header)
end

When /^write all the parsed blocks$/ do
  @writer.write_blocks(@parser.parse_blocks)
end

# Matcher that passes when `diff --ignore-space-change` reports no difference
# between the file at +expected+ and the file at +actual+.
#
# diff is invoked with an argument array rather than an interpolated shell
# string, so paths containing spaces or shell metacharacters are passed
# through verbatim instead of being split or interpreted by a shell.
RSpec::Matchers.define :match_except_ws do |expected|
  match do |actual|
    system('diff', '--ignore-space-change', '--brief',
           expected.to_s, actual.to_s,
           :out => File::NULL, :err => File::NULL)
  end

  failure_message_for_should do |actual|
    msg = "File contents did not match. Diff:\n"
    msg << IO.popen(['diff', '--unified', '--ignore-space-change',
                     expected.to_s, actual.to_s], &:read)
  end
end

# Compares the temporary output file with a reference file under $test_data.
Then /^the output should match, except whitespace, "(.+)"$/ do |ref|
  @dst.path.should match_except_ws($test_data + ref)
end
@@ -0,0 +1,63 @@
1
# Cucumber steps for opening MAF files and inspecting parsed headers,
# blocks and sequences.
#
# Several steps turn the human-readable attribute captured from the feature
# text (e.g. "text size") into a reader name ("text_size") and call it with
# #send on the sequence under test.

When /^I open it with a MAF reader$/ do
  parser_opts = @opts || {}
  @parser = Bio::MAF::Parser.new(@src_f, parser_opts)
end

When /^I enable the :(\S+) parser option$/ do |option_name|
  @opts ||= {}
  @opts[option_name.to_sym] = true
end

Then /^the MAF version should be "(.*?)"$/ do |expected|
  @parser.header.version.to_s.should == expected
end

Then /^the scoring scheme should be "(.*?)"$/ do |expected|
  @parser.header.scoring.should == expected
end

Then /^the alignment parameters should be "(.*?)"$/ do |expected|
  @parser.header.alignment_params.should == expected
end

Then /^an alignment block can be obtained$/ do
  @block = @parser.parse_block
  @block.should_not be_nil
end

Then /^the alignment block has (\d+) sequences$/ do |expected_count|
  @block.sequences.size.should == expected_count.to_i
end

# String-valued attribute of the sequence at a given position in @block.
Then /^sequence (\d+) has (\w.*?) "(.*?)"$/ do |index, attribute, expected|
  reader = attribute.tr(' ', '_').to_sym
  @block.raw_seq(index.to_i).send(reader).should == expected
end

# Integer-valued attribute of the sequence at a given position in @block.
Then /^sequence (\d+) has (\w.*?) (\d+)\s*$/ do |index, attribute, expected|
  reader = attribute.tr(' ', '_').to_sym
  @block.raw_seq(index.to_i).send(reader).should == expected.to_i
end

# Symbol-valued attribute of the sequence at a given position in @block.
Then /^sequence (\d+) has (\w.*?) :(\S+)\s*$/ do |index, attribute, expected|
  reader = attribute.tr(' ', '_').to_sym
  expected_sym = expected.to_sym
  @block.raw_seq(index.to_i).send(reader).should == expected_sym
end

# The remaining steps look a sequence up by source name inside @blocks[n].
Then /^sequence (\S+) of block (\d+) has (\w.*?) "(.*?)"$/ do |source, block_index, attribute, expected|
  found = @blocks[block_index.to_i].sequences.find { |s| s.source == source }
  reader = attribute.tr(' ', '_').to_sym
  found.send(reader).should == expected
end

Then /^sequence (\S+) of block (\d+) has (\w.*?) (\d+)$/ do |source, block_index, attribute, expected|
  found = @blocks[block_index.to_i].sequences.find { |s| s.source == source }
  reader = attribute.tr(' ', '_').to_sym
  found.send(reader).should == expected.to_i
end

Then /^sequence (\S+) of block (\d+) has (\w.*?) :(\S+)$/ do |source, block_index, attribute, expected|
  found = @blocks[block_index.to_i].sequences.find { |s| s.source == source }
  reader = attribute.tr(' ', '_').to_sym
  found.send(reader).should == expected.to_sym
end
@@ -0,0 +1,31 @@
1
# Cucumber steps that configure sequence and block filters for later queries.
#
# Sequence filters are set directly on @parser; block filters are stored in
# @block_filter as a Hash for a later search step to pass along.

# The table argument is a Cucumber::Ast::Table; its first column lists species.
When /^filter for only the species$/ do |table|
  species_names = table.raw.map { |cells| cells[0] }
  @parser.sequence_filter = { :only_species => species_names }
end

# The table argument is a Cucumber::Ast::Table; its first column lists species.
When /^filter for blocks with the species$/ do |table|
  species_names = table.raw.map { |cells| cells[0] }
  @block_filter = { :with_all_species => species_names }
end

When /^filter for blocks with at least (\d+) sequences$/ do |count|
  @block_filter = { :at_least_n_sequences => count.to_i }
end

# "least" maps to :min_size and "most" to :max_size; the step regexp only
# admits those two words, so the raise is a safety net.
When /^filter for blocks with text size at (least|most) (\d+)$/ do |bound, size|
  filter_key = if bound == 'least'
                 :min_size
               elsif bound == 'most'
                 :max_size
               else
                 raise "bad operator #{bound}!"
               end
  @block_filter = { filter_key => size.to_i }
end

When /^filter for blocks with text size between (\d+) and (\d+)$/ do |lower, upper|
  size_bounds = {}
  size_bounds[:min_size] = lower.to_i
  size_bounds[:max_size] = upper.to_i
  @block_filter = size_bounds
end
@@ -0,0 +1,14 @@
1
+ #require 'bio-ucsc-api'
2
+
3
# Cucumber steps exercising Bio::Ucsc::UcscBin.bin_from_range.

# Remembers the region bounds as integers for the following steps.
Given /^I have a region with start (\d+) and end (\d+)$/ do |region_start, region_end|
  @r_start = region_start.to_i
  @r_end = region_end.to_i
end

When /^I compute the smallest containing bin$/ do
  @bin = Bio::Ucsc::UcscBin.bin_from_range(@r_start, @r_end)
end

Then /^the bin should be (\d+)$/ do |bin_number|
  @bin.should == bin_number.to_i
end
@@ -0,0 +1,16 @@
1
# Cucumber support environment: optional coverage, library load path and the
# shared test-data location.

# Coverage is attempted only off Travis and off JRuby; a missing simplecov
# gem produces a warning rather than a failure.
coverage_wanted = !ENV.key?('TRAVIS') && RUBY_PLATFORM != 'java'
if coverage_wanted
  begin
    require 'simplecov'
  rescue LoadError => e
    $stderr.puts "WARNING: could not require 'simplecov': #{e}"
  end
end

require 'pathname'
require 'tempfile'

# Make the gem's lib directory (three levels up from this file) loadable.
$LOAD_PATH.push(File.expand_path('../../../lib', __FILE__))

require 'bio-maf'

# Relative path, so it is resolved against the current working directory.
$test_data = Pathname.new('test/data')
@@ -0,0 +1,24 @@
1
+ Feature: Computation of UCSC bins
2
+ In order to efficiently use indexes
3
+ We will use the UCSC bin indexing system
4
+ Per http://genomewiki.ucsc.edu/index.php/Bin_indexing_system
5
+
6
+ Scenario Outline: Compute smallest containing bin
7
+ Given I have a region with start <Start> and end <End>
8
+ When I compute the smallest containing bin
9
+ Then the bin should be <Bin>
10
+
11
+ Examples:
12
+ | Start | End | Bin |
13
+ | 25079603 | 25079787 | 776 |
14
+ | 25128173 | 25128248 | 776 |
15
+ | 50312474 | 50312703 | 968 |
16
+ | 41905591 | 41906101 | 904 |
17
+ | 16670899 | 16673060 | 712 |
18
+ | 75495356 | 75495494 | 1160 |
19
+ | 92259501 | 92261053 | 1288 |
20
+ | 83834063 | 83838132 | 1224 |
21
+ | 7309597 | 7310411 | 640 |
22
+ | 6190410 | 6190999 | 632 |
23
+ # from https://github.com/polyatail/biopython/blob/af34c033d78c4c72dffbb500e513e568a2ba5e29/Tests/test_MafIO_index.py#L48
24
+
@@ -0,0 +1,12 @@
1
+ # Please require your code below, respecting the naming conventions in the
2
+ # bioruby directory tree.
3
+ #
4
+ # For example, say you have a plugin named bio-plugin, the only uncommented
5
+ # line in this file would be
6
+ #
7
+ # require 'bio/bio-plugin/plugin'
8
+ #
9
+ # In this file only require other files. Avoid other source code.
10
+
11
+ require 'bio/ucsc'
12
+ require 'bio/maf'
@@ -0,0 +1,3 @@
1
# Empty namespace module; nothing is defined inside it in this file.
module BioMaf

end
@@ -0,0 +1,4 @@
1
+ require 'bio/maf/struct'
2
+ require 'bio/maf/index'
3
+ require 'bio/maf/parser'
4
+ require 'bio/maf/writer'
@@ -0,0 +1,620 @@
1
+ require 'kyotocabinet'
2
+ require 'jruby/profiler' if RUBY_PLATFORM == 'java'
3
+
4
+ #require 'bio-ucsc-api'
5
+ require 'bio-genomic-interval'
6
+
7
+ module Bio
8
+
9
+ module MAF
10
+
11
+ # Binary record packing and unpacking.
12
+ # @api private
13
module KVHelpers

  # Field layout of an index key.
  KEY = Struct.new([[:marker,    :uint8],
                    [:seq_id,    :uint8],
                    [:bin,       :uint16],
                    [:seq_start, :uint32],
                    [:seq_end,   :uint32]])

  # Field layout of an index value.
  VAL = Struct.new([[:offset,      :uint64],
                    [:length,      :uint32],
                    [:text_size,   :uint32],
                    [:n_seq,       :uint8],
                    [:species_vec, :uint64]])

  # Pack/unpack format strings derived from the key layout.
  KEY_FMT = KEY.fmt
  KEY_SCAN_FMT = KEY.extractor_fmt(:seq_id, :bin, :seq_start, :seq_end)
  CHROM_BIN_PREFIX_FMT = KEY.extractor_fmt(:marker, :seq_id, :bin)

  # Pack/unpack format strings derived from the value layout.
  VAL_FMT = VAL.fmt
  VAL_IDX_OFFSET_FMT = VAL.extractor_fmt(:offset, :length)
  VAL_TEXT_SIZE_FMT = VAL.extractor_fmt(:text_size)
  VAL_N_SEQ_FMT = VAL.extractor_fmt(:n_seq)
  VAL_SPECIES_FMT = VAL.extractor_fmt(:species_vec)

  # Everything below is callable on the module itself and, when the module
  # is included, as a private instance method.
  module_function

  # In the extract_* helpers, +entry+ is a [key, value] pair of packed
  # strings; only the value (entry[1]) is decoded.

  # @return the species bit vector stored in the entry's value
  def extract_species_vec(entry)
    packed_value = entry[1]
    packed_value.unpack(VAL_SPECIES_FMT).first
  end

  # @return the number of sequences stored in the entry's value
  def extract_n_sequences(entry)
    packed_value = entry[1]
    packed_value.unpack(VAL_N_SEQ_FMT).first
  end

  # @return [Array] the offset and length fields of the entry's value
  def extract_index_offset(entry)
    packed_value = entry[1]
    packed_value.unpack(VAL_IDX_OFFSET_FMT)
  end

  # @return the text size stored in the entry's value
  def extract_text_size(entry)
    packed_value = entry[1]
    packed_value.unpack(VAL_TEXT_SIZE_FMT).first
  end

  # Decode every field of a packed key string.
  # @return [Array] fields in KEY order
  def unpack_key(ks)
    ks.unpack(KEY_FMT)
  end

  # Packed key prefix (0xFF marker, chromosome id, bin) used to position a
  # cursor at the first record of a bin.
  def bin_start_prefix(chrom_id, bin)
    prefix_fields = [0xFF, chrom_id, bin]
    prefix_fields.pack(CHROM_BIN_PREFIX_FMT)
  end
end
63
+
64
+ class KyotoIndex
65
+ include KVHelpers
66
+
67
+ attr_reader :db, :species, :species_max_id
68
+ attr_accessor :index_sequences
69
+
70
+ FORMAT_VERSION_KEY = 'bio-maf:index-format-version'
71
+ FORMAT_VERSION = 2
72
+ MAX_SPECIES = 64
73
+
74
+ ## Key-value store index format
75
+ ##
76
+ ## This format is designed for Kyoto Cabinet but should work on
77
+ ## other key-value databases allowing binary data.
78
+ ##
79
+ ## Index metadata is stored as ASCII text, but index data is
80
+ ## stored as packed binary values.
81
+ ##
82
+ ## Index metadata:
83
+ ##
84
+ ## Sequence IDs:
85
+ ## sequence:<name> => <id>
86
+ ##
87
+ ## Each indexed sequence has a corresponding entry of this
88
+ ## kind. The <name> parameter is the sequence or chromosome
89
+ ## name as found in the MAF file, e.g. mm8.chr7. The <id>
90
+ ## parameter is assigned when the sequence is indexed, and
91
+ ## can be from 0 to 255.
92
+ ##
93
+ ## Species IDs:
94
+ ## species:<name> => <id>
95
+ ##
96
+ ## Each indexed species has a corresponding entry of this
97
+ ## kind. The <name> parameter is the species part of the
98
+ ## sequence name as found in the MAF file, e.g. 'mm8' for
99
+ ## 'mm8.chr7'. The <id> parameter is assigned when the
100
+ ## species is indexed, and can be from 0 to 63 (MAX_SPECIES - 1).
101
+ ##
102
+ ## Index data:
103
+ ##
104
+ ## For each sequence upon which an index is built, one index
105
+ ## entry is generated per MAF alignment block. The key
106
+ ## identifies the sequence, the UCSC index bin, and the
107
+ ## zero-based start and end positions of the sequence. The
108
+ ## value gives the offset and size of the alignment block
109
+ ## within the MAF file.
110
+ ##
111
+ ## All values are stored as big-endian, unsigned packed binary
112
+ ## data.
113
+ ##
114
+ ## Keys: (12 bytes) [CCS>L>L>]
115
+ ##
116
+ ## 0xFF (1 byte):
117
+ ## index entry prefix
118
+ ## Sequence chromosome ID (1 byte):
119
+ ## corresponds to sequence:<name> entries
120
+ ## UCSC bin (16 bits)
121
+ ## Sequence start, zero-based, inclusive (32 bits)
122
+ ## Sequence end, zero-based, exclusive (32 bits)
123
+ ##
124
+ ## Values (25 bytes) [Q>L>L>CQ>]
125
+ ##
126
+ ## MAF file offset (64 bits)
127
+ ## MAF alignment block length (32 bits)
128
+ ## Block text size (32 bits)
129
+ ## Number of sequences in block (8 bits)
130
+ ## Species bit vector (64 bits)
131
+ ##
132
+ ## Example:
133
+ ##
134
+ ## For a block with sequence 0, bin 1195, start 80082334, end
135
+ ## 80082368, MAF offset 16, and MAF block length 1087:
136
+ ##
137
+ ## | |id| bin | seq_start | seq_end |
138
+ ## key: FF 00 04 AB 04 C5 F5 9E 04 C5 F5 C0
139
+ ##
140
+ ## | offset | length | ts |ns| species_vec |
141
+ ## val: 00 00 00 00 00 00 00 10 00 00 04 3F [TODO]
142
+
143
+ #### Public API
144
+
145
+ # Open an existing index for reading.
146
+ # @param [String] path path to existing Kyoto Cabinet index
147
+ # @return [KyotoIndex]
148
def self.open(path)
  # Construction does all the work; the instance is returned as-is.
  KyotoIndex.new(path)
end
151
+
152
+ # Build a new index from the MAF file being parsed by `parser`,
153
+ # and store it in `path`.
154
+ # @param [Parser] parser MAF parser for file to index
155
+ # @param [String] path path to index file to create
156
+ # @return [KyotoIndex]
157
def self.build(parser, path)
  # Create the index object first, then populate it from the parser; the
  # populated object is the return value.
  self.new(path).tap { |index| index.build_default(parser) }
end
162
+
163
+ # Find all alignment blocks in the genomic regions in the list
164
+ # of Bio::GenomicInterval objects, and parse them with the given
165
+ # parser.
166
+ #
167
+ # An optional Hash of filters may be passed in. The following
168
+ # keys are used:
169
+ #
170
+ # * `:with_all_species => ["sp1", "sp2", ...]`
171
+ #
172
+ # Only match alignment blocks containing all given species.
173
+ #
174
+ # * `:at_least_n_sequences => n`
175
+ #
176
+ # Only match alignment blocks with at least N sequences.
177
+ #
178
+ # * `:min_size => n`
179
+ #
180
+ # Only match alignment blocks with text size at least N.
181
+ #
182
+ # * `:max_size => n`
183
+ #
184
+ # Only match alignment blocks with text size at most N.
185
+ #
186
+ # @param [Enumerable<Bio::GenomicInterval>] intervals genomic
187
+ # intervals to parse.
188
+ # @param [Parser] parser MAF parser for file to fetch blocks
189
+ # from.
190
+ # @param [Hash] filter Block filter expression.
191
+ # @return [Array<Block>]
192
+ # @api public
193
def find(intervals, parser, filter={})
  # Resolve the intervals to a fetch list, report how long that took on
  # stderr, then let the parser read the listed blocks.
  began = Time.now
  fetch_entries = fetch_list(intervals, filter)
  elapsed = Time.now - began
  $stderr.printf("Built fetch list of %d items in %.3fs.\n",
                 fetch_entries.size,
                 elapsed)
  parser.fetch_blocks(fetch_entries)
end
201
+
202
+ # Close the underlying Kyoto Cabinet database handle.
203
def close
  # Delegates to the handle exposed by the +db+ reader; no other state of
  # this object is touched.
  db.close
end
206
+
207
+ #### KyotoIndex Internals
208
+ # @api private
209
+
210
# Open or create the index database.
#
# The index is treated as read-only when an already-open handle is supplied
# or when +path+ names an existing file; otherwise the database is opened
# for writing and created if necessary. In read-only mode the sequence and
# species tables are loaded from the database.
#
# @param path [String, Pathname] index location
# @param db_arg [KyotoCabinet::DB, nil] already-open handle to reuse
# @raise [RuntimeError] if the database cannot be opened
def initialize(path, db_arg=nil)
  @species = {}
  @species_max_id = -1
  @path = path
  # Measure the path text, not the object: Pathname#size stats the file
  # (file size, or Errno::ENOENT when it is missing), whereas the intent
  # here is the length of the path string. One-character paths such as '%'
  # are never treated as existing files (presumably because they are Kyoto
  # Cabinet pseudo-paths -- confirm).
  path_s = path.to_s
  existing = db_arg || (path_s.size > 1 && File.exist?(path_s))
  mode = if existing
           KyotoCabinet::DB::OREADER
         else
           KyotoCabinet::DB::OWRITER | KyotoCabinet::DB::OCREATE
         end
  @db = db_arg || KyotoCabinet::DB.new
  # A supplied handle is used as-is and never re-opened.
  unless db_arg || db.open(path_s, mode)
    raise "Could not open DB file!"
  end
  if mode == KyotoCabinet::DB::OREADER
    load_index_sequences
    load_species
  end
end
228
+
229
+ # Reopen the same DB handle read-only. Only useful for unit tests.
230
def reopen
  # Handing @db over as the second argument makes the new object reuse this
  # handle instead of calling db.open on @path again.
  KyotoIndex.new(@path, @db)
end
233
+
234
# Write a human-readable listing of the whole index to +stream+: first the
# metadata records, then every index record with its decoded key and value.
#
# @param stream [IO] destination, $stdout by default
# @raise [RuntimeError] if the cursor runs out of records, or cannot be
#   advanced, while still inside the metadata section
def dump(stream=$stdout)
  stream.puts "KyotoIndex dump: #{@path}"
  stream.puts
  if db.count == 0
    stream.puts "Empty database!"
    return
  end
  db.cursor_process do |cur|
    stream.puts "== Metadata =="
    cur.jump('')
    while true
      k, v = cur.get(false)
      raise "unexpected end of records!" unless k
      # Index keys start with the 0xFF marker byte. Compare the raw byte:
      # a one-character string comparison against "\xff" depends on both
      # strings' encodings and is false for a binary key vs. a UTF-8 literal.
      break if k.getbyte(0) == 0xFF
      stream.puts "#{k}: #{v}"
      unless cur.step
        raise "could not advance cursor!"
      end
    end
    stream.puts "== Index records =="
    # get(true) returns the current record and advances the cursor.
    while pair = cur.get(true)
      _, chr, bin, s_start, s_end = pair[0].unpack(KEY_FMT)
      offset, len, text_size, n_seq, species_vec = pair[1].unpack(VAL_FMT)
      stream.puts "#{chr} [bin #{bin}] #{s_start}:#{s_end}"
      stream.puts " offset #{offset}, length #{len}"
      stream.puts " text size: #{text_size}"
      stream.puts " sequences in block: #{n_seq}"
      stream.printf(" species vector: %016x\n", species_vec)
    end
  end
end
265
+
266
+ ## Retrieval:
267
+ ## 1. merge the intervals of interest
268
+ ## 2. for each interval, compute the bins with #bin_all
269
+ ## 3. for each bin to search, make a list of intervals of
270
+ ## interest
271
+ ## 4. compute the spanning interval for that bin
272
+ ## 5. start at the beginning of the bin
273
+ ## 6. if a record intersects the spanning interval:
274
+ ## A. #find an interval it intersects
275
+ ## B. if found, add to the fetch list
276
+ ## 7. if a record starts past the end of the spanning interval,
277
+ ## we are done scanning this bin.
278
+ ##
279
+ ## Optimizations:
280
+ ## * once we reach the start of the spanning interval,
281
+ ## all records start in it until we see a record starting
282
+ ## past it.
283
+ ## * as record starts pass the start of intervals of interest,
284
+ ## pull those intervals off the list
285
+
286
+ # Build a fetch list of alignment blocks to read, given an array
287
+ # of Bio::GenomicInterval objects
288
def fetch_list(intervals, filter_spec={})
  # Returns whatever the bin scan returns for the intervals' chromosome.
  # An empty interval list yields an empty fetch list.
  #
  # Raises RuntimeError when the chromosome is not indexed, when the
  # intervals span more than one chromosome, or (via Filters.build) when
  # the filter spec contains an unsupported key.
  start = Time.now
  filter_spec ||= {}
  filters = Filters.build(filter_spec, self)
  first_interval = intervals.first
  # Nothing requested, nothing to fetch (previously a NoMethodError on nil).
  return [] unless first_interval
  chrom = first_interval.chrom
  chrom_id = index_sequences[chrom]
  unless chrom_id
    raise "chromosome #{chrom} not indexed!"
  end
  if intervals.any? { |i| i.chrom != chrom }
    raise "all intervals must be for the same chromosome!"
  end
  # for each bin, build a list of the intervals to look for there
  bin_intervals = Hash.new { |h, k| h[k] = [] }
  intervals.each do |i|
    i.bin_all.each do |bin|
      bin_intervals[bin] << (i.zero_start...i.zero_end)
    end
  end
  # each per-bin list must be ordered by start for the scan
  bin_intervals.each_value do |ranges|
    ranges.sort_by! { |r| r.begin }
  end
  ready = Time.now
  $stderr.puts "bin intervals computed after #{ready - start} seconds."
  if RUBY_PLATFORM == 'java'
    scan_bins_parallel(chrom_id, bin_intervals, filters)
  else
    scan_bins(chrom_id, bin_intervals, filters)
  end
end # #fetch_list
318
+
319
+ # Scan the index for blocks matching the given bins and intervals.
320
def scan_bins(chrom_id, bin_intervals, filters)
  # One cursor is shared by every bin; each bin's matches are appended in
  # the order the bins are enumerated.
  collected = []
  db.cursor_process do |cursor|
    bin_intervals.each_pair do |bin, ranges|
      collected.concat(scan_bin(cursor, chrom_id, bin, ranges, filters))
    end
  end
  collected
end
330
+
331
# Run the given block, wrapping it in the JRuby profiler when running on
# JRuby with the "profile" environment variable set; the flat profile is
# printed to STDERR. The block's value is returned either way.
def with_profiling
  profiling = RUBY_PLATFORM == 'java' && ENV['profile']
  return yield unless profiling

  block_value = nil
  pdata = JRuby::Profiler.profile do
    block_value = yield
  end
  printer = JRuby::Profiler::FlatProfilePrinter.new(pdata)
  printer.printProfile(STDERR)
  block_value
end
344
+
345
# JRuby variant of the bin scan: per-bin jobs are placed on a concurrent
# queue, worker threads scan them, and results are collected here until one
# result per bin has arrived.
#
# Raises RuntimeError if a worker hands back an exception, or if no result
# arrives within a poll interval and no worker thread is alive any more.
def scan_bins_parallel(chrom_id, bin_intervals, filters)
  began = Time.now
  n_threads = ENV['profile'] ? 1 : 4
  jobs = java.util.concurrent.ConcurrentLinkedQueue.new(bin_intervals.to_a)
  completed = java.util.concurrent.LinkedBlockingQueue.new(128)
  threads = (1..n_threads).map do
    make_scan_worker(jobs, completed) do |cur, req|
      bin, intervals = req
      scan_bin(cur, chrom_id, bin, intervals, filters)
    end
  end
  to_fetch = []
  n_completed = 0
  until n_completed >= bin_intervals.size
    c = completed.poll(5, java.util.concurrent.TimeUnit::SECONDS)
    if c.nil?
      # Timed out: keep waiting only while some worker can still deliver.
      next if threads.any? { |t| t.alive? }
      raise "No threads alive, completed #{n_completed}/#{bin_intervals.size} jobs!"
    end
    raise "worker failed: #{c}" if c.is_a? Exception
    to_fetch.concat(c)
    n_completed += 1
  end
  threads.each { |t| t.join }
  $stderr.printf("Matched %d index records with %d threads in %.3f seconds.\n",
                 to_fetch.size, n_threads, Time.now - began)
  to_fetch
end
377
+
378
# Start a thread that drains +jobs+: each polled request is yielded to the
# caller's block together with a cursor, and the block's result is put on
# +completed+. The thread finishes when +jobs.poll+ returns nothing.
#
# On any exception the exception object itself is put on +completed+ (so
# the collecting side sees it), details go to stderr, and the exception is
# re-raised inside the worker thread.
#
# @return [Thread]
def make_scan_worker(jobs, completed)
  Thread.new do
    with_profiling do
      db.cursor_process do |cur|
        while (req = jobs.poll)
          begin
            outcome = yield(cur, req)
            completed.put(outcome)
          rescue Exception => e
            completed.put(e)
            $stderr.puts "Worker failing: #{e.class}: #{e}"
            $stderr.puts e.backtrace.join("\n")
            raise e
          end
        end
      end
    end
  end
end
399
+
400
# Scan one bin of one chromosome and collect the offset/length data of every
# index record that overlaps at least one requested range and passes the
# filters.
#
# +bin_intervals+ must be sorted by range start and is consumed
# destructively: ranges that end before the current record are shifted off
# the front as the scan advances.
def scan_bin(cur, chrom_id, bin, bin_intervals, filters)
  # Bounds of the smallest interval covering every requested range.
  spanning_start = bin_intervals.first.begin
  spanning_end = bin_intervals.map { |range| range.end }.max
  # Position the cursor on the first record of this bin.
  cur.jump(bin_start_prefix(chrom_id, bin))
  matches = []
  while pair = cur.get(true)
    c_chr, c_bin, c_start, c_end = pair[0].unpack(KEY_SCAN_FMT)
    # Next bin, next chromosome, or past the spanning interval: finished.
    break if c_chr != chrom_id || c_bin != bin || c_start >= spanning_end
    # Record ends before anything of interest starts.
    next if c_end < spanning_start

    # Ranges ending before this record starts cannot match it or anything
    # after it.
    bin_intervals.shift while bin_intervals.first.end < c_start

    bin_intervals.each do |range|
      range_start = range.begin
      # Ranges are sorted by start, so none of the rest can overlap.
      break if range_start > c_end
      overlapping = (c_start <= range_start && range_start < c_end) ||
                    range.include?(c_start)
      next unless overlapping && filters.match(pair)
      # One matching range is enough to fetch the block once.
      matches << extract_index_offset(pair)
      break
    end
  end
  matches
end
438
+
439
# True when the range +gi+ and the half-open span [i_start, i_end) share at
# least one position: either +gi+ begins inside the span, or the span's
# start lies inside +gi+.
def overlaps?(gi, i_start, i_end)
  range_start = gi.begin
  begins_inside_span = i_start <= range_start && range_start < i_end
  begins_inside_span || gi.include?(i_start)
end
445
+
446
# Build the index using the first sequence of the first block as the
# (only) indexed sequence, with sequence id 0.
#
# Records the format version, stores the sequence table, indexes the first
# block, then indexes the parser's remaining blocks in batches of 1000 and
# finally synchronizes the database.
#
# @param parser MAF parser positioned before the first block
# @raise [RuntimeError] if the parser yields no first block
def build_default(parser)
  first_block = parser.parse_block
  # Without a first block there is no reference sequence to index; say so
  # instead of failing later with a NoMethodError on nil. (Assumes
  # parse_block returns nil when no block is available -- confirm.)
  raise "MAF file contains no alignment blocks!" unless first_block
  ref_seq = first_block.sequences.first.source
  db[FORMAT_VERSION_KEY] = FORMAT_VERSION
  @index_sequences = { ref_seq => 0 }
  store_index_sequences!
  index_blocks([first_block])
  parser.enum_for(:each_block).each_slice(1000) do |blocks|
    index_blocks(blocks)
  end
  db.synchronize(true)
end
458
+
459
# Store the index entries of all given blocks with one bulk write.
#
# @param blocks [Enumerable] blocks to index; may be empty
# @return the result of db.set_bulk, or 0 when there was nothing to store
def index_blocks(blocks)
  # Merge into a fresh hash so that an empty batch produces {} rather than
  # the nil that reducing an empty list would give.
  entries = blocks.each_with_object({}) do |block, merged|
    merged.merge!(entries_for(block))
  end
  return 0 if entries.empty?
  db.set_bulk(entries, false)
end
463
+
464
# Rebuild @index_sequences (sequence name => integer id) from the
# "sequence:<name>" records in the database.
def load_index_sequences
  @index_sequences = db.match_prefix("sequence:").each_with_object({}) do |key, ids|
    # Split on the first colon only, so names that contain ':' stay intact.
    seq_name = key.split(':', 2)[1]
    ids[seq_name] = db[key].to_i
  end
end
473
+
474
# Write one "sequence:<name>" record per indexed sequence, with the id
# stored as decimal text.
def store_index_sequences!
  index_sequences.each_pair do |seq_name, seq_id|
    record_key = "sequence:#{seq_name}"
    db.set(record_key, seq_id.to_s)
  end
end
479
+
480
# Fill @species (species name => integer id) from the "species:<name>"
# records and remember the highest id seen, or -1 when there are none.
def load_species
  db.match_prefix("species:").each do |key|
    species_name = key.split(':', 2)[1]
    @species[species_name] = db[key].to_i
  end
  @species_max_id = @species.values.max || -1
end
488
+
489
# Return the species id for a sequence source name of the form
# "<species>.<rest>", assigning and persisting a new id the first time a
# species is seen.
#
# @param seq [String] sequence source, e.g. "mm8.chr7"
# @return [Integer, nil] species id, or nil when +seq+ contains no dot
# @raise [RuntimeError] when a new species would exceed MAX_SPECIES
def species_id_for_seq(seq)
  # Only the first dot separates the species; the remainder may contain
  # more dots, e.g. otoGar1.scaffold_104707.1-93001
  species_name, remainder = seq.split('.', 2)
  # not in species.sequence format, apparently
  return nil if remainder.nil?

  return species[species_name] if species.has_key?(species_name)

  new_id = @species_max_id + 1
  if new_id >= MAX_SPECIES
    raise "cannot index MAF file with more than #{MAX_SPECIES} species"
  end
  species[species_name] = new_id
  db["species:#{species_name}"] = new_id
  @species_max_id = new_id
  new_id
end
512
+
513
# Pack the index value for a block: offset, size, text size, sequence
# count and a bit vector with one bit set per species id found in the block.
#
# Sequences whose source yields no species id (species_id_for_seq returns
# nil for names without a dot) contribute no bit; previously they made the
# shift fail with a TypeError. They are still counted in the sequence count.
#
# @param block alignment block responding to offset, size, text_size and
#   sequences
# @return [String] value packed with VAL_FMT
def build_block_value(block)
  vec = block.sequences.reduce(0) do |acc, seq|
    species_id = species_id_for_seq(seq.source)
    species_id ? acc | (1 << species_id) : acc
  end
  [block.offset,
   block.size,
   block.text_size,
   block.sequences.size,
   vec].pack(VAL_FMT)
end
522
+
523
# Build the index entries for one block: a Hash from packed key to packed
# value, with one key per sequence of the block that appears in
# index_sequences. All keys of a block share the same value.
def entries_for(block)
  packed_value = build_block_value(block)
  block.sequences.each_with_object({}) do |seq, entries|
    seq_id = index_sequences[seq.source]
    # sequences that are not being indexed get no entry
    next unless seq_id
    seq_start = seq.start
    seq_end = seq_start + seq.size
    bin = Bio::Ucsc::UcscBin.bin_from_range(seq_start, seq_end)
    packed_key = [255, seq_id, bin, seq_start, seq_end].pack(KEY_FMT)
    entries[packed_key] = packed_value
  end
end
536
+ end # class KyotoIndex
537
+
538
# Base class for filters applied to index entries. It mixes in KVHelpers and
# offers #call as a second entry point; #match itself is not defined here
# and has to come from a subclass.
class Filter
  include KVHelpers

  # Forwards to #match with the same entry and returns its result.
  def call(e)
    match(e)
  end
end
545
+
546
# Filter accepting only index entries whose species bit vector contains
# every requested species.
class AllSpeciesFilter < Filter
  # NOTE(review): nothing in this class assigns @bs, so this reader appears
  # to always return nil -- confirm before relying on it.
  attr_reader :bs

  # @param species [Enumerable<String>] species names that must all be present
  # @param idx index whose +species+ table maps names to bit positions
  # @raise [KeyError] if a name is missing from the index's species table
  def initialize(species, idx)
    @mask = species.reduce(0) do |mask, name|
      mask | (1 << idx.species.fetch(name))
    end
  end

  # True when every bit of the mask is also set in the entry's species vector.
  def match(entry)
    (extract_species_vec(entry) & @mask) == @mask
  end
end
558
+
559
# Filter accepting index entries whose stored sequence count is at least n.
class AtLeastNSequencesFilter < Filter
  attr_reader :n
  # @param n minimum number of sequences
  # @param idx not used by this filter
  def initialize(n, idx)
    @n = n
  end

  # True when the entry's sequence count is greater than or equal to n.
  def match(entry)
    extract_n_sequences(entry) >= @n
  end
end
569
+
570
# Filter accepting index entries whose stored text size is at most n.
class MaxSizeFilter < Filter
  # @param n maximum text size
  # @param idx not used by this filter
  def initialize(n, idx)
    @n = n
  end
  # True when the entry's text size is less than or equal to n.
  def match(entry)
    extract_text_size(entry) <= @n
  end
end
578
+
579
# Filter accepting index entries whose stored text size is at least n.
class MinSizeFilter < Filter
  # @param n minimum text size
  # @param idx not used by this filter
  def initialize(n, idx)
    @n = n
  end
  # True when the entry's text size is greater than or equal to n.
  def match(entry)
    extract_text_size(entry) >= @n
  end
end
587
+
588
# A conjunction of filters built from a filter specification Hash: an index
# entry matches only when every individual filter accepts it.
class Filters
  include KVHelpers

  # Filter class used for each supported specification key.
  FILTER_CLASSES = {
    :with_all_species => MAF::AllSpeciesFilter,
    :at_least_n_sequences => MAF::AtLeastNSequencesFilter,
    :min_size => MAF::MinSizeFilter,
    :max_size => MAF::MaxSizeFilter
  }

  # Instantiate one filter per key of +spec+.
  #
  # @param spec [Hash] filter key => filter argument
  # @param idx index handed to every filter constructor
  # @return [Filters]
  # @raise [RuntimeError] for a key without an entry in FILTER_CLASSES
  def self.build(spec, idx)
    members = spec.map do |key, val|
      filter_class = FILTER_CLASSES[key]
      raise "Unsupported filter key #{key}!" unless filter_class
      filter_class.new(val, idx)
    end
    Filters.new(members)
  end

  # @param l [Array] filters, each responding to #call(entry)
  def initialize(l)
    @l = l
  end

  # True when no filter rejects the entry (so an empty list accepts everything).
  def match(entry)
    @l.all? { |member| member.call(entry) }
  end
end
617
+
618
+ end # module MAF
619
+
620
+ end