bio-maf 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.simplecov +1 -0
- data/.travis.yml +16 -0
- data/.yardopts +3 -0
- data/DEVELOPMENT.md +40 -0
- data/Gemfile +23 -0
- data/LICENSE.txt +20 -0
- data/README.md +209 -0
- data/Rakefile +76 -0
- data/VERSION +1 -0
- data/benchmarks/dispatch_bench +53 -0
- data/benchmarks/iter_bench +44 -0
- data/benchmarks/read_bench +40 -0
- data/benchmarks/sort_bench +33 -0
- data/benchmarks/split_bench +33 -0
- data/bin/maf_count +82 -0
- data/bin/maf_dump_blocks +27 -0
- data/bin/maf_extract_ranges_count +44 -0
- data/bin/maf_index +88 -0
- data/bin/maf_parse_bench +94 -0
- data/bin/maf_to_fasta +68 -0
- data/bin/maf_write +84 -0
- data/bin/random_ranges +35 -0
- data/features/maf-indexing.feature +31 -0
- data/features/maf-output.feature +29 -0
- data/features/maf-parsing.feature +44 -0
- data/features/maf-querying.feature +75 -0
- data/features/maf-to-fasta.feature +50 -0
- data/features/step_definitions/convert_steps.rb +45 -0
- data/features/step_definitions/index_steps.rb +20 -0
- data/features/step_definitions/output_steps.rb +27 -0
- data/features/step_definitions/parse_steps.rb +63 -0
- data/features/step_definitions/query_steps.rb +31 -0
- data/features/step_definitions/ucsc_bin_steps.rb +14 -0
- data/features/support/env.rb +16 -0
- data/features/ucsc-bins.feature +24 -0
- data/lib/bio/maf/index.rb +620 -0
- data/lib/bio/maf/parser.rb +888 -0
- data/lib/bio/maf/struct.rb +63 -0
- data/lib/bio/maf/writer.rb +63 -0
- data/lib/bio/maf.rb +4 -0
- data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
- data/lib/bio/ucsc/ucsc_bin.rb +117 -0
- data/lib/bio/ucsc.rb +2 -0
- data/lib/bio-maf/maf.rb +3 -0
- data/lib/bio-maf.rb +12 -0
- data/man/.gitignore +1 -0
- data/man/maf_index.1 +105 -0
- data/man/maf_index.1.markdown +97 -0
- data/man/maf_index.1.ronn +83 -0
- data/man/maf_to_fasta.1 +53 -0
- data/man/maf_to_fasta.1.ronn +51 -0
- data/spec/bio/maf/index_spec.rb +363 -0
- data/spec/bio/maf/parser_spec.rb +354 -0
- data/spec/bio/maf/struct_spec.rb +75 -0
- data/spec/spec_helper.rb +14 -0
- data/test/data/big-block.maf +15999 -0
- data/test/data/chr22_ieq.maf +11 -0
- data/test/data/chrY-1block.maf +6 -0
- data/test/data/empty +0 -0
- data/test/data/empty.db +0 -0
- data/test/data/mm8_chr7_tiny.kct +0 -0
- data/test/data/mm8_chr7_tiny.maf +76 -0
- data/test/data/mm8_mod_a.maf +7 -0
- data/test/data/mm8_single.maf +13 -0
- data/test/data/mm8_subset_a.maf +23 -0
- data/test/data/t1-bad1.maf +15 -0
- data/test/data/t1.fasta +12 -0
- data/test/data/t1.maf +15 -0
- data/test/data/t1a.maf +17 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-maf.rb +7 -0
- data/travis-ci/install_kc +13 -0
- data/travis-ci/install_kc_java +13 -0
- data/travis-ci/report_errors +4 -0
- metadata +181 -0
@@ -0,0 +1,363 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module Bio
|
4
|
+
module MAF
|
5
|
+
|
6
|
+
describe KyotoIndex do
|
7
|
+
# Reports whether the index database holds at least +n+ consecutive keys
# beginning with +start+, scanning from the position +cur.jump(start)+
# selects.
#
# Relies on +get_key(true)+ returning the key under the cursor and then
# stepping the cursor to the following record (Kyoto Cabinet cursor API).
#
# @param n [Integer] number of matching keys required
# @param start [String] key prefix to look for
# @return [Boolean] true when +n+ keys carrying the prefix were counted
def has_at_least_n_with_prefix(n, start)
  @idx.db.cursor_process do |cur|
    i = 0
    cur.jump(start)
    k = cur.get_key(true)
    while k && k.start_with?(start) && i < n
      i += 1
      # Read the next key. Without this the loop re-tests the first key
      # n times and reports success whenever a single key matches.
      k = cur.get_key(true)
    end
    return i == n
  end
end
|
18
|
+
|
19
|
+
# Specs for KyotoIndex.build: building an index from a parsed MAF file
# into an in-memory database (path '%').
describe ".build" do
  it "accepts '%' as a path for an in-memory DB" do
    expect {
      @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
      @idx = KyotoIndex.build(@p, '%')
      @p.f.close
      @idx.close
    }.not_to raise_error
  end

  # Pending examples: no body yet.
  it "accepts .kct paths"
  it "rejects other paths"

  context "mm8_chr7" do
    before(:each) do
      @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
      @idx = KyotoIndex.build(@p, '%')
    end

    it "uses the first sequence appearing as the reference sequence" do
      @idx.index_sequences.to_a.should == [["mm8.chr7", 0]]
    end

    it "creates 8 index entries" do
      has_at_least_n_with_prefix(8, "\xFF\x00").should be_true
    end

    # Previously shared its description with the next example, which made
    # the two indistinguishable in spec output.
    it "stores exactly one sequence entry" do
      @idx.db.match_prefix("sequence:").size.should == 1
    end

    it "stores the sequence IDs" do
      @idx.db.get("sequence:mm8.chr7").should == "0"
    end

    describe "loads sequence data correctly" do
      # Re-read the index state through #reopen before each example.
      before(:each) { @idx = @idx.reopen }

      it "uses the first sequence appearing as the reference sequence" do
        @idx.index_sequences.to_a.should == [["mm8.chr7", 0]]
      end
    end

    after(:each) do
      @idx.db.close
      # Also release the parser's file handle opened in the before hook.
      @p.f.close
    end
  end
end
|
58
|
+
|
59
|
+
# Specs for KyotoIndex.open against the prebuilt mm8_chr7_tiny.kct fixture.
describe ".open" do
  # Close the database handle, but only if the example got as far as
  # opening an index.
  after(:each) { @idx && @idx.db.close }

  it "opens an existing index successfully" do
    kct_path = TestData + 'mm8_chr7_tiny.kct'
    @idx = KyotoIndex.open(kct_path)
    @idx.db.count.should be > 8
  end

  it "populates #index_sequences" do
    kct_path = TestData + 'mm8_chr7_tiny.kct'
    @idx = KyotoIndex.open(kct_path)
    seqs = @idx.index_sequences
    seqs.size.should be > 0
    seqs['mm8.chr7'].should == 0
  end
end
|
73
|
+
|
74
|
+
# Specs for KyotoIndex#find, which takes intervals plus a parser and
# yields the matching blocks.
describe "#find" do
  context "mm8_chr7" do
    before(:each) do
      @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
      @idx = KyotoIndex.build(@p, '%')
    end

    after(:each) do
      @idx.db.close
      @p.f.close
    end

    it "returns a block given a range contained in the block" do
      interval = GenomicInterval.zero_based('mm8.chr7', 80082334, 80082338)
      blocks = @idx.find([interval], @p).to_a
      blocks.size.should == 1
      blocks[0].offset.should == 16
    end
  end
end
|
96
|
+
|
97
|
+
# Specs for KyotoIndex#fetch_list, which maps genomic intervals to block
# specs. Each spec is indexed as [offset, size, ...] by the examples below.
describe "#fetch_list" do
  context "mm8_chr7" do
    before(:each) do
      @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
      @idx = KyotoIndex.build(@p, '%')
    end

    # Fetches the block specs overlapping a single zero-based interval
    # on mm8.chr7.
    #
    # @param i_start [Integer] zero-based interval start
    # @param i_end [Integer] zero-based interval end
    # @return the result of KyotoIndex#fetch_list for that one interval
    def fetch_specs(i_start, i_end)
      @idx.fetch_list([GenomicInterval.zero_based('mm8.chr7', i_start, i_end)])
    end

    it "returns a block spec given a range contained in the block" do
      l = fetch_specs(80082334, 80082338)
      l.size.should == 1
      l[0][0].should == 16 # block offset
    end

    it "returns a block spec with correct size" do
      l = fetch_specs(80082334, 80082338)
      l.size.should == 1
      l[0][0].should == 16 # block offset
      l[0][1].should == 1087 # block size
    end

    it "returns a block spec given its range exactly" do
      l = fetch_specs(80082334, 80082368)
      l.size.should == 1
      l[0][0].should == 16 # block offset
    end

    it "returns specs for adjoining blocks given a range partially in each" do
      l = fetch_specs(80082360, 80082370)
      l.size.should == 2
      l.map { |e| e[0] }.should == [16, 1103]
    end

    it "returns a block spec given a range ending in it" do
      l = fetch_specs(80082330, 80082339)
      l.size.should == 1
      l[0][0].should == 16 # block offset
    end

    it "returns no block spec given a zero-based range ending at a block start" do
      l = fetch_specs(80082330, 80082334)
      l.size.should == 0
    end

    it "returns a block spec given a range beginning in it" do
      l = fetch_specs(80083009, 80083220)
      l.size.should == 1
      l[0][0].should == 10113 # block offset
    end

    it "returns no block spec given a range beginning at its end" do
      l = fetch_specs(80083156, 80083200)
      l.size.should == 0
    end

    it "returns specs for all blocks given a range fitting a larger bin" do
      l = fetch_specs(0, 80083200)
      l.size.should == 8
    end

    it "returns no blocks given a range outside" do
      l = fetch_specs(80083200, 80083300)
      # This example previously had no expectation and could never fail.
      l.size.should == 0
    end

    after(:each) do
      @idx.db.close if @idx
      # Release the parser's file handle opened in the before hook.
      @p.f.close if @p
    end
  end
end
|
176
|
+
|
177
|
+
# Specs for KyotoIndex#overlaps?, called here with an exclusive-end Range
# for the first interval and separate start/end values for the second.
describe "#overlaps?" do
  before(:each) do
    @idx = KyotoIndex.new('%')
  end

  after(:each) do
    @idx.db.close
  end

  # Runs #overlaps? on two [start, end] pairs.
  #
  # @param x [Array<Integer>] first interval, converted to start...end
  # @param y [Array<Integer>] second interval, passed as two arguments
  # @return the result of KyotoIndex#overlaps?
  def check_overlap(x, y)
    x_range = (x[0]...x[1])
    y_start = y[0]
    y_end = y[1]
    @idx.overlaps?(x_range, y_start, y_end)
  end

  it "handles equal intervals" do
    check_overlap([0, 10], [0, 10]).should be_true
  end

  it "handles X contains Y" do
    [[0, 9], [1, 9], [1, 10]].each do |inner|
      check_overlap([0, 10], inner).should be_true
    end
  end

  it "handles Y contains X" do
    [[0, 9], [1, 9], [1, 10]].each do |inner|
      check_overlap(inner, [0, 10]).should be_true
    end
  end

  it "handles partial overlap" do
    check_overlap([0, 9], [1, 10]).should be_true
    check_overlap([1, 10], [0, 9]).should be_true
  end

  it "handles end cases" do
    # Intervals that merely touch at 10 do not overlap.
    check_overlap([0, 10], [10, 15]).should be_false
    check_overlap([10, 15], [0, 10]).should be_false
  end

  it "handles separated intervals" do
    check_overlap([0, 10], [15, 20]).should be_false
    check_overlap([15, 20], [0, 10]).should be_false
  end
end
|
225
|
+
|
226
|
+
# Specs for KyotoIndex#entries_for, using the first block parsed from
# mm8_chr7_tiny.maf and an empty in-memory index.
describe "#entries_for" do
  before(:each) do
    @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
    @block = @p.parse_block
    @idx = KyotoIndex.new('%')
  end

  after(:each) do
    @p.f.close
    @idx.db.close
  end

  context "single ref seq" do
    before(:each) do
      @idx.index_sequences = { 'mm8.chr7' => 0 }
      @e = @idx.entries_for(@block)
    end

    it "gives the correct key data" do
      # Unpacked fields: leading byte (ignored), sequence id, bin,
      # interval start, interval end.
      key_fields = @e.keys.first.unpack("CCS>L>L>")
      key_fields[1].should == 0
      key_fields[2].should == 1195
      key_fields[3].should == 80082334
      key_fields[4].should == 80082368
    end

    it "gives the correct offset" do
      value_fields = @e.values.first.unpack("Q>L>")
      value_fields[0].should == 16
    end

    it "gives the correct length" do
      value_fields = @e.values.first.unpack("Q>L>")
      value_fields[1].should == 1087
    end
  end
end
|
258
|
+
|
259
|
+
end
|
260
|
+
|
261
|
+
# Specs for the species bookkeeping of a KyotoIndex, checked both right
# after building and after #reopen.
describe "#species" do
  before(:each) do
    @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
    @idx = KyotoIndex.build(@p, '%')
  end

  # Release the index and the parser's file handle after every example;
  # this group previously left both open.
  after(:each) do
    @idx.close
    @p.f.close
  end

  shared_examples "species" do
    it "records the correct number of species" do
      @idx.species.size.should == 11
    end

    it "sets species_max_id correctly" do
      @idx.species_max_id.should == 10
    end
  end

  describe "after building index" do
    include_examples "species"

    it "records species in order" do
      @idx.db["species:mm8"].should == "0"
    end
  end

  describe "after loading index" do
    # Swap in the reopened index so the shared examples run against it.
    before(:each) { @idx = @idx.reopen }
    include_examples "species"
  end
end
|
285
|
+
|
286
|
+
# Specs for the index-entry filters. Each filter is handed a fake
# [key, value] entry whose value is packed with KyotoIndex::VAL_FMT.
describe "Filter classes" do
  before(:each) do
    @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
    @idx = KyotoIndex.build(@p, '%')
  end

  describe AllSpeciesFilter do
    # Builds a fake entry whose fifth packed field is a bit vector with
    # one bit set per named species (bit position = species id).
    #
    # @param species_l [Array<String>] species names known to the index
    # @return [Array] an empty key paired with the packed value
    def fake_entry_with(species_l)
      ids = species_l.map { |s| @idx.species.fetch(s) }
      vec = ids.map { |id| 1 << id }.reduce(0, :|)
      return ['', [0, 0, 0, 0, vec].pack(KyotoIndex::VAL_FMT)]
    end

    context "with an empty set" do
      before(:each) do
        @filter = AllSpeciesFilter.new([], @idx)
      end

      it "matches anything" do
        e = fake_entry_with(%w(mm8 rn4 oryCun1))
        @filter.match(e).should be_true
      end
    end

    context "with [mm8 rn4]" do
      before(:each) do
        @filter = AllSpeciesFilter.new(%w(mm8 rn4), @idx)
      end

      it "does not match an empty entry" do
        e = fake_entry_with(%w())
        KVHelpers.extract_species_vec(e).should == 0
        @filter.bs.should_not == 0
        @filter.match(e).should be_false
      end

      it "does not match an entry with mm8" do
        e = fake_entry_with(%w(mm8))
        @filter.match(e).should be_false
      end

      it "does not match an entry with mm8 oryCun1" do
        e = fake_entry_with(%w(mm8 oryCun1))
        @filter.match(e).should be_false
      end

      it "matches an entry with mm8 rn4" do
        e = fake_entry_with(%w(mm8 rn4))
        @filter.match(e).should be_true
      end

      # The title used to read "does not match ..." while the expectation
      # was be_true; the title now agrees with the expectation.
      it "matches an entry with mm8 rn4 oryCun1" do
        e = fake_entry_with(%w(mm8 rn4 oryCun1))
        @filter.match(e).should be_true
      end
    end
  end # AllSpeciesFilter

  describe AtLeastNSequencesFilter do
    # Builds a fake entry with +n+ in the fourth packed field.
    # NOTE(review): presumably the sequence-count slot of VAL_FMT; confirm
    # against KyotoIndex.
    #
    # @param n [Integer] value to store in that field
    # @return [Array] an empty key paired with the packed value
    def fake_entry_with(n)
      return ['', [0, 0, 0, n, 0].pack(KyotoIndex::VAL_FMT)]
    end

    context "n = 3" do
      before(:each) do
        @filter = AtLeastNSequencesFilter.new(3, @idx)
      end

      it "does not match 2 sequences" do
        e = fake_entry_with(2)
        @filter.match(e).should be_false
      end

      it "matches 3 sequences" do
        e = fake_entry_with(3)
        @filter.match(e).should be_true
      end
    end
  end # AtLeastNSequencesFilter

  after(:each) do
    @idx.close
  end
end # filter classes
|
360
|
+
|
361
|
+
end # module MAF
|
362
|
+
|
363
|
+
end # module Bio
|