bio-maf 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/.simplecov +1 -0
  3. data/.travis.yml +16 -0
  4. data/.yardopts +3 -0
  5. data/DEVELOPMENT.md +40 -0
  6. data/Gemfile +23 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +209 -0
  9. data/Rakefile +76 -0
  10. data/VERSION +1 -0
  11. data/benchmarks/dispatch_bench +53 -0
  12. data/benchmarks/iter_bench +44 -0
  13. data/benchmarks/read_bench +40 -0
  14. data/benchmarks/sort_bench +33 -0
  15. data/benchmarks/split_bench +33 -0
  16. data/bin/maf_count +82 -0
  17. data/bin/maf_dump_blocks +27 -0
  18. data/bin/maf_extract_ranges_count +44 -0
  19. data/bin/maf_index +88 -0
  20. data/bin/maf_parse_bench +94 -0
  21. data/bin/maf_to_fasta +68 -0
  22. data/bin/maf_write +84 -0
  23. data/bin/random_ranges +35 -0
  24. data/features/maf-indexing.feature +31 -0
  25. data/features/maf-output.feature +29 -0
  26. data/features/maf-parsing.feature +44 -0
  27. data/features/maf-querying.feature +75 -0
  28. data/features/maf-to-fasta.feature +50 -0
  29. data/features/step_definitions/convert_steps.rb +45 -0
  30. data/features/step_definitions/index_steps.rb +20 -0
  31. data/features/step_definitions/output_steps.rb +27 -0
  32. data/features/step_definitions/parse_steps.rb +63 -0
  33. data/features/step_definitions/query_steps.rb +31 -0
  34. data/features/step_definitions/ucsc_bin_steps.rb +14 -0
  35. data/features/support/env.rb +16 -0
  36. data/features/ucsc-bins.feature +24 -0
  37. data/lib/bio/maf/index.rb +620 -0
  38. data/lib/bio/maf/parser.rb +888 -0
  39. data/lib/bio/maf/struct.rb +63 -0
  40. data/lib/bio/maf/writer.rb +63 -0
  41. data/lib/bio/maf.rb +4 -0
  42. data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
  43. data/lib/bio/ucsc/ucsc_bin.rb +117 -0
  44. data/lib/bio/ucsc.rb +2 -0
  45. data/lib/bio-maf/maf.rb +3 -0
  46. data/lib/bio-maf.rb +12 -0
  47. data/man/.gitignore +1 -0
  48. data/man/maf_index.1 +105 -0
  49. data/man/maf_index.1.markdown +97 -0
  50. data/man/maf_index.1.ronn +83 -0
  51. data/man/maf_to_fasta.1 +53 -0
  52. data/man/maf_to_fasta.1.ronn +51 -0
  53. data/spec/bio/maf/index_spec.rb +363 -0
  54. data/spec/bio/maf/parser_spec.rb +354 -0
  55. data/spec/bio/maf/struct_spec.rb +75 -0
  56. data/spec/spec_helper.rb +14 -0
  57. data/test/data/big-block.maf +15999 -0
  58. data/test/data/chr22_ieq.maf +11 -0
  59. data/test/data/chrY-1block.maf +6 -0
  60. data/test/data/empty +0 -0
  61. data/test/data/empty.db +0 -0
  62. data/test/data/mm8_chr7_tiny.kct +0 -0
  63. data/test/data/mm8_chr7_tiny.maf +76 -0
  64. data/test/data/mm8_mod_a.maf +7 -0
  65. data/test/data/mm8_single.maf +13 -0
  66. data/test/data/mm8_subset_a.maf +23 -0
  67. data/test/data/t1-bad1.maf +15 -0
  68. data/test/data/t1.fasta +12 -0
  69. data/test/data/t1.maf +15 -0
  70. data/test/data/t1a.maf +17 -0
  71. data/test/helper.rb +18 -0
  72. data/test/test_bio-maf.rb +7 -0
  73. data/travis-ci/install_kc +13 -0
  74. data/travis-ci/install_kc_java +13 -0
  75. data/travis-ci/report_errors +4 -0
  76. metadata +181 -0
@@ -0,0 +1,363 @@
1
+ require 'spec_helper'
2
+
3
+ module Bio
4
+ module MAF
5
+
6
+ describe KyotoIndex do
7
# Reports whether the index database (@idx.db) contains at least +n+ keys
# that begin with +start+.
#
# A cursor is jumped to +start+ and then advanced one record at a time,
# counting keys until either +n+ have been seen, a key without the prefix is
# reached, or the cursor runs out of records.
#
# @param n [Integer] number of prefixed keys required
# @param start [String] key prefix to look for
# @return [Boolean] true if at least +n+ keys carry the prefix
def has_at_least_n_with_prefix(n, start)
  @idx.db.cursor_process do |cur|
    cur.jump(start)
    matched = 0
    while matched < n
      # get_key(true) returns the current key and steps the cursor, so it
      # must be called once per iteration. Reading the key a single time
      # before the loop would count the same record n times.
      key = cur.get_key(true)
      break unless key && key.start_with?(start)
      matched += 1
    end
    # Non-local return: hands the result back from the helper itself.
    return matched == n
  end
end
18
+
19
# Specs for KyotoIndex.build: building an index from a parsed MAF file.
describe ".build" do
  it "accepts '%' as a path for an in-memory DB" do
    expect {
      @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
      @idx = KyotoIndex.build(@p, '%')
      @p.f.close
      @idx.close
    }.not_to raise_error
  end

  # Pending examples: no body yet.
  it "accepts .kct paths"
  it "rejects other paths"

  context "mm8_chr7" do
    before(:each) do
      @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
      @idx = KyotoIndex.build(@p, '%')
    end

    it "uses the first sequence appearing as the reference sequence" do
      @idx.index_sequences.to_a.should == [["mm8.chr7", 0]]
    end

    it "creates 8 index entries" do
      has_at_least_n_with_prefix(8, "\xFF\x00").should be_true
    end

    it "stores the sequence IDs" do
      @idx.db.match_prefix("sequence:").size.should == 1
    end

    # Previously this example reused the description of the one above,
    # making the two indistinguishable in spec output.
    it "stores the reference sequence under ID 0" do
      @idx.db.get("sequence:mm8.chr7").should == "0"
    end

    describe "loads sequence data correctly" do
      before(:each) { @idx = @idx.reopen }

      it "uses the first sequence appearing as the reference sequence" do
        @idx.index_sequences.to_a.should == [["mm8.chr7", 0]]
      end
    end

    after(:each) do
      @idx.db.close
      # Release the MAF file handle opened by the parser in the before hook.
      @p.f.close
    end
  end
end
58
+
59
# Specs for KyotoIndex.open against the pre-built fixture index.
describe ".open" do
  # Opens the fixture index shipped with the test data.
  def open_tiny_index
    KyotoIndex.open(TestData + 'mm8_chr7_tiny.kct')
  end

  it "opens an existing index successfully" do
    @idx = open_tiny_index
    @idx.db.count.should be > 8
  end

  it "populates #index_sequences" do
    @idx = open_tiny_index
    sequences = @idx.index_sequences
    sequences.size.should be > 0
    sequences['mm8.chr7'].should == 0
  end

  after(:each) do
    # @idx stays unset if opening failed, so guard the cleanup.
    @idx.db.close if @idx
  end
end
73
+
74
# Specs for KyotoIndex#find: looking up parsed blocks by genomic interval.
describe "#find" do
  context "mm8_chr7" do
    before(:each) do
      @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
      @idx = KyotoIndex.build(@p, '%')
    end

    it "returns a block given a range contained in the block" do
      interval = GenomicInterval.zero_based('mm8.chr7', 80082334, 80082338)
      blocks = @idx.find([interval], @p).to_a
      blocks.size.should == 1
      blocks[0].offset.should == 16
    end

    after(:each) do
      @idx.db.close
      @p.f.close
    end
  end
end
96
+
97
# Specs for KyotoIndex#fetch_list: resolving genomic intervals to block
# specs. In each returned spec, element 0 is checked as the block offset
# and element 1 as the block size.
describe "#fetch_list" do
  context "mm8_chr7" do
    before(:each) do
      @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
      @idx = KyotoIndex.build(@p, '%')
    end

    # Fetches the block specs for a single zero-based interval on mm8.chr7.
    def specs_for(i_start, i_end)
      @idx.fetch_list([GenomicInterval.zero_based('mm8.chr7', i_start, i_end)])
    end

    it "returns a block spec given a range contained in the block" do
      l = specs_for(80082334, 80082338)
      l.size.should == 1
      l[0][0].should == 16 # block offset
    end

    it "returns a block spec with correct size" do
      l = specs_for(80082334, 80082338)
      l.size.should == 1
      l[0][0].should == 16 # block offset
      l[0][1].should == 1087 # block size
    end

    it "returns a block spec given its range exactly" do
      l = specs_for(80082334, 80082368)
      l.size.should == 1
      l[0][0].should == 16 # block offset
    end

    it "returns specs for adjoining blocks given a range partially in each" do
      l = specs_for(80082360, 80082370)
      l.size.should == 2
      l.collect { |e| e[0] }.should == [16, 1103]
    end

    it "returns a block spec given a range ending in it" do
      l = specs_for(80082330, 80082339)
      l.size.should == 1
      l[0][0].should == 16 # block offset
    end

    it "returns no block spec given a zero-based range ending at a block start" do
      l = specs_for(80082330, 80082334)
      l.size.should == 0
    end

    it "returns a block spec given a range beginning in it" do
      l = specs_for(80083009, 80083220)
      l.size.should == 1
      l[0][0].should == 10113 # block offset
    end

    it "returns no block spec given a range beginning at its end" do
      l = specs_for(80083156, 80083200)
      l.size.should == 0
    end

    it "returns specs for all blocks given a range fitting a larger bin" do
      l = specs_for(0, 80083200)
      l.size.should == 8
    end

    it "returns no blocks given a range outside" do
      l = specs_for(80083200, 80083300)
      # This example previously made no assertion and so could never fail.
      l.size.should == 0
    end

    after(:each) do
      @idx.db.close if @idx
      # Release the MAF file handle opened by the parser in the before hook.
      @p.f.close if @p
    end
  end
end
176
+
177
# Specs for KyotoIndex#overlaps?, exercised on an empty in-memory index.
describe "#overlaps?" do
  before(:each) do
    @idx = KyotoIndex.new('%')
  end

  # Calls #overlaps? with +x+ turned into an end-exclusive Range and the two
  # elements of +y+ passed as separate arguments.
  def check_overlap(x, y)
    x_start, x_end = x
    y_start, y_end = y
    @idx.overlaps?(Range.new(x_start, x_end, true), y_start, y_end)
  end

  it "handles equal intervals" do
    check_overlap([0, 10], [0, 10]).should be_true
  end

  it "handles X contains Y" do
    check_overlap([0, 10], [0, 9]).should be_true
    check_overlap([0, 10], [1, 9]).should be_true
    check_overlap([0, 10], [1, 10]).should be_true
  end

  it "handles Y contains X" do
    check_overlap([0, 9], [0, 10]).should be_true
    check_overlap([1, 9], [0, 10]).should be_true
    check_overlap([1, 10], [0, 10]).should be_true
  end

  it "handles partial overlap" do
    check_overlap([0, 9], [1, 10]).should be_true
    check_overlap([1, 10], [0, 9]).should be_true
  end

  # Intervals that merely touch at an endpoint must not count as overlapping.
  it "handles end cases" do
    check_overlap([0, 10], [10, 15]).should be_false
    check_overlap([10, 15], [0, 10]).should be_false
  end

  it "handles separated intervals" do
    check_overlap([0, 10], [15, 20]).should be_false
    check_overlap([15, 20], [0, 10]).should be_false
  end

  after(:each) do
    @idx.db.close
  end
end
225
+
226
# Specs for KyotoIndex#entries_for: the key/value pairs generated for the
# first block parsed from the fixture file.
describe "#entries_for" do
  before(:each) do
    @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
    @block = @p.parse_block
    @idx = KyotoIndex.new('%')
  end

  context "single ref seq" do
    before(:each) do
      @idx.index_sequences = { 'mm8.chr7' => 0 }
      @e = @idx.entries_for(@block)
    end

    it "gives the correct key data" do
      # Unpacked key fields; the leading byte is not checked here.
      fields = @e.keys.first.unpack("CCS>L>L>")
      fields[1].should == 0          # sequence ID
      fields[2].should == 1195       # bin
      fields[3].should == 80082334   # interval start
      fields[4].should == 80082368   # interval end
    end

    it "gives the correct offset" do
      unpacked = @e.values.first.unpack("Q>L>")
      unpacked.first.should == 16
    end

    it "gives the correct length" do
      unpacked = @e.values.first.unpack("Q>L>")
      unpacked.last.should == 1087
    end
  end

  after(:each) do
    @p.f.close
    @idx.db.close
  end
end
258
+
259
+ end
260
+
261
# Specs for the species bookkeeping of a KyotoIndex, checked both on a
# freshly built index and on the same index after #reopen.
describe "#species" do
  before(:each) do
    @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
    @idx = KyotoIndex.build(@p, '%')
  end

  # Expectations that must hold before and after reloading the index.
  shared_examples "species" do
    it "records the correct number of species" do
      @idx.species.size.should == 11
    end

    it "sets species_max_id correctly" do
      @idx.species_max_id.should == 10
    end
  end

  describe "after building index" do
    include_examples "species"

    it "records species in order" do
      @idx.db["species:mm8"].should == "0"
    end
  end

  describe "after loading index" do
    before(:each) { @idx = @idx.reopen }
    include_examples "species"
  end

  # This group previously had no teardown, leaving the index and the parser's
  # file handle open after every example, unlike the sibling groups.
  after(:each) do
    @idx.close if @idx
    @p.f.close if @p
  end
end
285
+
286
# Specs for the index entry filters, run against hand-built index entries.
describe "Filter classes" do
  before(:each) do
    @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
    @idx = KyotoIndex.build(@p, '%')
  end

  describe AllSpeciesFilter do
    # Builds a [key, value] entry whose last packed field is a bit vector
    # with one bit set per species ID looked up from the index.
    def fake_entry_with(species_l)
      ids = species_l.collect { |s| @idx.species.fetch(s) }
      vec = ids.collect { |id| 1 << id }.reduce(0, :|)
      ['', [0, 0, 0, 0, vec].pack(KyotoIndex::VAL_FMT)]
    end

    context "with an empty set" do
      before(:each) do
        @filter = AllSpeciesFilter.new([], @idx)
      end

      it "matches anything" do
        e = fake_entry_with(%w(mm8 rn4 oryCun1))
        @filter.match(e).should be_true
      end
    end

    context "with [mm8 rn4]" do
      before(:each) do
        @filter = AllSpeciesFilter.new(%w(mm8 rn4), @idx)
      end

      it "does not match an empty entry" do
        e = fake_entry_with(%w())
        KVHelpers.extract_species_vec(e).should == 0
        @filter.bs.should_not == 0
        @filter.match(e).should be_false
      end

      it "does not match an entry with mm8" do
        e = fake_entry_with(%w(mm8))
        @filter.match(e).should be_false
      end

      it "does not match an entry with mm8 oryCun1" do
        e = fake_entry_with(%w(mm8 oryCun1))
        @filter.match(e).should be_false
      end

      it "matches an entry with mm8 rn4" do
        e = fake_entry_with(%w(mm8 rn4))
        @filter.match(e).should be_true
      end

      # The description used to read "does not match ..." while the body
      # asserted a match; an entry holding every requested species plus
      # extras is expected to match.
      it "matches an entry with mm8 rn4 oryCun1" do
        e = fake_entry_with(%w(mm8 rn4 oryCun1))
        @filter.match(e).should be_true
      end
    end
  end # AllSpeciesFilter

  describe AtLeastNSequencesFilter do
    # Builds a [key, value] entry with +n+ in the fourth packed field and an
    # empty species vector.
    def fake_entry_with(n)
      ['', [0, 0, 0, n, 0].pack(KyotoIndex::VAL_FMT)]
    end

    context "n = 3" do
      before(:each) do
        @filter = AtLeastNSequencesFilter.new(3, @idx)
      end

      it "does not match 2 sequences" do
        e = fake_entry_with(2)
        @filter.match(e).should be_false
      end

      it "matches 3 sequences" do
        e = fake_entry_with(3)
        @filter.match(e).should be_true
      end
    end
  end # AtLeastNSequencesFilter

  after(:each) do
    @idx.close
    # Release the MAF file handle opened by the parser in the before hook.
    @p.f.close
  end
end # filter classes
360
+
361
+ end # module MAF
362
+
363
+ end # module Bio