bio-maf 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.simplecov +1 -0
- data/.travis.yml +16 -0
- data/.yardopts +3 -0
- data/DEVELOPMENT.md +40 -0
- data/Gemfile +23 -0
- data/LICENSE.txt +20 -0
- data/README.md +209 -0
- data/Rakefile +76 -0
- data/VERSION +1 -0
- data/benchmarks/dispatch_bench +53 -0
- data/benchmarks/iter_bench +44 -0
- data/benchmarks/read_bench +40 -0
- data/benchmarks/sort_bench +33 -0
- data/benchmarks/split_bench +33 -0
- data/bin/maf_count +82 -0
- data/bin/maf_dump_blocks +27 -0
- data/bin/maf_extract_ranges_count +44 -0
- data/bin/maf_index +88 -0
- data/bin/maf_parse_bench +94 -0
- data/bin/maf_to_fasta +68 -0
- data/bin/maf_write +84 -0
- data/bin/random_ranges +35 -0
- data/features/maf-indexing.feature +31 -0
- data/features/maf-output.feature +29 -0
- data/features/maf-parsing.feature +44 -0
- data/features/maf-querying.feature +75 -0
- data/features/maf-to-fasta.feature +50 -0
- data/features/step_definitions/convert_steps.rb +45 -0
- data/features/step_definitions/index_steps.rb +20 -0
- data/features/step_definitions/output_steps.rb +27 -0
- data/features/step_definitions/parse_steps.rb +63 -0
- data/features/step_definitions/query_steps.rb +31 -0
- data/features/step_definitions/ucsc_bin_steps.rb +14 -0
- data/features/support/env.rb +16 -0
- data/features/ucsc-bins.feature +24 -0
- data/lib/bio/maf/index.rb +620 -0
- data/lib/bio/maf/parser.rb +888 -0
- data/lib/bio/maf/struct.rb +63 -0
- data/lib/bio/maf/writer.rb +63 -0
- data/lib/bio/maf.rb +4 -0
- data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
- data/lib/bio/ucsc/ucsc_bin.rb +117 -0
- data/lib/bio/ucsc.rb +2 -0
- data/lib/bio-maf/maf.rb +3 -0
- data/lib/bio-maf.rb +12 -0
- data/man/.gitignore +1 -0
- data/man/maf_index.1 +105 -0
- data/man/maf_index.1.markdown +97 -0
- data/man/maf_index.1.ronn +83 -0
- data/man/maf_to_fasta.1 +53 -0
- data/man/maf_to_fasta.1.ronn +51 -0
- data/spec/bio/maf/index_spec.rb +363 -0
- data/spec/bio/maf/parser_spec.rb +354 -0
- data/spec/bio/maf/struct_spec.rb +75 -0
- data/spec/spec_helper.rb +14 -0
- data/test/data/big-block.maf +15999 -0
- data/test/data/chr22_ieq.maf +11 -0
- data/test/data/chrY-1block.maf +6 -0
- data/test/data/empty +0 -0
- data/test/data/empty.db +0 -0
- data/test/data/mm8_chr7_tiny.kct +0 -0
- data/test/data/mm8_chr7_tiny.maf +76 -0
- data/test/data/mm8_mod_a.maf +7 -0
- data/test/data/mm8_single.maf +13 -0
- data/test/data/mm8_subset_a.maf +23 -0
- data/test/data/t1-bad1.maf +15 -0
- data/test/data/t1.fasta +12 -0
- data/test/data/t1.maf +15 -0
- data/test/data/t1a.maf +17 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-maf.rb +7 -0
- data/travis-ci/install_kc +13 -0
- data/travis-ci/install_kc_java +13 -0
- data/travis-ci/report_errors +4 -0
- metadata +181 -0
@@ -0,0 +1,354 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module Bio
|
4
|
+
module MAF
|
5
|
+
|
6
|
+
describe Header do
|
7
|
+
before(:each) do
|
8
|
+
@p = Parser.new(TestData + 't1.maf')
|
9
|
+
end
|
10
|
+
|
11
|
+
it "provides version information" do
|
12
|
+
@p.header.version.should == '1'
|
13
|
+
end
|
14
|
+
it "provides the scoring scheme" do
|
15
|
+
@p.header.scoring.should == 'humor.v4'
|
16
|
+
end
|
17
|
+
it "provides alignment parameters" do
|
18
|
+
@p.header.alignment_params.should =~ /humor.v4 R=30/
|
19
|
+
end
|
20
|
+
|
21
|
+
it "presents multiline parameters correctly" do
|
22
|
+
@p.header.alignment_params.should == "humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet300/chr1.maf /cluster/data/hg15/bed/blastz.rn3/axtNet300/chr1.maf"
|
23
|
+
end
|
24
|
+
|
25
|
+
it "provides arbitrary parameters"
|
26
|
+
end
|
27
|
+
|
28
|
+
describe ParseContext do
|
29
|
+
it "tracks the last block position"
|
30
|
+
end
|
31
|
+
|
32
|
+
describe ChunkReader do
|
33
|
+
before(:each) do
|
34
|
+
@f = (TestData + 'mm8_chr7_tiny.maf').open
|
35
|
+
end
|
36
|
+
describe "#initialize" do
|
37
|
+
it "rejects a chunk size of zero" do
|
38
|
+
expect {
|
39
|
+
ChunkReader.new(@f, 0)
|
40
|
+
}.to raise_error(/Invalid chunk size/)
|
41
|
+
end
|
42
|
+
it "rejects a negative chunk size" do
|
43
|
+
expect {
|
44
|
+
ChunkReader.new(@f, -1) # use a negative size here; zero is already covered by its own example
|
45
|
+
}.to raise_error(/Invalid chunk size/)
|
46
|
+
end
|
47
|
+
it "rejects a chunk size not a power of 2" do
|
48
|
+
expect {
|
49
|
+
ChunkReader.new(@f, 1000)
|
50
|
+
}.to raise_error(/Invalid chunk size/)
|
51
|
+
end
|
52
|
+
it "accepts a 4k chunk size" do
|
53
|
+
expect {
|
54
|
+
ChunkReader.new(@f, 4096)
|
55
|
+
}.not_to raise_error
|
56
|
+
end
|
57
|
+
it "accepts an 8M chunk size" do
|
58
|
+
expect {
|
59
|
+
ChunkReader.new(@f, 8 * 1024 * 1024)
|
60
|
+
}.not_to raise_error
|
61
|
+
end
|
62
|
+
end
|
63
|
+
context "with 1K ChunkReader" do
|
64
|
+
before(:each) do
|
65
|
+
@r = ChunkReader.new(@f, 1024)
|
66
|
+
end
|
67
|
+
|
68
|
+
describe "#chunk_size=" do
|
69
|
+
it "sets the chunk size" do
|
70
|
+
@r.chunk_size = 8192
|
71
|
+
@r.chunk_size.should == 8192
|
72
|
+
end
|
73
|
+
# it "sets the chunk shift" do
|
74
|
+
# @r.chunk_size = 8192
|
75
|
+
# @r.chunk_shift.should == 13
|
76
|
+
# end
|
77
|
+
end
|
78
|
+
|
79
|
+
describe "#read_chunk" do
|
80
|
+
it "returns a chunk of the specified length" do
|
81
|
+
@r.read_chunk.bytesize.should == 1024 # a bare == is discarded; .should makes this a real expectation
|
82
|
+
end
|
83
|
+
it "starts at position 0" do
|
84
|
+
@r.pos.should == 0
|
85
|
+
end
|
86
|
+
it "advances the position" do
|
87
|
+
@r.read_chunk
|
88
|
+
@r.pos.should == 1024
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
describe "#read_chunk_at" do
|
93
|
+
it "returns data starting at the specified offset" do
|
94
|
+
c = @r.read_chunk_at(59)
|
95
|
+
c.start_with?("80082334").should be_true
|
96
|
+
end
|
97
|
+
it "handles a read starting exactly at a chunk boundary" do
|
98
|
+
c = @r.read_chunk_at(1024)
|
99
|
+
c.start_with?(" 594").should be_true
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
103
|
+
after(:each) do
|
104
|
+
@f.close
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
shared_examples "parsers" do
|
109
|
+
|
110
|
+
describe "creation" do
|
111
|
+
it "opens a file specified as a String argument"
|
112
|
+
it "takes an IO object as an open file"
|
113
|
+
it "raises an error when the file does not exist" do
|
114
|
+
expect {
|
115
|
+
described_class.new("/doesnotexist")
|
116
|
+
}.to raise_error(Errno::ENOENT)
|
117
|
+
end
|
118
|
+
it "raises an error when the file is not in MAF format" do
|
119
|
+
expect {
|
120
|
+
described_class.new(TestData + '../../Rakefile')
|
121
|
+
}.to raise_error
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
describe "#header" do
|
126
|
+
it "parses the MAF header" do
|
127
|
+
p = described_class.new(TestData + 't1.maf')
|
128
|
+
p.header.should_not be_nil
|
129
|
+
end
|
130
|
+
end
|
131
|
+
|
132
|
+
describe "#fetch_blocks" do
|
133
|
+
shared_examples_for "any chunk size" do
|
134
|
+
it "parses a single block" do
|
135
|
+
fl = [[16, 1087]]
|
136
|
+
blocks = @p.fetch_blocks(fl).to_a
|
137
|
+
blocks.size.should == 1
|
138
|
+
blocks[0].offset.should == 16
|
139
|
+
end
|
140
|
+
it "parses several consecutive blocks" do
|
141
|
+
fl = [[16, 1087], [1103, 1908], [3011, 2027]]
|
142
|
+
blocks = @p.fetch_blocks(fl).to_a
|
143
|
+
blocks.size.should == 3
|
144
|
+
blocks.collect {|b| b.offset}.should == [16, 1103, 3011]
|
145
|
+
end
|
146
|
+
it "parses consecutive blocks further ahead" do
|
147
|
+
fl = [[5038, 1647], [6685, 829]]
|
148
|
+
blocks = @p.fetch_blocks(fl).to_a
|
149
|
+
blocks.size.should == 2
|
150
|
+
blocks.collect {|b| b.offset}.should == [5038, 6685]
|
151
|
+
end
|
152
|
+
it "parses nonconsecutive blocks" do
|
153
|
+
fl = [[16, 1087], [3011, 2027]]
|
154
|
+
blocks = @p.fetch_blocks(fl).to_a
|
155
|
+
blocks.size.should == 2
|
156
|
+
blocks.collect {|b| b.offset}.should == [16, 3011]
|
157
|
+
end
|
158
|
+
end
|
159
|
+
context "with 4K chunk size" do
|
160
|
+
before(:each) do
|
161
|
+
@p = described_class.new(TestData + 'mm8_chr7_tiny.maf',
|
162
|
+
:chunk_size => 4096,
|
163
|
+
:random_chunk_size => 4096)
|
164
|
+
end
|
165
|
+
it_behaves_like "any chunk size"
|
166
|
+
end
|
167
|
+
context "with 1K chunk size" do
|
168
|
+
before(:each) do
|
169
|
+
@p = described_class.new(TestData + 'mm8_chr7_tiny.maf',
|
170
|
+
:chunk_size => 1024,
|
171
|
+
:random_chunk_size => 1024)
|
172
|
+
end
|
173
|
+
it_behaves_like "any chunk size"
|
174
|
+
end
|
175
|
+
context "after parsing to end" do
|
176
|
+
before(:each) do
|
177
|
+
@p = described_class.new(TestData + 'mm8_chr7_tiny.maf',
|
178
|
+
:chunk_size => 4096,
|
179
|
+
:random_chunk_size => 4096)
|
180
|
+
@p.each_block { |b| nil }
|
181
|
+
end
|
182
|
+
it_behaves_like "any chunk size"
|
183
|
+
end
|
184
|
+
context "with 8M chunk size" do
|
185
|
+
before(:each) do
|
186
|
+
@p = described_class.new(TestData + 'mm8_chr7_tiny.maf',
|
187
|
+
:chunk_size => 8 * 1024 * 1024,
|
188
|
+
:random_chunk_size => 8 * 1024 * 1024)
|
189
|
+
end
|
190
|
+
it_behaves_like "any chunk size"
|
191
|
+
end
|
192
|
+
after(:each) do
|
193
|
+
@p.f.close
|
194
|
+
end
|
195
|
+
end
|
196
|
+
|
197
|
+
describe "sequence_filter" do
|
198
|
+
before(:each) do
|
199
|
+
@p = described_class.new(TestData + 'mm8_mod_a.maf')
|
200
|
+
end
|
201
|
+
it "restricts sequences parsed" do
|
202
|
+
@p.sequence_filter = { :only_species => %w(mm8 rn4) }
|
203
|
+
@p.parse_block.sequences.size.should == 2
|
204
|
+
end
|
205
|
+
it "matches at the species delimiter rather than a prefix" do
|
206
|
+
@p.sequence_filter = { :only_species => %w(mm8 hg18) }
|
207
|
+
@p.parse_block.sequences.size.should == 2
|
208
|
+
end
|
209
|
+
end
|
210
|
+
|
211
|
+
context "at end of file" do
|
212
|
+
describe "#parse_block" do
|
213
|
+
it "returns nil"
|
214
|
+
end
|
215
|
+
end
|
216
|
+
|
217
|
+
describe "#parse_block" do
|
218
|
+
it "returns an alignment block" do
|
219
|
+
p = described_class.new(TestData + 't1.maf')
|
220
|
+
b = p.parse_block()
|
221
|
+
b.should_not be_nil
|
222
|
+
end
|
223
|
+
it "raises an exception for malformed data"
|
224
|
+
end
|
225
|
+
|
226
|
+
it "gives the correct number of sequences" do
|
227
|
+
p = described_class.new(TestData + 'mm8_chr7_tiny.maf')
|
228
|
+
block = p.parse_block
|
229
|
+
block.sequences.size.should == 10
|
230
|
+
end
|
231
|
+
|
232
|
+
it "handles absent alignment parameters" do
|
233
|
+
p = described_class.new(TestData + 'chrY-1block.maf')
|
234
|
+
b = p.parse_block()
|
235
|
+
b.should_not be_nil
|
236
|
+
end
|
237
|
+
|
238
|
+
it "parses larger files" do
|
239
|
+
p = described_class.new(TestData + 'mm8_chr7_tiny.maf')
|
240
|
+
expect {
|
241
|
+
p.each_block { |block| block }
|
242
|
+
}.not_to raise_error
|
243
|
+
end
|
244
|
+
|
245
|
+
it "handles trailing comments" do
|
246
|
+
p = described_class.new(TestData + 't1a.maf')
|
247
|
+
expect {
|
248
|
+
p.each_block { |block| block }
|
249
|
+
}.not_to raise_error
|
250
|
+
end
|
251
|
+
|
252
|
+
it "raises an exception on inconsistent sequence length" do
|
253
|
+
pending
|
254
|
+
## can't just do string length, have to skip over hyphens
|
255
|
+
end
|
256
|
+
|
257
|
+
it "tracks block start offsets correctly" do
|
258
|
+
pa = []
|
259
|
+
p = described_class.new(TestData + 'mm8_chr7_tiny.maf')
|
260
|
+
p.each_block { |b| pa << b.offset }
|
261
|
+
pa.should == [16, 1103, 3011, 5038, 6685, 7514, 9022, 10113]
|
262
|
+
end
|
263
|
+
|
264
|
+
it "reports block sizes correctly" do
|
265
|
+
p = described_class.new(TestData + 'mm8_chr7_tiny.maf')
|
266
|
+
block = p.parse_block
|
267
|
+
block.size.should == 1087
|
268
|
+
end
|
269
|
+
|
270
|
+
it "parses very large blocks" do
|
271
|
+
p = described_class.new(TestData + 'big-block.maf')
|
272
|
+
n = 0
|
273
|
+
p.each_block { |b| n += 1 }
|
274
|
+
n.should == 490
|
275
|
+
end
|
276
|
+
|
277
|
+
end
|
278
|
+
|
279
|
+
describe Parser do
|
280
|
+
include_examples "parsers"
|
281
|
+
|
282
|
+
def with_const_value(mod, sym, value)
|
283
|
+
old = mod.const_get(sym)
|
284
|
+
mod.const_set(sym, value)
|
285
|
+
begin
|
286
|
+
yield
|
287
|
+
ensure
|
288
|
+
mod.const_set(sym, old)
|
289
|
+
end
|
290
|
+
end
|
291
|
+
|
292
|
+
describe "#merge_fetch_list" do
|
293
|
+
before(:each) do
|
294
|
+
@p = described_class.new(TestData + 'mm8_chr7_tiny.maf')
|
295
|
+
end
|
296
|
+
it "passes through single records" do
|
297
|
+
fl = [[16, 1087]]
|
298
|
+
@p.merge_fetch_list(fl).should == [[16, 1087, [16]]]
|
299
|
+
end
|
300
|
+
it "passes through non-contiguous records" do
|
301
|
+
fl = [[16, 1087], [3011, 2027]]
|
302
|
+
@p.merge_fetch_list(fl).should == [[16, 1087, [16]],
|
303
|
+
[3011, 2027, [3011]]]
|
304
|
+
end
|
305
|
+
it "merges contiguous records" do
|
306
|
+
fl = [[16, 1087], [1103, 1908], [3011, 2027]]
|
307
|
+
@p.merge_fetch_list(fl).should == [[16, 5022, [16, 1103, 3011]]]
|
308
|
+
end
|
309
|
+
after(:each) do
|
310
|
+
@p.f.close
|
311
|
+
end
|
312
|
+
end
|
313
|
+
|
314
|
+
it "sets last block position correctly" do
|
315
|
+
p = Parser.new(TestData + 'mm8_subset_a.maf')
|
316
|
+
p.last_block_pos.should == 1103
|
317
|
+
end
|
318
|
+
|
319
|
+
context "with 2k chunk size" do
|
320
|
+
before(:each) do
|
321
|
+
@p = Parser.new(TestData + 'mm8_chr7_tiny.maf',
|
322
|
+
:chunk_size => 2048)
|
323
|
+
end
|
324
|
+
it "yields the correct number of blocks over chunk boundaries" do
|
325
|
+
ref_scores = %w(10542.0 -33148.0 87527.0 185399.0
|
326
|
+
30120.0 58255.0 2607.0 8132.0)
|
327
|
+
scores = []
|
328
|
+
@p.each_block do |block|
|
329
|
+
scores << block.vars[:score]
|
330
|
+
end
|
331
|
+
scores.should == ref_scores
|
332
|
+
end
|
333
|
+
it "sets last_block_pos correctly" do
|
334
|
+
@p.last_block_pos.should == 1103
|
335
|
+
end
|
336
|
+
it "handles sequence lines over chunk boundaries" do
|
337
|
+
@p.parse_block
|
338
|
+
block = @p.parse_block
|
339
|
+
break_seq = block.raw_seq(4)
|
340
|
+
break_seq.text.size.should == 156
|
341
|
+
end
|
342
|
+
|
343
|
+
it "tracks block start offsets correctly over chunk boundaries" do
|
344
|
+
pa = []
|
345
|
+
@p.each_block { |b| pa << b.offset }
|
346
|
+
pa.should == [16, 1103, 3011, 5038, 6685, 7514, 9022, 10113]
|
347
|
+
end
|
348
|
+
end
|
349
|
+
|
350
|
+
end
|
351
|
+
|
352
|
+
end
|
353
|
+
|
354
|
+
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module Bio
|
4
|
+
module MAF
|
5
|
+
|
6
|
+
describe Struct do
|
7
|
+
|
8
|
+
describe "#fmt" do
|
9
|
+
it "presents all members in order" do
|
10
|
+
@s = Struct.new([[:a, :uint16],
|
11
|
+
[:b, :uint32],
|
12
|
+
[:c, :uint32],
|
13
|
+
[:d, :uint8]])
|
14
|
+
@s.fmt.should == "S>L>L>C"
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
describe "#extractor_fmt" do
|
19
|
+
it "handles uint8" do
|
20
|
+
@s = Struct.new([[:marker, :uint8]])
|
21
|
+
@s.extractor_fmt(:marker).should == "C"
|
22
|
+
end
|
23
|
+
it "handles uint16" do
|
24
|
+
@s = Struct.new([[:a, :uint16]])
|
25
|
+
@s.extractor_fmt(:a).should == "S>"
|
26
|
+
end
|
27
|
+
it "handles uint32" do
|
28
|
+
@s = Struct.new([[:a, :uint32]])
|
29
|
+
@s.extractor_fmt(:a).should == "L>"
|
30
|
+
end
|
31
|
+
it "handles uint64" do
|
32
|
+
@s = Struct.new([[:a, :uint64]])
|
33
|
+
@s.extractor_fmt(:a).should == "Q>"
|
34
|
+
end
|
35
|
+
it "skips uint8" do
|
36
|
+
@s = Struct.new([[:dummy, :uint8],
|
37
|
+
[:a, :uint64]])
|
38
|
+
@s.extractor_fmt(:a).should == "@1Q>"
|
39
|
+
end
|
40
|
+
it "skips uint16" do
|
41
|
+
@s = Struct.new([[:dummy, :uint16],
|
42
|
+
[:a, :uint64]])
|
43
|
+
@s.extractor_fmt(:a).should == "@2Q>"
|
44
|
+
end
|
45
|
+
it "skips uint32" do
|
46
|
+
@s = Struct.new([[:dummy, :uint32],
|
47
|
+
[:a, :uint64]])
|
48
|
+
@s.extractor_fmt(:a).should == "@4Q>"
|
49
|
+
end
|
50
|
+
it "skips uint64" do
|
51
|
+
@s = Struct.new([[:dummy, :uint64],
|
52
|
+
[:a, :uint64]])
|
53
|
+
@s.extractor_fmt(:a).should == "@8Q>"
|
54
|
+
end
|
55
|
+
it "extracts multiple leading elements" do
|
56
|
+
@s = Struct.new([[:a, :uint16],
|
57
|
+
[:b, :uint32],
|
58
|
+
[:c, :uint32]])
|
59
|
+
@s.extractor_fmt(:a, :b).should == "S>L>"
|
60
|
+
end
|
61
|
+
it "extracts multiple offset elements" do
|
62
|
+
@s = Struct.new([[:a, :uint16],
|
63
|
+
[:b, :uint32],
|
64
|
+
[:c, :uint32],
|
65
|
+
[:d, :uint8]])
|
66
|
+
@s.extractor_fmt(:b, :c).should == "@2L>L>"
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
describe ""
|
71
|
+
|
72
|
+
end
|
73
|
+
|
74
|
+
end
|
75
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
unless ENV.has_key?('TRAVIS') || RUBY_PLATFORM == 'java'
|
2
|
+
begin
|
3
|
+
require 'simplecov'
|
4
|
+
rescue LoadError
|
5
|
+
$stderr.puts "WARNING: could not require 'simplecov': #{$!}"
|
6
|
+
end
|
7
|
+
end
|
8
|
+
|
9
|
+
require 'rspec'
|
10
|
+
require 'pathname'
|
11
|
+
|
12
|
+
require 'bio-maf'
|
13
|
+
|
14
|
+
TestData = Pathname.new(__FILE__) + '../../test/data'
|