bio-maf 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.simplecov +1 -0
- data/.travis.yml +16 -0
- data/.yardopts +3 -0
- data/DEVELOPMENT.md +40 -0
- data/Gemfile +23 -0
- data/LICENSE.txt +20 -0
- data/README.md +209 -0
- data/Rakefile +76 -0
- data/VERSION +1 -0
- data/benchmarks/dispatch_bench +53 -0
- data/benchmarks/iter_bench +44 -0
- data/benchmarks/read_bench +40 -0
- data/benchmarks/sort_bench +33 -0
- data/benchmarks/split_bench +33 -0
- data/bin/maf_count +82 -0
- data/bin/maf_dump_blocks +27 -0
- data/bin/maf_extract_ranges_count +44 -0
- data/bin/maf_index +88 -0
- data/bin/maf_parse_bench +94 -0
- data/bin/maf_to_fasta +68 -0
- data/bin/maf_write +84 -0
- data/bin/random_ranges +35 -0
- data/features/maf-indexing.feature +31 -0
- data/features/maf-output.feature +29 -0
- data/features/maf-parsing.feature +44 -0
- data/features/maf-querying.feature +75 -0
- data/features/maf-to-fasta.feature +50 -0
- data/features/step_definitions/convert_steps.rb +45 -0
- data/features/step_definitions/index_steps.rb +20 -0
- data/features/step_definitions/output_steps.rb +27 -0
- data/features/step_definitions/parse_steps.rb +63 -0
- data/features/step_definitions/query_steps.rb +31 -0
- data/features/step_definitions/ucsc_bin_steps.rb +14 -0
- data/features/support/env.rb +16 -0
- data/features/ucsc-bins.feature +24 -0
- data/lib/bio/maf/index.rb +620 -0
- data/lib/bio/maf/parser.rb +888 -0
- data/lib/bio/maf/struct.rb +63 -0
- data/lib/bio/maf/writer.rb +63 -0
- data/lib/bio/maf.rb +4 -0
- data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
- data/lib/bio/ucsc/ucsc_bin.rb +117 -0
- data/lib/bio/ucsc.rb +2 -0
- data/lib/bio-maf/maf.rb +3 -0
- data/lib/bio-maf.rb +12 -0
- data/man/.gitignore +1 -0
- data/man/maf_index.1 +105 -0
- data/man/maf_index.1.markdown +97 -0
- data/man/maf_index.1.ronn +83 -0
- data/man/maf_to_fasta.1 +53 -0
- data/man/maf_to_fasta.1.ronn +51 -0
- data/spec/bio/maf/index_spec.rb +363 -0
- data/spec/bio/maf/parser_spec.rb +354 -0
- data/spec/bio/maf/struct_spec.rb +75 -0
- data/spec/spec_helper.rb +14 -0
- data/test/data/big-block.maf +15999 -0
- data/test/data/chr22_ieq.maf +11 -0
- data/test/data/chrY-1block.maf +6 -0
- data/test/data/empty +0 -0
- data/test/data/empty.db +0 -0
- data/test/data/mm8_chr7_tiny.kct +0 -0
- data/test/data/mm8_chr7_tiny.maf +76 -0
- data/test/data/mm8_mod_a.maf +7 -0
- data/test/data/mm8_single.maf +13 -0
- data/test/data/mm8_subset_a.maf +23 -0
- data/test/data/t1-bad1.maf +15 -0
- data/test/data/t1.fasta +12 -0
- data/test/data/t1.maf +15 -0
- data/test/data/t1a.maf +17 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-maf.rb +7 -0
- data/travis-ci/install_kc +13 -0
- data/travis-ci/install_kc_java +13 -0
- data/travis-ci/report_errors +4 -0
- metadata +181 -0
@@ -0,0 +1,354 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module Bio
|
4
|
+
module MAF
|
5
|
+
|
6
|
+
# Specs for the header exposed by a Parser opened on the t1.maf fixture.
describe Header do
  before(:each) do
    @p = Parser.new(TestData + 't1.maf')
  end

  # Release the file handle held by the parser after every example, the
  # same cleanup the other parser groups in this file perform.
  after(:each) do
    @p.f.close
  end

  it "provides version information" do
    @p.header.version.should == '1'
  end

  it "provides the scoring scheme" do
    @p.header.scoring.should == 'humor.v4'
  end

  it "provides alignment parameters" do
    # The dot is escaped so that it only matches a literal '.'.
    @p.header.alignment_params.should =~ /humor\.v4 R=30/
  end

  it "presents multiline parameters correctly" do
    @p.header.alignment_params.should == "humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet300/chr1.maf /cluster/data/hg15/bed/blastz.rn3/axtNet300/chr1.maf"
  end

  # No block given: RSpec reports this example as pending.
  it "provides arbitrary parameters"
end
|
27
|
+
|
28
|
+
# Placeholder group for ParseContext: its only example has no block, so
# RSpec reports it as pending until a body is written.
describe ParseContext do
|
29
|
+
it "tracks the last block position"
|
30
|
+
end
|
31
|
+
|
32
|
+
# Specs for ChunkReader, run against an open handle on the
# mm8_chr7_tiny.maf fixture. The handle is opened before and closed after
# every example.
describe ChunkReader do
  before(:each) do
    @f = (TestData + 'mm8_chr7_tiny.maf').open
  end

  describe "#initialize" do
    it "rejects a chunk size of zero" do
      expect {
        ChunkReader.new(@f, 0)
      }.to raise_error(/Invalid chunk size/)
    end

    it "rejects a negative chunk size" do
      expect {
        # Must be a genuinely negative value; passing 0 here would only
        # repeat the zero-size example above.
        ChunkReader.new(@f, -1024)
      }.to raise_error(/Invalid chunk size/)
    end

    it "rejects a chunk size not a power of 2" do
      expect {
        ChunkReader.new(@f, 1000)
      }.to raise_error(/Invalid chunk size/)
    end

    it "accepts a 4k chunk size" do
      expect {
        ChunkReader.new(@f, 4096)
      }.not_to raise_error
    end

    it "accepts an 8M chunk size" do
      expect {
        ChunkReader.new(@f, 8 * 1024 * 1024)
      }.not_to raise_error
    end
  end

  context "with 1K ChunkReader" do
    before(:each) do
      @r = ChunkReader.new(@f, 1024)
    end

    describe "#chunk_size=" do
      it "sets the chunk size" do
        @r.chunk_size = 8192
        @r.chunk_size.should == 8192
      end
      # it "sets the chunk shift" do
      #   @r.chunk_size = 8192
      #   @r.chunk_shift.should == 13
      # end
    end

    describe "#read_chunk" do
      it "returns a chunk of the specified length" do
        # `.should` is required; a bare `==` comparison is evaluated and
        # discarded, so the example could never fail.
        @r.read_chunk.bytesize.should == 1024
      end

      it "starts at position 0" do
        @r.pos.should == 0
      end

      it "advances the position" do
        @r.read_chunk
        @r.pos.should == 1024
      end
    end

    describe "#read_chunk_at" do
      it "returns data starting at the specified offset" do
        c = @r.read_chunk_at(59)
        c.start_with?("80082334").should be_true
      end

      it "handles a read starting exactly at a chunk boundary" do
        c = @r.read_chunk_at(1024)
        c.start_with?(" 594").should be_true
      end
    end
  end

  after(:each) do
    @f.close
  end
end
|
107
|
+
|
108
|
+
# Examples shared by parser implementations. `described_class` refers to
# the class described by the example group that includes these examples.
shared_examples "parsers" do

  describe "creation" do
    # Blockless examples are reported as pending by RSpec.
    it "opens a file specified as a String argument"
    it "takes an IO object as an open file"

    it "raises an error when the file does not exist" do
      expect {
        described_class.new("/doesnotexist")
      }.to raise_error(Errno::ENOENT)
    end

    it "raises an error when the file is not in MAF format" do
      expect {
        described_class.new(TestData + '../../Rakefile')
      }.to raise_error
    end
  end

  describe "#header" do
    it "parses the MAF header" do
      p = described_class.new(TestData + 't1.maf')
      p.header.should_not be_nil
    end
  end

  describe "#fetch_blocks" do
    # Fetch-list entries appear to be [offset, length] pairs (the first
    # block starts at 16 and its reported size is 1087) -- TODO confirm
    # against the parser implementation.
    shared_examples_for "any chunk size" do
      it "parses a single block" do
        fl = [[16, 1087]]
        blocks = @p.fetch_blocks(fl).to_a
        blocks.size.should == 1
        blocks[0].offset.should == 16
      end

      it "parses several consecutive blocks" do
        fl = [[16, 1087], [1103, 1908], [3011, 2027]]
        blocks = @p.fetch_blocks(fl).to_a
        blocks.size.should == 3
        blocks.collect { |b| b.offset }.should == [16, 1103, 3011]
      end

      it "parses consecutive blocks further ahead" do
        fl = [[5038, 1647], [6685, 829]]
        blocks = @p.fetch_blocks(fl).to_a
        blocks.size.should == 2
        blocks.collect { |b| b.offset }.should == [5038, 6685]
      end

      it "parses nonconsecutive blocks" do
        fl = [[16, 1087], [3011, 2027]]
        blocks = @p.fetch_blocks(fl).to_a
        blocks.size.should == 2
        blocks.collect { |b| b.offset }.should == [16, 3011]
      end
    end

    context "with 4K chunk size" do
      before(:each) do
        @p = described_class.new(TestData + 'mm8_chr7_tiny.maf',
                                 :chunk_size => 4096,
                                 :random_chunk_size => 4096)
      end
      it_behaves_like "any chunk size"
    end

    context "with 1K chunk size" do
      before(:each) do
        @p = described_class.new(TestData + 'mm8_chr7_tiny.maf',
                                 :chunk_size => 1024,
                                 :random_chunk_size => 1024)
      end
      it_behaves_like "any chunk size"
    end

    context "after parsing to end" do
      before(:each) do
        @p = described_class.new(TestData + 'mm8_chr7_tiny.maf',
                                 :chunk_size => 4096,
                                 :random_chunk_size => 4096)
        # Consume every block first so the fetches run on a parser that
        # has already been iterated to the end.
        @p.each_block { |b| nil }
      end
      it_behaves_like "any chunk size"
    end

    context "with 8M chunk size" do
      before(:each) do
        @p = described_class.new(TestData + 'mm8_chr7_tiny.maf',
                                 :chunk_size => 8 * 1024 * 1024,
                                 :random_chunk_size => 8 * 1024 * 1024)
      end
      it_behaves_like "any chunk size"
    end

    after(:each) do
      @p.f.close
    end
  end

  describe "sequence_filter" do
    before(:each) do
      @p = described_class.new(TestData + 'mm8_mod_a.maf')
    end

    # Close the parser's file handle, as the "#fetch_blocks" group does.
    after(:each) do
      @p.f.close
    end

    it "restricts sequences parsed" do
      @p.sequence_filter = { :only_species => %w(mm8 rn4) }
      @p.parse_block.sequences.size.should == 2
    end

    it "matches at the species delimiter rather than a prefix" do
      @p.sequence_filter = { :only_species => %w(mm8 hg18) }
      @p.parse_block.sequences.size.should == 2
    end
  end

  context "at end of file" do
    describe "#parse_block" do
      it "returns nil"
    end
  end

  describe "#parse_block" do
    it "returns an alignment block" do
      p = described_class.new(TestData + 't1.maf')
      b = p.parse_block()
      b.should_not be_nil
    end
    it "raises an exception for malformed data"
  end

  it "gives the correct number of sequences" do
    p = described_class.new(TestData + 'mm8_chr7_tiny.maf')
    block = p.parse_block
    block.sequences.size.should == 10
  end

  it "handles absent alignment parameters" do
    p = described_class.new(TestData + 'chrY-1block.maf')
    b = p.parse_block()
    b.should_not be_nil
  end

  it "parses larger files" do
    p = described_class.new(TestData + 'mm8_chr7_tiny.maf')
    expect {
      p.each_block { |block| block }
    }.not_to raise_error
  end

  it "handles trailing comments" do
    p = described_class.new(TestData + 't1a.maf')
    expect {
      p.each_block { |block| block }
    }.not_to raise_error
  end

  it "raises an exception on inconsistent sequence length" do
    pending
    ## can't just do string length, have to skip over hyphens
  end

  it "tracks block start offsets correctly" do
    pa = []
    p = described_class.new(TestData + 'mm8_chr7_tiny.maf')
    p.each_block { |b| pa << b.offset }
    pa.should == [16, 1103, 3011, 5038, 6685, 7514, 9022, 10113]
  end

  it "reports block sizes correctly" do
    p = described_class.new(TestData + 'mm8_chr7_tiny.maf')
    block = p.parse_block
    block.size.should == 1087
  end

  it "parses very large blocks" do
    p = described_class.new(TestData + 'big-block.maf')
    n = 0
    p.each_block { |b| n += 1 }
    n.should == 490
  end

end
|
278
|
+
|
279
|
+
# Specs for Parser: the shared "parsers" examples plus Parser-specific
# checks for fetch-list merging and for parsing across chunk boundaries.
describe Parser do
  include_examples "parsers"

  # Runs the given block with constant `sym` on `mod` rebound to `value`,
  # then puts the previous value back even if the block raises.
  #
  # @param mod [Module] module that owns the constant
  # @param sym [Symbol] name of the constant
  # @param value [Object] value the constant holds while the block runs
  # @return the value of the block
  def with_const_value(mod, sym, value)
    old = mod.const_get(sym)
    mod.const_set(sym, value)
    begin
      yield
    ensure
      mod.const_set(sym, old)
    end
  end

  describe "#merge_fetch_list" do
    before(:each) do
      @p = described_class.new(TestData + 'mm8_chr7_tiny.maf')
    end

    it "passes through single records" do
      fl = [[16, 1087]]
      @p.merge_fetch_list(fl).should == [[16, 1087, [16]]]
    end

    it "passes through non-contiguous records" do
      fl = [[16, 1087], [3011, 2027]]
      @p.merge_fetch_list(fl).should == [[16, 1087, [16]],
                                         [3011, 2027, [3011]]]
    end

    it "merges contiguous records" do
      fl = [[16, 1087], [1103, 1908], [3011, 2027]]
      # 1087 + 1908 + 2027 == 5022: one merged record for all three.
      @p.merge_fetch_list(fl).should == [[16, 5022, [16, 1103, 3011]]]
    end

    after(:each) do
      @p.f.close
    end
  end

  it "sets last block position correctly" do
    p = Parser.new(TestData + 'mm8_subset_a.maf')
    begin
      p.last_block_pos.should == 1103
    ensure
      # Close the handle even when the expectation fails.
      p.f.close
    end
  end

  context "with 2k chunk size" do
    before(:each) do
      @p = Parser.new(TestData + 'mm8_chr7_tiny.maf',
                      :chunk_size => 2048)
    end

    # Close the parser's file handle, as "#merge_fetch_list" does.
    after(:each) do
      @p.f.close
    end

    it "yields the correct number of blocks over chunk boundaries" do
      ref_scores = %w(10542.0 -33148.0 87527.0 185399.0
                      30120.0 58255.0 2607.0 8132.0)
      scores = []
      @p.each_block do |block|
        scores << block.vars[:score]
      end
      scores.should == ref_scores
    end

    it "sets last_block_pos correctly" do
      @p.last_block_pos.should == 1103
    end

    it "handles sequence lines over chunk boundaries" do
      # Skip the first block; the second one is the block under test.
      @p.parse_block
      block = @p.parse_block
      break_seq = block.raw_seq(4)
      break_seq.text.size.should == 156
    end

    it "tracks block start offsets correctly over chunk boundaries" do
      pa = []
      @p.each_block { |b| pa << b.offset }
      pa.should == [16, 1103, 3011, 5038, 6685, 7514, 9022, 10113]
    end
  end

end
|
351
|
+
|
352
|
+
end
|
353
|
+
|
354
|
+
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module Bio
|
4
|
+
module MAF
|
5
|
+
|
6
|
+
# Specs for Struct: each example builds a Struct from an ordered list of
# [name, type] pairs and checks the format string it produces.
describe Struct do

  describe "#fmt" do
    it "presents all members in order" do
      s = Struct.new([[:a, :uint16],
                      [:b, :uint32],
                      [:c, :uint32],
                      [:d, :uint8]])
      s.fmt.should == "S>L>L>C"
    end
  end

  describe "#extractor_fmt" do
    it "handles uint8" do
      s = Struct.new([[:marker, :uint8]])
      s.extractor_fmt(:marker).should == "C"
    end

    it "handles uint16" do
      s = Struct.new([[:a, :uint16]])
      s.extractor_fmt(:a).should == "S>"
    end

    it "handles uint32" do
      s = Struct.new([[:a, :uint32]])
      s.extractor_fmt(:a).should == "L>"
    end

    it "handles uint64" do
      s = Struct.new([[:a, :uint64]])
      s.extractor_fmt(:a).should == "Q>"
    end

    # In the "skips" examples the expected format begins with "@N", where
    # N is the byte width of the leading member that is not extracted.
    it "skips uint8" do
      s = Struct.new([[:dummy, :uint8],
                      [:a, :uint64]])
      s.extractor_fmt(:a).should == "@1Q>"
    end

    it "skips uint16" do
      s = Struct.new([[:dummy, :uint16],
                      [:a, :uint64]])
      s.extractor_fmt(:a).should == "@2Q>"
    end

    it "skips uint32" do
      s = Struct.new([[:dummy, :uint32],
                      [:a, :uint64]])
      s.extractor_fmt(:a).should == "@4Q>"
    end

    it "skips uint64" do
      s = Struct.new([[:dummy, :uint64],
                      [:a, :uint64]])
      s.extractor_fmt(:a).should == "@8Q>"
    end

    it "extracts multiple leading elements" do
      s = Struct.new([[:a, :uint16],
                      [:b, :uint32],
                      [:c, :uint32]])
      s.extractor_fmt(:a, :b).should == "S>L>"
    end

    it "extracts multiple offset elements" do
      s = Struct.new([[:a, :uint16],
                      [:b, :uint32],
                      [:c, :uint32],
                      [:d, :uint8]])
      s.extractor_fmt(:b, :c).should == "@2L>L>"
    end
  end

end
|
73
|
+
|
74
|
+
end
|
75
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# Load simplecov unless the TRAVIS environment variable is set or
# RUBY_PLATFORM is 'java'. A missing simplecov is reported on stderr and
# does not abort the run.
coverage_wanted = !ENV.key?('TRAVIS') && RUBY_PLATFORM != 'java'
if coverage_wanted
  begin
    require 'simplecov'
  rescue LoadError => load_failure
    $stderr.puts "WARNING: could not require 'simplecov': #{load_failure}"
  end
end

require 'rspec'
require 'pathname'

require 'bio-maf'

# Fixture directory used by the specs, located relative to this file.
TestData = Pathname.new(__FILE__) + '../../test/data'
|