bio-maf 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/.simplecov +1 -0
  3. data/.travis.yml +16 -0
  4. data/.yardopts +3 -0
  5. data/DEVELOPMENT.md +40 -0
  6. data/Gemfile +23 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +209 -0
  9. data/Rakefile +76 -0
  10. data/VERSION +1 -0
  11. data/benchmarks/dispatch_bench +53 -0
  12. data/benchmarks/iter_bench +44 -0
  13. data/benchmarks/read_bench +40 -0
  14. data/benchmarks/sort_bench +33 -0
  15. data/benchmarks/split_bench +33 -0
  16. data/bin/maf_count +82 -0
  17. data/bin/maf_dump_blocks +27 -0
  18. data/bin/maf_extract_ranges_count +44 -0
  19. data/bin/maf_index +88 -0
  20. data/bin/maf_parse_bench +94 -0
  21. data/bin/maf_to_fasta +68 -0
  22. data/bin/maf_write +84 -0
  23. data/bin/random_ranges +35 -0
  24. data/features/maf-indexing.feature +31 -0
  25. data/features/maf-output.feature +29 -0
  26. data/features/maf-parsing.feature +44 -0
  27. data/features/maf-querying.feature +75 -0
  28. data/features/maf-to-fasta.feature +50 -0
  29. data/features/step_definitions/convert_steps.rb +45 -0
  30. data/features/step_definitions/index_steps.rb +20 -0
  31. data/features/step_definitions/output_steps.rb +27 -0
  32. data/features/step_definitions/parse_steps.rb +63 -0
  33. data/features/step_definitions/query_steps.rb +31 -0
  34. data/features/step_definitions/ucsc_bin_steps.rb +14 -0
  35. data/features/support/env.rb +16 -0
  36. data/features/ucsc-bins.feature +24 -0
  37. data/lib/bio/maf/index.rb +620 -0
  38. data/lib/bio/maf/parser.rb +888 -0
  39. data/lib/bio/maf/struct.rb +63 -0
  40. data/lib/bio/maf/writer.rb +63 -0
  41. data/lib/bio/maf.rb +4 -0
  42. data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
  43. data/lib/bio/ucsc/ucsc_bin.rb +117 -0
  44. data/lib/bio/ucsc.rb +2 -0
  45. data/lib/bio-maf/maf.rb +3 -0
  46. data/lib/bio-maf.rb +12 -0
  47. data/man/.gitignore +1 -0
  48. data/man/maf_index.1 +105 -0
  49. data/man/maf_index.1.markdown +97 -0
  50. data/man/maf_index.1.ronn +83 -0
  51. data/man/maf_to_fasta.1 +53 -0
  52. data/man/maf_to_fasta.1.ronn +51 -0
  53. data/spec/bio/maf/index_spec.rb +363 -0
  54. data/spec/bio/maf/parser_spec.rb +354 -0
  55. data/spec/bio/maf/struct_spec.rb +75 -0
  56. data/spec/spec_helper.rb +14 -0
  57. data/test/data/big-block.maf +15999 -0
  58. data/test/data/chr22_ieq.maf +11 -0
  59. data/test/data/chrY-1block.maf +6 -0
  60. data/test/data/empty +0 -0
  61. data/test/data/empty.db +0 -0
  62. data/test/data/mm8_chr7_tiny.kct +0 -0
  63. data/test/data/mm8_chr7_tiny.maf +76 -0
  64. data/test/data/mm8_mod_a.maf +7 -0
  65. data/test/data/mm8_single.maf +13 -0
  66. data/test/data/mm8_subset_a.maf +23 -0
  67. data/test/data/t1-bad1.maf +15 -0
  68. data/test/data/t1.fasta +12 -0
  69. data/test/data/t1.maf +15 -0
  70. data/test/data/t1a.maf +17 -0
  71. data/test/helper.rb +18 -0
  72. data/test/test_bio-maf.rb +7 -0
  73. data/travis-ci/install_kc +13 -0
  74. data/travis-ci/install_kc_java +13 -0
  75. data/travis-ci/report_errors +4 -0
  76. metadata +181 -0
@@ -0,0 +1,354 @@
1
+ require 'spec_helper'
2
+
3
+ module Bio
4
+ module MAF
5
+
6
+ describe Header do
7
+ before(:each) do
8
+ @p = Parser.new(TestData + 't1.maf')
9
+ end
10
+
11
+ it "provides version information" do
12
+ @p.header.version.should == '1'
13
+ end
14
+ it "provides the scoring scheme" do
15
+ @p.header.scoring.should == 'humor.v4'
16
+ end
17
+ it "provides alignment parameters" do
18
+ @p.header.alignment_params.should =~ /humor.v4 R=30/
19
+ end
20
+
21
+ it "presents multiline parameters correctly" do
22
+ @p.header.alignment_params.should == "humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet300/chr1.maf /cluster/data/hg15/bed/blastz.rn3/axtNet300/chr1.maf"
23
+ end
24
+
25
+ it "provides arbitrary parameters"
26
+ end
27
+
28
+ describe ParseContext do
29
+ it "tracks the last block position"
30
+ end
31
+
32
+ describe ChunkReader do
33
+ before(:each) do
34
+ @f = (TestData + 'mm8_chr7_tiny.maf').open
35
+ end
36
+ describe "#initialize" do
37
+ it "rejects a chunk size of zero" do
38
+ expect {
39
+ ChunkReader.new(@f, 0)
40
+ }.to raise_error(/Invalid chunk size/)
41
+ end
42
+ it "rejects a negative chunk size" do
43
+ expect {
44
+ ChunkReader.new(@f, 0)
45
+ }.to raise_error(/Invalid chunk size/)
46
+ end
47
+ it "rejects a chunk size not a power of 2" do
48
+ expect {
49
+ ChunkReader.new(@f, 1000)
50
+ }.to raise_error(/Invalid chunk size/)
51
+ end
52
+ it "accepts a 4k chunk size" do
53
+ expect {
54
+ ChunkReader.new(@f, 4096)
55
+ }.not_to raise_error
56
+ end
57
+ it "accepts an 8M chunk size" do
58
+ expect {
59
+ ChunkReader.new(@f, 8 * 1024 * 1024)
60
+ }.not_to raise_error
61
+ end
62
+ end
63
+ context "with 1K ChunkReader" do
64
+ before(:each) do
65
+ @r = ChunkReader.new(@f, 1024)
66
+ end
67
+
68
+ describe "#chunk_size=" do
69
+ it "sets the chunk size" do
70
+ @r.chunk_size = 8192
71
+ @r.chunk_size.should == 8192
72
+ end
73
+ # it "sets the chunk shift" do
74
+ # @r.chunk_size = 8192
75
+ # @r.chunk_shift.should == 13
76
+ # end
77
+ end
78
+
79
+ describe "#read_chunk" do
80
+ it "returns a chunk of the specified length" do
81
+ @r.read_chunk.bytesize == 1024
82
+ end
83
+ it "starts at position 0" do
84
+ @r.pos.should == 0
85
+ end
86
+ it "advances the position" do
87
+ @r.read_chunk
88
+ @r.pos.should == 1024
89
+ end
90
+ end
91
+
92
+ describe "#read_chunk_at" do
93
+ it "returns data starting at the specified offset" do
94
+ c = @r.read_chunk_at(59)
95
+ c.start_with?("80082334").should be_true
96
+ end
97
+ it "handles a read starting exactly at a chunk boundary" do
98
+ c = @r.read_chunk_at(1024)
99
+ c.start_with?(" 594").should be_true
100
+ end
101
+ end
102
+ end
103
+ after(:each) do
104
+ @f.close
105
+ end
106
+ end
107
+
108
+ shared_examples "parsers" do
109
+
110
+ describe "creation" do
111
+ it "opens a file specified as a String argument"
112
+ it "takes an IO object as an open file"
113
+ it "raises an error when the file does not exist" do
114
+ expect {
115
+ described_class.new("/doesnotexist")
116
+ }.to raise_error(Errno::ENOENT)
117
+ end
118
+ it "raises an error when the file is not in MAF format" do
119
+ expect {
120
+ described_class.new(TestData + '../../Rakefile')
121
+ }.to raise_error
122
+ end
123
+ end
124
+
125
+ describe "#header" do
126
+ it "parses the MAF header" do
127
+ p = described_class.new(TestData + 't1.maf')
128
+ p.header.should_not be_nil
129
+ end
130
+ end
131
+
132
+ describe "#fetch_blocks" do
133
+ shared_examples_for "any chunk size" do
134
+ it "parses a single block" do
135
+ fl = [[16, 1087]]
136
+ blocks = @p.fetch_blocks(fl).to_a
137
+ blocks.size.should == 1
138
+ blocks[0].offset.should == 16
139
+ end
140
+ it "parses several consecutive blocks" do
141
+ fl = [[16, 1087], [1103, 1908], [3011, 2027]]
142
+ blocks = @p.fetch_blocks(fl).to_a
143
+ blocks.size.should == 3
144
+ blocks.collect {|b| b.offset}.should == [16, 1103, 3011]
145
+ end
146
+ it "parses consecutive blocks further ahead" do
147
+ fl = [[5038, 1647], [6685, 829]]
148
+ blocks = @p.fetch_blocks(fl).to_a
149
+ blocks.size.should == 2
150
+ blocks.collect {|b| b.offset}.should == [5038, 6685]
151
+ end
152
+ it "parses nonconsecutive blocks" do
153
+ fl = [[16, 1087], [3011, 2027]]
154
+ blocks = @p.fetch_blocks(fl).to_a
155
+ blocks.size.should == 2
156
+ blocks.collect {|b| b.offset}.should == [16, 3011]
157
+ end
158
+ end
159
+ context "with 4K chunk size" do
160
+ before(:each) do
161
+ @p = described_class.new(TestData + 'mm8_chr7_tiny.maf',
162
+ :chunk_size => 4096,
163
+ :random_chunk_size => 4096)
164
+ end
165
+ it_behaves_like "any chunk size"
166
+ end
167
+ context "with 1K chunk size" do
168
+ before(:each) do
169
+ @p = described_class.new(TestData + 'mm8_chr7_tiny.maf',
170
+ :chunk_size => 1024,
171
+ :random_chunk_size => 1024)
172
+ end
173
+ it_behaves_like "any chunk size"
174
+ end
175
+ context "after parsing to end" do
176
+ before(:each) do
177
+ @p = described_class.new(TestData + 'mm8_chr7_tiny.maf',
178
+ :chunk_size => 4096,
179
+ :random_chunk_size => 4096)
180
+ @p.each_block { |b| nil }
181
+ end
182
+ it_behaves_like "any chunk size"
183
+ end
184
+ context "with 8M chunk size" do
185
+ before(:each) do
186
+ @p = described_class.new(TestData + 'mm8_chr7_tiny.maf',
187
+ :chunk_size => 8 * 1024 * 1024,
188
+ :random_chunk_size => 8 * 1024 * 1024)
189
+ end
190
+ it_behaves_like "any chunk size"
191
+ end
192
+ after(:each) do
193
+ @p.f.close
194
+ end
195
+ end
196
+
197
+ describe "sequence_filter" do
198
+ before(:each) do
199
+ @p = described_class.new(TestData + 'mm8_mod_a.maf')
200
+ end
201
+ it "restricts sequences parsed" do
202
+ @p.sequence_filter = { :only_species => %w(mm8 rn4) }
203
+ @p.parse_block.sequences.size.should == 2
204
+ end
205
+ it "matches at the species delimiter rather than a prefix" do
206
+ @p.sequence_filter = { :only_species => %w(mm8 hg18) }
207
+ @p.parse_block.sequences.size.should == 2
208
+ end
209
+ end
210
+
211
+ context "at end of file" do
212
+ describe "#parse_block" do
213
+ it "returns nil"
214
+ end
215
+ end
216
+
217
+ describe "#parse_block" do
218
+ it "returns an alignment block" do
219
+ p = described_class.new(TestData + 't1.maf')
220
+ b = p.parse_block()
221
+ b.should_not be_nil
222
+ end
223
+ it "raises an exception for malformed data"
224
+ end
225
+
226
+ it "gives the correct number of sequences" do
227
+ p = described_class.new(TestData + 'mm8_chr7_tiny.maf')
228
+ block = p.parse_block
229
+ block.sequences.size.should == 10
230
+ end
231
+
232
+ it "handles absent alignment parameters" do
233
+ p = described_class.new(TestData + 'chrY-1block.maf')
234
+ b = p.parse_block()
235
+ b.should_not be_nil
236
+ end
237
+
238
+ it "parses larger files" do
239
+ p = described_class.new(TestData + 'mm8_chr7_tiny.maf')
240
+ expect {
241
+ p.each_block { |block| block }
242
+ }.not_to raise_error
243
+ end
244
+
245
+ it "handles trailing comments" do
246
+ p = described_class.new(TestData + 't1a.maf')
247
+ expect {
248
+ p.each_block { |block| block }
249
+ }.not_to raise_error
250
+ end
251
+
252
+ it "raises an exception on inconsistent sequence length" do
253
+ pending
254
+ ## can't just do string length, have to skip over hyphens
255
+ end
256
+
257
+ it "tracks block start offsets correctly" do
258
+ pa = []
259
+ p = described_class.new(TestData + 'mm8_chr7_tiny.maf')
260
+ p.each_block { |b| pa << b.offset }
261
+ pa.should == [16, 1103, 3011, 5038, 6685, 7514, 9022, 10113]
262
+ end
263
+
264
+ it "reports block sizes correctly" do
265
+ p = described_class.new(TestData + 'mm8_chr7_tiny.maf')
266
+ block = p.parse_block
267
+ block.size.should == 1087
268
+ end
269
+
270
+ it "parses very large blocks" do
271
+ p = described_class.new(TestData + 'big-block.maf')
272
+ n = 0
273
+ p.each_block { |b| n += 1 }
274
+ n.should == 490
275
+ end
276
+
277
+ end
278
+
279
+ describe Parser do
280
+ include_examples "parsers"
281
+
282
+ def with_const_value(mod, sym, value)
283
+ old = mod.const_get(sym)
284
+ mod.const_set(sym, value)
285
+ begin
286
+ yield
287
+ ensure
288
+ mod.const_set(sym, old)
289
+ end
290
+ end
291
+
292
+ describe "#merge_fetch_list" do
293
+ before(:each) do
294
+ @p = described_class.new(TestData + 'mm8_chr7_tiny.maf')
295
+ end
296
+ it "passes through single records" do
297
+ fl = [[16, 1087]]
298
+ @p.merge_fetch_list(fl).should == [[16, 1087, [16]]]
299
+ end
300
+ it "passes through non-contiguous records" do
301
+ fl = [[16, 1087], [3011, 2027]]
302
+ @p.merge_fetch_list(fl).should == [[16, 1087, [16]],
303
+ [3011, 2027, [3011]]]
304
+ end
305
+ it "merges contiguous records" do
306
+ fl = [[16, 1087], [1103, 1908], [3011, 2027]]
307
+ @p.merge_fetch_list(fl).should == [[16, 5022, [16, 1103, 3011]]]
308
+ end
309
+ after(:each) do
310
+ @p.f.close
311
+ end
312
+ end
313
+
314
+ it "sets last block position correctly" do
315
+ p = Parser.new(TestData + 'mm8_subset_a.maf')
316
+ p.last_block_pos.should == 1103
317
+ end
318
+
319
+ context "with 2k chunk size" do
320
+ before(:each) do
321
+ @p = Parser.new(TestData + 'mm8_chr7_tiny.maf',
322
+ :chunk_size => 2048)
323
+ end
324
+ it "yields the correct number of blocks over chunk boundaries" do
325
+ ref_scores = %w(10542.0 -33148.0 87527.0 185399.0
326
+ 30120.0 58255.0 2607.0 8132.0)
327
+ scores = []
328
+ @p.each_block do |block|
329
+ scores << block.vars[:score]
330
+ end
331
+ scores.should == ref_scores
332
+ end
333
+ it "sets last_block_pos correctly" do
334
+ @p.last_block_pos.should == 1103
335
+ end
336
+ it "handles sequence lines over chunk boundaries" do
337
+ @p.parse_block
338
+ block = @p.parse_block
339
+ break_seq = block.raw_seq(4)
340
+ break_seq.text.size.should == 156
341
+ end
342
+
343
+ it "tracks block start offsets correctly over chunk boundaries" do
344
+ pa = []
345
+ @p.each_block { |b| pa << b.offset }
346
+ pa.should == [16, 1103, 3011, 5038, 6685, 7514, 9022, 10113]
347
+ end
348
+ end
349
+
350
+ end
351
+
352
+ end
353
+
354
+ end
@@ -0,0 +1,75 @@
1
+ require 'spec_helper'
2
+
3
+ module Bio
4
+ module MAF
5
+
6
+ describe Struct do
7
+
8
+ describe "#fmt" do
9
+ it "presents all members in order" do
10
+ @s = Struct.new([[:a, :uint16],
11
+ [:b, :uint32],
12
+ [:c, :uint32],
13
+ [:d, :uint8]])
14
+ @s.fmt.should == "S>L>L>C"
15
+ end
16
+ end
17
+
18
+ describe "#extractor_fmt" do
19
+ it "handles uint8" do
20
+ @s = Struct.new([[:marker, :uint8]])
21
+ @s.extractor_fmt(:marker).should == "C"
22
+ end
23
+ it "handles uint16" do
24
+ @s = Struct.new([[:a, :uint16]])
25
+ @s.extractor_fmt(:a).should == "S>"
26
+ end
27
+ it "handles uint32" do
28
+ @s = Struct.new([[:a, :uint32]])
29
+ @s.extractor_fmt(:a).should == "L>"
30
+ end
31
+ it "handles uint64" do
32
+ @s = Struct.new([[:a, :uint64]])
33
+ @s.extractor_fmt(:a).should == "Q>"
34
+ end
35
+ it "skips uint8" do
36
+ @s = Struct.new([[:dummy, :uint8],
37
+ [:a, :uint64]])
38
+ @s.extractor_fmt(:a).should == "@1Q>"
39
+ end
40
+ it "skips uint16" do
41
+ @s = Struct.new([[:dummy, :uint16],
42
+ [:a, :uint64]])
43
+ @s.extractor_fmt(:a).should == "@2Q>"
44
+ end
45
+ it "skips uint32" do
46
+ @s = Struct.new([[:dummy, :uint32],
47
+ [:a, :uint64]])
48
+ @s.extractor_fmt(:a).should == "@4Q>"
49
+ end
50
+ it "skips uint64" do
51
+ @s = Struct.new([[:dummy, :uint64],
52
+ [:a, :uint64]])
53
+ @s.extractor_fmt(:a).should == "@8Q>"
54
+ end
55
+ it "extracts multiple leading elements" do
56
+ @s = Struct.new([[:a, :uint16],
57
+ [:b, :uint32],
58
+ [:c, :uint32]])
59
+ @s.extractor_fmt(:a, :b).should == "S>L>"
60
+ end
61
+ it "extracts multiple offset elements" do
62
+ @s = Struct.new([[:a, :uint16],
63
+ [:b, :uint32],
64
+ [:c, :uint32],
65
+ [:d, :uint8]])
66
+ @s.extractor_fmt(:b, :c).should == "@2L>L>"
67
+ end
68
+ end
69
+
70
+ describe ""
71
+
72
+ end
73
+
74
+ end
75
+ end
@@ -0,0 +1,14 @@
1
+ unless ENV.has_key?('TRAVIS') || RUBY_PLATFORM == 'java'
2
+ begin
3
+ require 'simplecov'
4
+ rescue LoadError
5
+ $stderr.puts "WARNING: could not require 'simplecov': #{$!}"
6
+ end
7
+ end
8
+
9
+ require 'rspec'
10
+ require 'pathname'
11
+
12
+ require 'bio-maf'
13
+
14
+ TestData = Pathname.new(__FILE__) + '../../test/data'