bio-maf 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/.simplecov +1 -0
  3. data/.travis.yml +16 -0
  4. data/.yardopts +3 -0
  5. data/DEVELOPMENT.md +40 -0
  6. data/Gemfile +23 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +209 -0
  9. data/Rakefile +76 -0
  10. data/VERSION +1 -0
  11. data/benchmarks/dispatch_bench +53 -0
  12. data/benchmarks/iter_bench +44 -0
  13. data/benchmarks/read_bench +40 -0
  14. data/benchmarks/sort_bench +33 -0
  15. data/benchmarks/split_bench +33 -0
  16. data/bin/maf_count +82 -0
  17. data/bin/maf_dump_blocks +27 -0
  18. data/bin/maf_extract_ranges_count +44 -0
  19. data/bin/maf_index +88 -0
  20. data/bin/maf_parse_bench +94 -0
  21. data/bin/maf_to_fasta +68 -0
  22. data/bin/maf_write +84 -0
  23. data/bin/random_ranges +35 -0
  24. data/features/maf-indexing.feature +31 -0
  25. data/features/maf-output.feature +29 -0
  26. data/features/maf-parsing.feature +44 -0
  27. data/features/maf-querying.feature +75 -0
  28. data/features/maf-to-fasta.feature +50 -0
  29. data/features/step_definitions/convert_steps.rb +45 -0
  30. data/features/step_definitions/index_steps.rb +20 -0
  31. data/features/step_definitions/output_steps.rb +27 -0
  32. data/features/step_definitions/parse_steps.rb +63 -0
  33. data/features/step_definitions/query_steps.rb +31 -0
  34. data/features/step_definitions/ucsc_bin_steps.rb +14 -0
  35. data/features/support/env.rb +16 -0
  36. data/features/ucsc-bins.feature +24 -0
  37. data/lib/bio/maf/index.rb +620 -0
  38. data/lib/bio/maf/parser.rb +888 -0
  39. data/lib/bio/maf/struct.rb +63 -0
  40. data/lib/bio/maf/writer.rb +63 -0
  41. data/lib/bio/maf.rb +4 -0
  42. data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
  43. data/lib/bio/ucsc/ucsc_bin.rb +117 -0
  44. data/lib/bio/ucsc.rb +2 -0
  45. data/lib/bio-maf/maf.rb +3 -0
  46. data/lib/bio-maf.rb +12 -0
  47. data/man/.gitignore +1 -0
  48. data/man/maf_index.1 +105 -0
  49. data/man/maf_index.1.markdown +97 -0
  50. data/man/maf_index.1.ronn +83 -0
  51. data/man/maf_to_fasta.1 +53 -0
  52. data/man/maf_to_fasta.1.ronn +51 -0
  53. data/spec/bio/maf/index_spec.rb +363 -0
  54. data/spec/bio/maf/parser_spec.rb +354 -0
  55. data/spec/bio/maf/struct_spec.rb +75 -0
  56. data/spec/spec_helper.rb +14 -0
  57. data/test/data/big-block.maf +15999 -0
  58. data/test/data/chr22_ieq.maf +11 -0
  59. data/test/data/chrY-1block.maf +6 -0
  60. data/test/data/empty +0 -0
  61. data/test/data/empty.db +0 -0
  62. data/test/data/mm8_chr7_tiny.kct +0 -0
  63. data/test/data/mm8_chr7_tiny.maf +76 -0
  64. data/test/data/mm8_mod_a.maf +7 -0
  65. data/test/data/mm8_single.maf +13 -0
  66. data/test/data/mm8_subset_a.maf +23 -0
  67. data/test/data/t1-bad1.maf +15 -0
  68. data/test/data/t1.fasta +12 -0
  69. data/test/data/t1.maf +15 -0
  70. data/test/data/t1a.maf +17 -0
  71. data/test/helper.rb +18 -0
  72. data/test/test_bio-maf.rb +7 -0
  73. data/travis-ci/install_kc +13 -0
  74. data/travis-ci/install_kc_java +13 -0
  75. data/travis-ci/report_errors +4 -0
  76. metadata +181 -0
@@ -0,0 +1,354 @@
1
+ require 'spec_helper'
2
+
3
+ module Bio
4
+ module MAF
5
+
6
# Specs for the MAF header exposed through Parser#header, exercised
# against the t1.maf fixture.
describe Header do
  before(:each) do
    @p = Parser.new(TestData + 't1.maf')
  end

  # Release the fixture's file handle after every example so that the
  # suite does not accumulate open descriptors.
  after(:each) do
    @p.f.close
  end

  it "provides version information" do
    @p.header.version.should == '1'
  end
  it "provides the scoring scheme" do
    @p.header.scoring.should == 'humor.v4'
  end
  it "provides alignment parameters" do
    @p.header.alignment_params.should =~ /humor.v4 R=30/
  end

  # The expected value is a single space-joined line, i.e. the parameter
  # text is compared after any line continuation has been flattened.
  it "presents multiline parameters correctly" do
    @p.header.alignment_params.should == "humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet300/chr1.maf /cluster/data/hg15/bed/blastz.rn3/axtNet300/chr1.maf"
  end

  # Pending: no body yet.
  it "provides arbitrary parameters"
end
27
+
28
+ describe ParseContext do
29
+ it "tracks the last block position"
30
+ end
31
+
32
+ describe ChunkReader do
33
+ before(:each) do
34
+ @f = (TestData + 'mm8_chr7_tiny.maf').open
35
+ end
36
+ describe "#initialize" do
37
+ it "rejects a chunk size of zero" do
38
+ expect {
39
+ ChunkReader.new(@f, 0)
40
+ }.to raise_error(/Invalid chunk size/)
41
+ end
42
it "rejects a negative chunk size" do
  expect {
    # Use a genuinely negative size (zero has its own example). The
    # magnitude is a power of two, so the sign is the only thing wrong
    # with this argument.
    ChunkReader.new(@f, -1024)
  }.to raise_error(/Invalid chunk size/)
end
47
+ it "rejects a chunk size not a power of 2" do
48
+ expect {
49
+ ChunkReader.new(@f, 1000)
50
+ }.to raise_error(/Invalid chunk size/)
51
+ end
52
+ it "accepts a 4k chunk size" do
53
+ expect {
54
+ ChunkReader.new(@f, 4096)
55
+ }.not_to raise_error
56
+ end
57
+ it "accepts an 8M chunk size" do
58
+ expect {
59
+ ChunkReader.new(@f, 8 * 1024 * 1024)
60
+ }.not_to raise_error
61
+ end
62
+ end
63
+ context "with 1K ChunkReader" do
64
+ before(:each) do
65
+ @r = ChunkReader.new(@f, 1024)
66
+ end
67
+
68
+ describe "#chunk_size=" do
69
+ it "sets the chunk size" do
70
+ @r.chunk_size = 8192
71
+ @r.chunk_size.should == 8192
72
+ end
73
+ # it "sets the chunk shift" do
74
+ # @r.chunk_size = 8192
75
+ # @r.chunk_shift.should == 13
76
+ # end
77
+ end
78
+
79
+ describe "#read_chunk" do
80
it "returns a chunk of the specified length" do
  # The expectation needs `should`: a bare `==` is evaluated and then
  # discarded, which made this example pass unconditionally.
  @r.read_chunk.bytesize.should == 1024
end
83
+ it "starts at position 0" do
84
+ @r.pos.should == 0
85
+ end
86
+ it "advances the position" do
87
+ @r.read_chunk
88
+ @r.pos.should == 1024
89
+ end
90
+ end
91
+
92
+ describe "#read_chunk_at" do
93
+ it "returns data starting at the specified offset" do
94
+ c = @r.read_chunk_at(59)
95
+ c.start_with?("80082334").should be_true
96
+ end
97
+ it "handles a read starting exactly at a chunk boundary" do
98
+ c = @r.read_chunk_at(1024)
99
+ c.start_with?(" 594").should be_true
100
+ end
101
+ end
102
+ end
103
+ after(:each) do
104
+ @f.close
105
+ end
106
+ end
107
+
108
+ shared_examples "parsers" do
109
+
110
+ describe "creation" do
111
+ it "opens a file specified as a String argument"
112
+ it "takes an IO object as an open file"
113
+ it "raises an error when the file does not exist" do
114
+ expect {
115
+ described_class.new("/doesnotexist")
116
+ }.to raise_error(Errno::ENOENT)
117
+ end
118
+ it "raises an error when the file is not in MAF format" do
119
+ expect {
120
+ described_class.new(TestData + '../../Rakefile')
121
+ }.to raise_error
122
+ end
123
+ end
124
+
125
+ describe "#header" do
126
+ it "parses the MAF header" do
127
+ p = described_class.new(TestData + 't1.maf')
128
+ p.header.should_not be_nil
129
+ end
130
+ end
131
+
132
+ describe "#fetch_blocks" do
133
+ shared_examples_for "any chunk size" do
134
+ it "parses a single block" do
135
+ fl = [[16, 1087]]
136
+ blocks = @p.fetch_blocks(fl).to_a
137
+ blocks.size.should == 1
138
+ blocks[0].offset.should == 16
139
+ end
140
+ it "parses several consecutive blocks" do
141
+ fl = [[16, 1087], [1103, 1908], [3011, 2027]]
142
+ blocks = @p.fetch_blocks(fl).to_a
143
+ blocks.size.should == 3
144
+ blocks.collect {|b| b.offset}.should == [16, 1103, 3011]
145
+ end
146
+ it "parses consecutive blocks further ahead" do
147
+ fl = [[5038, 1647], [6685, 829]]
148
+ blocks = @p.fetch_blocks(fl).to_a
149
+ blocks.size.should == 2
150
+ blocks.collect {|b| b.offset}.should == [5038, 6685]
151
+ end
152
+ it "parses nonconsecutive blocks" do
153
+ fl = [[16, 1087], [3011, 2027]]
154
+ blocks = @p.fetch_blocks(fl).to_a
155
+ blocks.size.should == 2
156
+ blocks.collect {|b| b.offset}.should == [16, 3011]
157
+ end
158
+ end
159
+ context "with 4K chunk size" do
160
+ before(:each) do
161
+ @p = described_class.new(TestData + 'mm8_chr7_tiny.maf',
162
+ :chunk_size => 4096,
163
+ :random_chunk_size => 4096)
164
+ end
165
+ it_behaves_like "any chunk size"
166
+ end
167
+ context "with 1K chunk size" do
168
+ before(:each) do
169
+ @p = described_class.new(TestData + 'mm8_chr7_tiny.maf',
170
+ :chunk_size => 1024,
171
+ :random_chunk_size => 1024)
172
+ end
173
+ it_behaves_like "any chunk size"
174
+ end
175
+ context "after parsing to end" do
176
+ before(:each) do
177
+ @p = described_class.new(TestData + 'mm8_chr7_tiny.maf',
178
+ :chunk_size => 4096,
179
+ :random_chunk_size => 4096)
180
+ @p.each_block { |b| nil }
181
+ end
182
+ it_behaves_like "any chunk size"
183
+ end
184
+ context "with 8M chunk size" do
185
+ before(:each) do
186
+ @p = described_class.new(TestData + 'mm8_chr7_tiny.maf',
187
+ :chunk_size => 8 * 1024 * 1024,
188
+ :random_chunk_size => 8 * 1024 * 1024)
189
+ end
190
+ it_behaves_like "any chunk size"
191
+ end
192
+ after(:each) do
193
+ @p.f.close
194
+ end
195
+ end
196
+
197
# Checks that a sequence filter limits which sequences a parsed block
# contains, using the mm8_mod_a.maf fixture.
describe "sequence_filter" do
  before(:each) do
    @p = described_class.new(TestData + 'mm8_mod_a.maf')
  end

  # Close the parser's file handle after each example so descriptors
  # are not leaked across the suite.
  after(:each) do
    @p.f.close
  end

  it "restricts sequences parsed" do
    @p.sequence_filter = { :only_species => %w(mm8 rn4) }
    @p.parse_block.sequences.size.should == 2
  end
  it "matches at the species delimiter rather than a prefix" do
    @p.sequence_filter = { :only_species => %w(mm8 hg18) }
    @p.parse_block.sequences.size.should == 2
  end
end
210
+
211
+ context "at end of file" do
212
+ describe "#parse_block" do
213
+ it "returns nil"
214
+ end
215
+ end
216
+
217
+ describe "#parse_block" do
218
+ it "returns an alignment block" do
219
+ p = described_class.new(TestData + 't1.maf')
220
+ b = p.parse_block()
221
+ b.should_not be_nil
222
+ end
223
+ it "raises an exception for malformed data"
224
+ end
225
+
226
+ it "gives the correct number of sequences" do
227
+ p = described_class.new(TestData + 'mm8_chr7_tiny.maf')
228
+ block = p.parse_block
229
+ block.sequences.size.should == 10
230
+ end
231
+
232
+ it "handles absent alignment parameters" do
233
+ p = described_class.new(TestData + 'chrY-1block.maf')
234
+ b = p.parse_block()
235
+ b.should_not be_nil
236
+ end
237
+
238
+ it "parses larger files" do
239
+ p = described_class.new(TestData + 'mm8_chr7_tiny.maf')
240
+ expect {
241
+ p.each_block { |block| block }
242
+ }.not_to raise_error
243
+ end
244
+
245
+ it "handles trailing comments" do
246
+ p = described_class.new(TestData + 't1a.maf')
247
+ expect {
248
+ p.each_block { |block| block }
249
+ }.not_to raise_error
250
+ end
251
+
252
+ it "raises an exception on inconsistent sequence length" do
253
+ pending
254
+ ## can't just do string length, have to skip over hyphens
255
+ end
256
+
257
+ it "tracks block start offsets correctly" do
258
+ pa = []
259
+ p = described_class.new(TestData + 'mm8_chr7_tiny.maf')
260
+ p.each_block { |b| pa << b.offset }
261
+ pa.should == [16, 1103, 3011, 5038, 6685, 7514, 9022, 10113]
262
+ end
263
+
264
+ it "reports block sizes correctly" do
265
+ p = described_class.new(TestData + 'mm8_chr7_tiny.maf')
266
+ block = p.parse_block
267
+ block.size.should == 1087
268
+ end
269
+
270
+ it "parses very large blocks" do
271
+ p = described_class.new(TestData + 'big-block.maf')
272
+ n = 0
273
+ p.each_block { |b| n += 1 }
274
+ n.should == 490
275
+ end
276
+
277
+ end
278
+
279
+ describe Parser do
280
+ include_examples "parsers"
281
+
282
# Temporarily rebinds the constant +sym+ on +mod+ to +value+ while the
# given block runs, then restores the previous value, even if the block
# raises.
#
# @param mod [Module] module on which the constant is read and set
# @param sym [Symbol] name of the constant
# @param value [Object] value the constant holds during the block
# @return [Object] whatever the block returns
# @raise [NameError] if +sym+ cannot be resolved on +mod+ beforehand
def with_const_value(mod, sym, value)
  old = mod.const_get(sym)
  rebind = lambda do |v|
    # const_set over a live constant makes Ruby print an
    # "already initialized constant" warning, so drop the binding first.
    # The `false` restricts the check to constants defined directly on
    # +mod+, which are the only ones remove_const can remove.
    mod.send(:remove_const, sym) if mod.const_defined?(sym, false)
    mod.const_set(sym, v)
  end
  rebind.call(value)
  begin
    yield
  ensure
    rebind.call(old)
  end
end
291
+
292
+ describe "#merge_fetch_list" do
293
+ before(:each) do
294
+ @p = described_class.new(TestData + 'mm8_chr7_tiny.maf')
295
+ end
296
+ it "passes through single records" do
297
+ fl = [[16, 1087]]
298
+ @p.merge_fetch_list(fl).should == [[16, 1087, [16]]]
299
+ end
300
+ it "passes through non-contiguous records" do
301
+ fl = [[16, 1087], [3011, 2027]]
302
+ @p.merge_fetch_list(fl).should == [[16, 1087, [16]],
303
+ [3011, 2027, [3011]]]
304
+ end
305
+ it "merges contiguous records" do
306
+ fl = [[16, 1087], [1103, 1908], [3011, 2027]]
307
+ @p.merge_fetch_list(fl).should == [[16, 5022, [16, 1103, 3011]]]
308
+ end
309
+ after(:each) do
310
+ @p.f.close
311
+ end
312
+ end
313
+
314
+ it "sets last block position correctly" do
315
+ p = Parser.new(TestData + 'mm8_subset_a.maf')
316
+ p.last_block_pos.should == 1103
317
+ end
318
+
319
# Runs the parser with a 2048-byte chunk size; the examples below check
# behaviour across chunk boundaries in the mm8_chr7_tiny.maf fixture.
context "with 2k chunk size" do
  before(:each) do
    @p = Parser.new(TestData + 'mm8_chr7_tiny.maf',
                    :chunk_size => 2048)
  end

  # Close the parser's file handle after each example so descriptors
  # are not leaked across the suite.
  after(:each) do
    @p.f.close
  end

  it "yields the correct number of blocks over chunk boundaries" do
    ref_scores = %w(10542.0 -33148.0 87527.0 185399.0
                    30120.0 58255.0 2607.0 8132.0)
    scores = []
    @p.each_block do |block|
      scores << block.vars[:score]
    end
    scores.should == ref_scores
  end
  it "sets last_block_pos correctly" do
    @p.last_block_pos.should == 1103
  end
  it "handles sequence lines over chunk boundaries" do
    # Skip the first block; the second one is the block under test.
    @p.parse_block
    block = @p.parse_block
    break_seq = block.raw_seq(4)
    break_seq.text.size.should == 156
  end

  it "tracks block start offsets correctly over chunk boundaries" do
    pa = []
    @p.each_block { |b| pa << b.offset }
    pa.should == [16, 1103, 3011, 5038, 6685, 7514, 9022, 10113]
  end
end
349
+
350
+ end
351
+
352
+ end
353
+
354
+ end
@@ -0,0 +1,75 @@
1
+ require 'spec_helper'
2
+
3
+ module Bio
4
+ module MAF
5
+
6
# Specs for the format strings produced by Struct#fmt and
# Struct#extractor_fmt.
# NOTE(review): inside Bio::MAF the bare name `Struct` is looked up in
# the enclosing modules before ::Struct, so this presumably refers to
# Bio::MAF::Struct. Confirm against lib/bio/maf/struct.rb.
describe Struct do

  describe "#fmt" do
    it "presents all members in order" do
      s = Struct.new([[:a, :uint16],
                      [:b, :uint32],
                      [:c, :uint32],
                      [:d, :uint8]])
      s.fmt.should == "S>L>L>C"
    end
  end

  describe "#extractor_fmt" do
    it "handles uint8" do
      s = Struct.new([[:marker, :uint8]])
      s.extractor_fmt(:marker).should == "C"
    end
    it "handles uint16" do
      s = Struct.new([[:a, :uint16]])
      s.extractor_fmt(:a).should == "S>"
    end
    it "handles uint32" do
      s = Struct.new([[:a, :uint32]])
      s.extractor_fmt(:a).should == "L>"
    end
    it "handles uint64" do
      s = Struct.new([[:a, :uint64]])
      s.extractor_fmt(:a).should == "Q>"
    end

    # In the "skips" examples the expected "@N" prefix equals the byte
    # width of the leading dummy member.
    it "skips uint8" do
      s = Struct.new([[:dummy, :uint8],
                      [:a, :uint64]])
      s.extractor_fmt(:a).should == "@1Q>"
    end
    it "skips uint16" do
      s = Struct.new([[:dummy, :uint16],
                      [:a, :uint64]])
      s.extractor_fmt(:a).should == "@2Q>"
    end
    it "skips uint32" do
      s = Struct.new([[:dummy, :uint32],
                      [:a, :uint64]])
      s.extractor_fmt(:a).should == "@4Q>"
    end
    it "skips uint64" do
      s = Struct.new([[:dummy, :uint64],
                      [:a, :uint64]])
      s.extractor_fmt(:a).should == "@8Q>"
    end
    it "extracts multiple leading elements" do
      s = Struct.new([[:a, :uint16],
                      [:b, :uint32],
                      [:c, :uint32]])
      s.extractor_fmt(:a, :b).should == "S>L>"
    end
    it "extracts multiple offset elements" do
      s = Struct.new([[:a, :uint16],
                      [:b, :uint32],
                      [:c, :uint32],
                      [:d, :uint8]])
      s.extractor_fmt(:b, :c).should == "@2L>L>"
    end
  end

end
73
+
74
+ end
75
+ end
@@ -0,0 +1,14 @@
1
# Coverage is attempted only when not running on Travis and not on
# JRuby. A missing simplecov gem is reported on stderr and otherwise
# ignored.
if !ENV.key?('TRAVIS') && RUBY_PLATFORM != 'java'
  begin
    require 'simplecov'
  rescue LoadError => e
    $stderr.puts "WARNING: could not require 'simplecov': #{e}"
  end
end

require 'rspec'
require 'pathname'

require 'bio-maf'

# Fixture directory: two levels up from this file's own path, then
# test/data.
TestData = Pathname.new(__FILE__) + '../../test/data'