bio-maf 0.1.0-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/.simplecov +1 -0
  3. data/.travis.yml +16 -0
  4. data/.yardopts +3 -0
  5. data/DEVELOPMENT.md +40 -0
  6. data/Gemfile +23 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +209 -0
  9. data/Rakefile +76 -0
  10. data/VERSION +1 -0
  11. data/benchmarks/dispatch_bench +53 -0
  12. data/benchmarks/iter_bench +44 -0
  13. data/benchmarks/read_bench +40 -0
  14. data/benchmarks/sort_bench +33 -0
  15. data/benchmarks/split_bench +33 -0
  16. data/bin/maf_count +82 -0
  17. data/bin/maf_dump_blocks +27 -0
  18. data/bin/maf_extract_ranges_count +44 -0
  19. data/bin/maf_index +88 -0
  20. data/bin/maf_parse_bench +94 -0
  21. data/bin/maf_to_fasta +68 -0
  22. data/bin/maf_write +84 -0
  23. data/bin/random_ranges +35 -0
  24. data/features/maf-indexing.feature +31 -0
  25. data/features/maf-output.feature +29 -0
  26. data/features/maf-parsing.feature +44 -0
  27. data/features/maf-querying.feature +75 -0
  28. data/features/maf-to-fasta.feature +50 -0
  29. data/features/step_definitions/convert_steps.rb +45 -0
  30. data/features/step_definitions/index_steps.rb +20 -0
  31. data/features/step_definitions/output_steps.rb +27 -0
  32. data/features/step_definitions/parse_steps.rb +63 -0
  33. data/features/step_definitions/query_steps.rb +31 -0
  34. data/features/step_definitions/ucsc_bin_steps.rb +14 -0
  35. data/features/support/env.rb +16 -0
  36. data/features/ucsc-bins.feature +24 -0
  37. data/lib/bio-maf.rb +12 -0
  38. data/lib/bio-maf/maf.rb +3 -0
  39. data/lib/bio/maf.rb +4 -0
  40. data/lib/bio/maf/index.rb +620 -0
  41. data/lib/bio/maf/parser.rb +888 -0
  42. data/lib/bio/maf/struct.rb +63 -0
  43. data/lib/bio/maf/writer.rb +63 -0
  44. data/lib/bio/ucsc.rb +2 -0
  45. data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
  46. data/lib/bio/ucsc/ucsc_bin.rb +117 -0
  47. data/man/.gitignore +1 -0
  48. data/man/maf_index.1 +105 -0
  49. data/man/maf_index.1.markdown +97 -0
  50. data/man/maf_index.1.ronn +83 -0
  51. data/man/maf_to_fasta.1 +53 -0
  52. data/man/maf_to_fasta.1.ronn +51 -0
  53. data/spec/bio/maf/index_spec.rb +363 -0
  54. data/spec/bio/maf/parser_spec.rb +354 -0
  55. data/spec/bio/maf/struct_spec.rb +75 -0
  56. data/spec/spec_helper.rb +14 -0
  57. data/test/data/big-block.maf +15999 -0
  58. data/test/data/chr22_ieq.maf +11 -0
  59. data/test/data/chrY-1block.maf +6 -0
  60. data/test/data/empty +0 -0
  61. data/test/data/empty.db +0 -0
  62. data/test/data/mm8_chr7_tiny.kct +0 -0
  63. data/test/data/mm8_chr7_tiny.maf +76 -0
  64. data/test/data/mm8_mod_a.maf +7 -0
  65. data/test/data/mm8_single.maf +13 -0
  66. data/test/data/mm8_subset_a.maf +23 -0
  67. data/test/data/t1-bad1.maf +15 -0
  68. data/test/data/t1.fasta +12 -0
  69. data/test/data/t1.maf +15 -0
  70. data/test/data/t1a.maf +17 -0
  71. data/test/helper.rb +18 -0
  72. data/test/test_bio-maf.rb +7 -0
  73. data/travis-ci/install_kc +13 -0
  74. data/travis-ci/install_kc_java +13 -0
  75. data/travis-ci/report_errors +4 -0
  76. metadata +182 -0
@@ -0,0 +1,83 @@
1
+ maf_index(1) -- build and examine MAF indexes
2
+ =============================================
3
+
4
+ ## SYNOPSIS
5
+
6
+ `maf_index` [-t] [--time] <maf> <index><br>
7
+ `maf_index` `-d`|`--dump` <index>
8
+
9
+ ## DESCRIPTION
10
+
11
+ **maf_index** is part of the bioruby-maf library and creates
12
+ Kyoto Cabinet indexes for Multiple Alignment Format (MAF)
13
+ files. These indexes enable other MAF tools to selectively extract
14
+ alignment blocks of interest.
15
+
16
+ In its default mode, `maf_index` parses the <maf> file given as an
17
+ argument and creates an index in <index>.
18
+
19
+ The index data is stored in binary form, so with the `--dump`
20
+ argument, `maf_index` can dump out the index data in human-readable
21
+ form for debugging.
22
+
23
+ ## FILES
24
+
25
+ The <maf> input file must be a valid MAF file of any length.
26
+
27
+ The index created is a Kyoto Cabinet TreeDB (B+ tree) database;
28
+ <index> must have a `.kct` extension.
29
+
30
+ ## OPTIONS
31
+
32
+ TODO
33
+
34
+ * `-d`, `--dump`:
35
+ Instead of creating an index, dump out the given <index> in
36
+ human-readable form. Index records will appear like:
37
+
38
+ 0 [bin 1195] 80082334:80082368
39
+ offset 16, length 1087
40
+ text size: 54
41
+ sequences in block: 10
42
+ species vector: 00000000000003ff
43
+
44
+ * `-t`, `--threaded`:
45
+ Use a separate reader thread to do I/O in parallel with
46
+ parsing. Only useful on JRuby.
47
+
48
+ * `--time`:
49
+ Print elapsed time for index creation. Mainly useful for measuring
50
+ performance with different Ruby implementations, I/O subsystems,
51
+ etc.
52
+
53
+ ## EXAMPLES
54
+
55
+ Build an index on a MAF file:
56
+
57
+ $ maf_index chr22.maf chr22.kct
58
+
59
+ Dump out an index:
60
+
61
+ $ maf_index -d chr22.kct > /tmp/chr22.dump
62
+
63
+ ## ENVIRONMENT
64
+
65
+ `maf_index` is a Ruby program and relies on ordinary Ruby environment
66
+ variables.
67
+
68
+ ## BUGS
69
+
70
+ `maf_index` does not currently allow Kyoto Cabinet database parameters
71
+ to be set.
72
+
73
+ ## COPYRIGHT
74
+
75
+ `maf_index` is copyright (C) 2012 Clayton Wheeler.
76
+
77
+ ## SEE ALSO
78
+
79
+ ruby(1), kctreemgr(1)
80
+
81
+ * <https://github.com/csw/bioruby-maf/>
82
+ * <http://fallabs.com/kyotocabinet/>
83
+
@@ -0,0 +1,53 @@
1
+ .\" generated with Ronn/v0.7.3
2
+ .\" http://github.com/rtomayko/ronn/tree/0.7.3
3
+ .
4
+ .TH "MAF_TO_FASTA" "1" "June 2012" "Clayton Wheeler" "BioRuby Manual"
5
+ .
6
+ .SH "NAME"
7
+ \fBmaf_to_fasta\fR \- convert MAF file to FASTA
8
+ .
9
+ .SH "SYNOPSIS"
10
+ \fBmaf_to_fasta\fR [\fIoptions\fR\.\.\.] \fImaf\fR \fIfasta\fR
11
+ .
12
+ .SH "DESCRIPTION"
13
+ \fBmaf_to_fasta\fR, part of the bioruby\-maf library, converts Multiple Alignment Format (MAF) files to FASTA format\. It does not attempt to combine alignment blocks, but simply writes out each sequence in order\.
14
+ .
15
+ .SH "FILES"
16
+ The \fImaf\fR input file must be a valid MAF file of any length\.
17
+ .
18
+ .P
19
+ The \fIfasta\fR output file will be written in FASTA format\.
20
+ .
21
+ .SH "OPTIONS"
22
+ The options are only useful for performance measurement\.
23
+ .
24
+ .SH "EXAMPLES"
25
+ Convert a MAF file to FASTA:
26
+ .
27
+ .IP "" 4
28
+ .
29
+ .nf
30
+
31
+ $ maf_to_fasta chrY\.maf chrY\.fa
32
+ .
33
+ .fi
34
+ .
35
+ .IP "" 0
36
+ .
37
+ .SH "ENVIRONMENT"
38
+ \fBmaf_to_fasta\fR is a Ruby program and relies on ordinary Ruby environment variables\.
39
+ .
40
+ .SH "BUGS"
41
+ \fBmaf_to_fasta\fR should provide flexibility in selecting the alignment blocks and sequences to convert, and perhaps allow alignment blocks to be spliced together\.
42
+ .
43
+ .SH "COPYRIGHT"
44
+ \fBmaf_to_fasta\fR is copyright (C) 2012 Clayton Wheeler\.
45
+ .
46
+ .SH "SEE ALSO"
47
+ ruby(1)
48
+ .
49
+ .IP "\(bu" 4
50
+ \fIhttps://github\.com/csw/bioruby\-maf/\fR
51
+ .
52
+ .IP "" 0
53
+
@@ -0,0 +1,51 @@
1
+ maf_to_fasta(1) -- convert MAF file to FASTA
2
+ ============================================
3
+
4
+ ## SYNOPSIS
5
+
6
+ `maf_to_fasta` [<options>...] <maf> <fasta>
7
+
8
+ ## DESCRIPTION
9
+
10
+ **maf_to_fasta**, part of the bioruby-maf library, converts Multiple
11
+ Alignment Format (MAF) files to FASTA format. It does not attempt to
12
+ combine alignment blocks, but simply writes out each sequence in
13
+ order.
14
+
15
+ ## FILES
16
+
17
+ The <maf> input file must be a valid MAF file of any length.
18
+
19
+ The <fasta> output file will be written in FASTA format.
20
+
21
+ ## OPTIONS
22
+
23
+ The options are only useful for performance measurement.
24
+
25
+ ## EXAMPLES
26
+
27
+ Convert a MAF file to FASTA:
28
+
29
+ $ maf_to_fasta chrY.maf chrY.fa
30
+
31
+ ## ENVIRONMENT
32
+
33
+ `maf_to_fasta` is a Ruby program and relies on ordinary Ruby
34
+ environment variables.
35
+
36
+ ## BUGS
37
+
38
+ `maf_to_fasta` should provide flexibility in selecting the alignment
39
+ blocks and sequences to convert, and perhaps allow alignment blocks to
40
+ be spliced together.
41
+
42
+ ## COPYRIGHT
43
+
44
+ `maf_to_fasta` is copyright (C) 2012 Clayton Wheeler.
45
+
46
+ ## SEE ALSO
47
+
48
+ ruby(1)
49
+
50
+ * <https://github.com/csw/bioruby-maf/>
51
+
@@ -0,0 +1,363 @@
1
+ require 'spec_helper'
2
+
3
+ module Bio
4
+ module MAF
5
+
6
+ describe KyotoIndex do
7
# Checks whether the index database contains at least +n+ consecutive
# keys, starting from the first key at or after +start+, that begin
# with the prefix +start+.
#
# The key must be re-read on every iteration: reading it once before
# the loop would merely test the first key +n+ times.
#
# @param n [Integer] number of matching keys required
# @param start [String] key prefix to look for
# @return [Boolean] true if +n+ matching keys were found
def has_at_least_n_with_prefix(n, start)
  @idx.db.cursor_process do |cur|
    cur.jump(start)
    found = 0
    while found < n
      # get_key(true) returns the current key and steps the cursor forward.
      key = cur.get_key(true)
      break unless key && key.start_with?(start)
      found += 1
    end
    # Return from the method itself rather than relying on the value
    # cursor_process hands back.
    return found == n
  end
end
18
+
19
describe ".build" do
  it "accepts '%' as a path for an in-memory DB" do
    expect {
      @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
      @idx = KyotoIndex.build(@p, '%')
      @p.f.close
      @idx.close
    }.not_to raise_error
  end

  # Examples without a block are reported by RSpec as pending.
  it "accepts .kct paths"
  it "rejects other paths"

  context "mm8_chr7" do
    before(:each) do
      @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
      @idx = KyotoIndex.build(@p, '%')
    end

    # Close the parser's file handle as well as the database so no
    # descriptors are left open between examples.
    after(:each) do
      @idx.db.close
      @p.f.close
    end

    it "uses the first sequence appearing as the reference sequence" do
      @idx.index_sequences.to_a.should == [["mm8.chr7", 0]]
    end
    it "creates 8 index entries" do
      has_at_least_n_with_prefix(8, "\xFF\x00").should be_true
    end
    it "stores exactly one sequence ID" do
      @idx.db.match_prefix("sequence:").size.should == 1
    end
    it "maps the sequence name to its ID" do
      @idx.db.get("sequence:mm8.chr7").should == "0"
    end

    describe "loads sequence data correctly" do
      # Re-run the check against a reopened index instead of the
      # freshly built one.
      before(:each) { @idx = @idx.reopen }

      it "uses the first sequence appearing as the reference sequence" do
        @idx.index_sequences.to_a.should == [["mm8.chr7", 0]]
      end
    end
  end
end
58
+
59
describe ".open" do
  # Only close the database if the example got as far as opening it.
  after(:each) do
    @idx.db.close if @idx
  end

  it "opens an existing index successfully" do
    kct_path = TestData + 'mm8_chr7_tiny.kct'
    @idx = KyotoIndex.open(kct_path)
    @idx.db.count.should be > 8
  end

  it "populates #index_sequences" do
    kct_path = TestData + 'mm8_chr7_tiny.kct'
    @idx = KyotoIndex.open(kct_path)
    sequences = @idx.index_sequences
    sequences.size.should be > 0
    sequences['mm8.chr7'].should == 0
  end
end
73
+
74
describe "#find" do
  context "mm8_chr7" do
    before(:each) do
      @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
      @idx = KyotoIndex.build(@p, '%')
    end

    after(:each) do
      @idx.db.close
      @p.f.close
    end

    it "returns a block given a range contained in the block" do
      interval = GenomicInterval.zero_based('mm8.chr7', 80082334, 80082338)
      blocks = @idx.find([interval], @p).to_a
      blocks.size.should == 1
      blocks[0].offset.should == 16
    end
  end
end
96
+
97
describe "#fetch_list" do
  context "mm8_chr7" do
    before(:each) do
      @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
      @idx = KyotoIndex.build(@p, '%')
    end

    # Close both the database and the parser's file handle; guard each
    # in case the before hook failed part-way through.
    after(:each) do
      @idx.db.close if @idx
      @p.f.close if @p
    end

    # Runs #fetch_list for a single zero-based interval on mm8.chr7.
    # Per the assertions below, element [0] of each returned spec is
    # the block offset and element [1] is the block size.
    def fetch_range(i_start, i_end)
      @idx.fetch_list([GenomicInterval.zero_based('mm8.chr7', i_start, i_end)])
    end

    it "returns a block spec given a range contained in the block" do
      l = fetch_range(80082334, 80082338)
      l.size.should == 1
      l[0][0].should == 16 # block offset
    end
    it "returns a block spec with correct size" do
      l = fetch_range(80082334, 80082338)
      l.size.should == 1
      l[0][0].should == 16 # block offset
      l[0][1].should == 1087 # block size
    end
    it "returns a block spec given its range exactly" do
      l = fetch_range(80082334, 80082368)
      l.size.should == 1
      l[0][0].should == 16 # block offset
    end
    it "returns specs for adjoining blocks given a range partially in each" do
      l = fetch_range(80082360, 80082370)
      l.size.should == 2
      l.collect { |e| e[0] }.should == [16, 1103]
    end
    it "returns a block spec given a range ending in it" do
      l = fetch_range(80082330, 80082339)
      l.size.should == 1
      l[0][0].should == 16 # block offset
    end
    it "returns no block spec given a zero-based range ending at a block start" do
      l = fetch_range(80082330, 80082334)
      l.size.should == 0
    end
    it "returns a block spec given a range beginning in it" do
      l = fetch_range(80083009, 80083220)
      l.size.should == 1
      l[0][0].should == 10113 # block offset
    end
    it "returns no block spec given a range beginning at its end" do
      l = fetch_range(80083156, 80083200)
      l.size.should == 0
    end
    it "returns specs for all blocks given a range fitting a larger bin" do
      l = fetch_range(0, 80083200)
      l.size.should == 8
    end
    it "returns no blocks given a range outside" do
      l = fetch_range(80083200, 80083300)
      # Previously this example made no assertion and so could never fail.
      l.size.should == 0
    end
  end
end
176
+
177
describe "#overlaps?" do
  before(:each) { @idx = KyotoIndex.new('%') }
  after(:each) { @idx.db.close }

  # Builds the end-exclusive range x[0]...x[1] and asks the index
  # whether it overlaps the interval given as y[0], y[1].
  def check_overlap(x, y)
    x_range = Range.new(x[0], x[1], true)
    @idx.overlaps?(x_range, y[0], y[1])
  end

  it "handles equal intervals" do
    check_overlap([0, 10], [0, 10]).should be_true
  end

  it "handles X contains Y" do
    [[0, 9], [1, 9], [1, 10]].each do |inner|
      check_overlap([0, 10], inner).should be_true
    end
  end

  it "handles Y contains X" do
    [[0, 9], [1, 9], [1, 10]].each do |inner|
      check_overlap(inner, [0, 10]).should be_true
    end
  end

  it "handles partial overlap" do
    check_overlap([0, 9], [1, 10]).should be_true
    check_overlap([1, 10], [0, 9]).should be_true
  end

  it "handles end cases" do
    check_overlap([0, 10], [10, 15]).should be_false
    check_overlap([10, 15], [0, 10]).should be_false
  end

  it "handles separated intervals" do
    check_overlap([0, 10], [15, 20]).should be_false
    check_overlap([15, 20], [0, 10]).should be_false
  end
end
225
+
226
describe "#entries_for" do
  before(:each) do
    @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
    @block = @p.parse_block
    @idx = KyotoIndex.new('%')
  end

  after(:each) do
    @p.f.close
    @idx.db.close
  end

  context "single ref seq" do
    before(:each) do
      @idx.index_sequences = { 'mm8.chr7' => 0 }
      @e = @idx.entries_for(@block)
    end

    it "gives the correct key data" do
      # The first unpacked byte is not checked here.
      fields = @e.keys.first.unpack("CCS>L>L>")
      fields[1].should == 0        # seq
      fields[2].should == 1195     # bin
      fields[3].should == 80082334 # interval start
      fields[4].should == 80082368 # interval end
    end

    it "gives the correct offset" do
      value_fields = @e.values.first.unpack("Q>L>")
      value_fields[0].should == 16
    end

    it "gives the correct length" do
      value_fields = @e.values.first.unpack("Q>L>")
      value_fields[1].should == 1087
    end
  end
end
258
+
259
+ end
260
+
261
describe "#species" do
  before(:each) do
    @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
    @idx = KyotoIndex.build(@p, '%')
  end

  # Release the index and the parser's file handle after every example;
  # without this hook both stayed open for the rest of the run.
  after(:each) do
    @idx.close
    @p.f.close
  end

  # Expectations shared by the freshly built and the reopened index.
  shared_examples "species" do
    it "records the correct number of species" do
      @idx.species.size.should == 11
    end
    it "sets species_max_id correctly" do
      @idx.species_max_id.should == 10
    end
  end

  describe "after building index" do
    include_examples "species"
    it "records species in order" do
      @idx.db["species:mm8"].should == "0"
    end
  end

  describe "after loading index" do
    before(:each) { @idx = @idx.reopen }
    include_examples "species"
  end
end
285
+
286
describe "Filter classes" do
  before(:each) do
    @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
    @idx = KyotoIndex.build(@p, '%')
  end

  # Close the parser's file handle along with the index so nothing
  # stays open between examples.
  after(:each) do
    @idx.close
    @p.f.close
  end

  describe AllSpeciesFilter do
    # Builds a fake [key, value] index entry whose species vector has
    # one bit set for each named species, using the species IDs
    # recorded in @idx. All other value fields are zero.
    def fake_entry_with(species_l)
      ids = species_l.collect { |s| @idx.species.fetch(s) }
      vec = ids.collect { |id| 1 << id }.reduce(0, :|)
      ['', [0, 0, 0, 0, vec].pack(KyotoIndex::VAL_FMT)]
    end

    context "with an empty set" do
      before(:each) do
        @filter = AllSpeciesFilter.new([], @idx)
      end
      it "matches anything" do
        e = fake_entry_with(%w(mm8 rn4 oryCun1))
        @filter.match(e).should be_true
      end
    end

    context "with [mm8 rn4]" do
      before(:each) do
        @filter = AllSpeciesFilter.new(%w(mm8 rn4), @idx)
      end
      it "does not match an empty entry" do
        e = fake_entry_with(%w())
        KVHelpers.extract_species_vec(e).should == 0
        @filter.bs.should_not == 0
        @filter.match(e).should be_false
      end
      it "does not match an entry with mm8" do
        e = fake_entry_with(%w(mm8))
        @filter.match(e).should be_false
      end
      it "does not match an entry with mm8 oryCun1" do
        e = fake_entry_with(%w(mm8 oryCun1))
        @filter.match(e).should be_false
      end
      it "matches an entry with mm8 rn4" do
        e = fake_entry_with(%w(mm8 rn4))
        @filter.match(e).should be_true
      end
      # The title used to say "does not match" while the assertion
      # expected a match; an entry with extra species still contains
      # every required one.
      it "matches an entry with mm8 rn4 oryCun1" do
        e = fake_entry_with(%w(mm8 rn4 oryCun1))
        @filter.match(e).should be_true
      end
    end
  end # AllSpeciesFilter

  describe AtLeastNSequencesFilter do
    # Builds a fake [key, value] index entry whose fourth value field
    # is n; the example titles treat it as the sequence count.
    def fake_entry_with(n)
      ['', [0, 0, 0, n, 0].pack(KyotoIndex::VAL_FMT)]
    end

    context "n = 3" do
      before(:each) do
        @filter = AtLeastNSequencesFilter.new(3, @idx)
      end
      it "does not match 2 sequences" do
        e = fake_entry_with(2)
        @filter.match(e).should be_false
      end
      it "matches 3 sequences" do
        e = fake_entry_with(3)
        @filter.match(e).should be_true
      end
    end
  end # AtLeastNSequencesFilter
end # filter classes
360
+
361
+ end # module MAF
362
+
363
+ end # module Bio