bio-maf 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/.simplecov +1 -0
  3. data/.travis.yml +16 -0
  4. data/.yardopts +3 -0
  5. data/DEVELOPMENT.md +40 -0
  6. data/Gemfile +23 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +209 -0
  9. data/Rakefile +76 -0
  10. data/VERSION +1 -0
  11. data/benchmarks/dispatch_bench +53 -0
  12. data/benchmarks/iter_bench +44 -0
  13. data/benchmarks/read_bench +40 -0
  14. data/benchmarks/sort_bench +33 -0
  15. data/benchmarks/split_bench +33 -0
  16. data/bin/maf_count +82 -0
  17. data/bin/maf_dump_blocks +27 -0
  18. data/bin/maf_extract_ranges_count +44 -0
  19. data/bin/maf_index +88 -0
  20. data/bin/maf_parse_bench +94 -0
  21. data/bin/maf_to_fasta +68 -0
  22. data/bin/maf_write +84 -0
  23. data/bin/random_ranges +35 -0
  24. data/features/maf-indexing.feature +31 -0
  25. data/features/maf-output.feature +29 -0
  26. data/features/maf-parsing.feature +44 -0
  27. data/features/maf-querying.feature +75 -0
  28. data/features/maf-to-fasta.feature +50 -0
  29. data/features/step_definitions/convert_steps.rb +45 -0
  30. data/features/step_definitions/index_steps.rb +20 -0
  31. data/features/step_definitions/output_steps.rb +27 -0
  32. data/features/step_definitions/parse_steps.rb +63 -0
  33. data/features/step_definitions/query_steps.rb +31 -0
  34. data/features/step_definitions/ucsc_bin_steps.rb +14 -0
  35. data/features/support/env.rb +16 -0
  36. data/features/ucsc-bins.feature +24 -0
  37. data/lib/bio-maf.rb +12 -0
  38. data/lib/bio-maf/maf.rb +3 -0
  39. data/lib/bio/maf.rb +4 -0
  40. data/lib/bio/maf/index.rb +620 -0
  41. data/lib/bio/maf/parser.rb +888 -0
  42. data/lib/bio/maf/struct.rb +63 -0
  43. data/lib/bio/maf/writer.rb +63 -0
  44. data/lib/bio/ucsc.rb +2 -0
  45. data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
  46. data/lib/bio/ucsc/ucsc_bin.rb +117 -0
  47. data/man/.gitignore +1 -0
  48. data/man/maf_index.1 +105 -0
  49. data/man/maf_index.1.markdown +97 -0
  50. data/man/maf_index.1.ronn +83 -0
  51. data/man/maf_to_fasta.1 +53 -0
  52. data/man/maf_to_fasta.1.ronn +51 -0
  53. data/spec/bio/maf/index_spec.rb +363 -0
  54. data/spec/bio/maf/parser_spec.rb +354 -0
  55. data/spec/bio/maf/struct_spec.rb +75 -0
  56. data/spec/spec_helper.rb +14 -0
  57. data/test/data/big-block.maf +15999 -0
  58. data/test/data/chr22_ieq.maf +11 -0
  59. data/test/data/chrY-1block.maf +6 -0
  60. data/test/data/empty +0 -0
  61. data/test/data/empty.db +0 -0
  62. data/test/data/mm8_chr7_tiny.kct +0 -0
  63. data/test/data/mm8_chr7_tiny.maf +76 -0
  64. data/test/data/mm8_mod_a.maf +7 -0
  65. data/test/data/mm8_single.maf +13 -0
  66. data/test/data/mm8_subset_a.maf +23 -0
  67. data/test/data/t1-bad1.maf +15 -0
  68. data/test/data/t1.fasta +12 -0
  69. data/test/data/t1.maf +15 -0
  70. data/test/data/t1a.maf +17 -0
  71. data/test/helper.rb +18 -0
  72. data/test/test_bio-maf.rb +7 -0
  73. data/travis-ci/install_kc +13 -0
  74. data/travis-ci/install_kc_java +13 -0
  75. data/travis-ci/report_errors +4 -0
  76. metadata +182 -0
@@ -0,0 +1,83 @@
1
+ maf_index(1) -- build and examine MAF indexes
2
+ =============================================
3
+
4
+ ## SYNOPSIS
5
+
6
+ `maf_index` [-t] [--time] <maf> <index><br>
7
+ `maf_index` `-d`|`--dump` <index>
8
+
9
+ ## DESCRIPTION
10
+
11
+ **maf_index** is part of the bioruby-maf library and creates
12
+ Kyoto Cabinet indexes for Multiple Alignment Format (MAF)
13
+ files. These indexes enable other MAF tools to selectively extract
14
+ alignment blocks of interest.
15
+
16
+ In its default mode, `maf_index` parses the <maf> file given as an
17
+ argument and creates an index in <index>.
18
+
19
+ The index data is stored in binary form, so with the `--dump`
20
+ argument, `maf_index` can dump out the index data in human-readable
21
+ form for debugging.
22
+
23
+ ## FILES
24
+
25
+ The <maf> input file must be a valid MAF file of any length.
26
+
27
+ The index created is a Kyoto Cabinet TreeDB (B+ tree) database;
28
+ <index> must have a `.kct` extension.
29
+
30
+ ## OPTIONS
31
+
32
+ TODO
33
+
34
+ * `-d`, `--dump`:
35
+ Instead of creating an index, dump out the given <index> in
36
+ human-readable form. Index records will look like this:
37
+
38
+ 0 [bin 1195] 80082334:80082368
39
+ offset 16, length 1087
40
+ text size: 54
41
+ sequences in block: 10
42
+ species vector: 00000000000003ff
43
+
44
+ * `-t`, `--threaded`:
45
+ Use a separate reader thread to do I/O in parallel with
46
+ parsing. Only useful on JRuby.
47
+
48
+ * `--time`:
49
+ Print elapsed time for index creation. Mainly useful for measuring
50
+ performance with different Ruby implementations, I/O subsystems,
51
+ etc.
52
+
53
+ ## EXAMPLES
54
+
55
+ Build an index on a MAF file:
56
+
57
+ $ maf_index chr22.maf chr22.kct
58
+
59
+ Dump out an index:
60
+
61
+ $ maf_index -d chr22.kct > /tmp/chr22.dump
62
+
63
+ ## ENVIRONMENT
64
+
65
+ `maf_index` is a Ruby program and relies on ordinary Ruby environment
66
+ variables.
67
+
68
+ ## BUGS
69
+
70
+ `maf_index` does not currently allow Kyoto Cabinet database parameters
71
+ to be set.
72
+
73
+ ## COPYRIGHT
74
+
75
+ `maf_index` is copyright (C) 2012 Clayton Wheeler.
76
+
77
+ ## SEE ALSO
78
+
79
+ ruby(1), kctreemgr(1)
80
+
81
+ * <https://github.com/csw/bioruby-maf/>
82
+ * <http://fallabs.com/kyotocabinet/>
83
+
@@ -0,0 +1,53 @@
1
+ .\" generated with Ronn/v0.7.3
2
+ .\" http://github.com/rtomayko/ronn/tree/0.7.3
3
+ .
4
+ .TH "MAF_TO_FASTA" "1" "June 2012" "Clayton Wheeler" "BioRuby Manual"
5
+ .
6
+ .SH "NAME"
7
+ \fBmaf_to_fasta\fR \- convert MAF file to FASTA
8
+ .
9
+ .SH "SYNOPSIS"
10
+ \fBmaf_to_fasta\fR [\fIoptions\fR\.\.\.] \fImaf\fR \fIfasta\fR
11
+ .
12
+ .SH "DESCRIPTION"
13
+ \fBmaf_to_fasta\fR, part of the bioruby\-maf library, converts Multiple Alignment Format (MAF) files to FASTA format\. It does not attempt to combine alignment blocks, but simply writes out each sequence in order\.
14
+ .
15
+ .SH "FILES"
16
+ The \fImaf\fR input file must be a valid MAF file of any length\.
17
+ .
18
+ .P
19
+ The \fIfasta\fR output file will be written in FASTA format\.
20
+ .
21
+ .SH "OPTIONS"
22
+ The options are only useful for performance measurement\.
23
+ .
24
+ .SH "EXAMPLES"
25
+ Convert a MAF file to FASTA:
26
+ .
27
+ .IP "" 4
28
+ .
29
+ .nf
30
+
31
+ $ maf_to_fasta chrY\.maf chrY\.fa
32
+ .
33
+ .fi
34
+ .
35
+ .IP "" 0
36
+ .
37
+ .SH "ENVIRONMENT"
38
+ \fBmaf_to_fasta\fR is a Ruby program and relies on ordinary Ruby environment variables\.
39
+ .
40
+ .SH "BUGS"
41
+ \fBmaf_to_fasta\fR should provide flexibility in selecting the alignment blocks and sequences to convert, and perhaps allow alignment blocks to be spliced together\.
42
+ .
43
+ .SH "COPYRIGHT"
44
+ \fBmaf_to_fasta\fR is copyright (C) 2012 Clayton Wheeler\.
45
+ .
46
+ .SH "SEE ALSO"
47
+ ruby(1)
48
+ .
49
+ .IP "\(bu" 4
50
+ \fIhttps://github\.com/csw/bioruby\-maf/\fR
51
+ .
52
+ .IP "" 0
53
+
@@ -0,0 +1,51 @@
1
+ maf_to_fasta(1) -- convert MAF file to FASTA
2
+ ============================================
3
+
4
+ ## SYNOPSIS
5
+
6
+ `maf_to_fasta` [<options>...] <maf> <fasta>
7
+
8
+ ## DESCRIPTION
9
+
10
+ **maf_to_fasta**, part of the bioruby-maf library, converts Multiple
11
+ Alignment Format (MAF) files to FASTA format. It does not attempt to
12
+ combine alignment blocks, but simply writes out each sequence in
13
+ order.
14
+
15
+ ## FILES
16
+
17
+ The <maf> input file must be a valid MAF file of any length.
18
+
19
+ The <fasta> output file will be written in FASTA format.
20
+
21
+ ## OPTIONS
22
+
23
+ The options are only useful for performance measurement.
24
+
25
+ ## EXAMPLES
26
+
27
+ Convert a MAF file to FASTA:
28
+
29
+ $ maf_to_fasta chrY.maf chrY.fa
30
+
31
+ ## ENVIRONMENT
32
+
33
+ `maf_to_fasta` is a Ruby program and relies on ordinary Ruby
34
+ environment variables.
35
+
36
+ ## BUGS
37
+
38
+ `maf_to_fasta` should provide flexibility in selecting the alignment
39
+ blocks and sequences to convert, and perhaps allow alignment blocks to
40
+ be spliced together.
41
+
42
+ ## COPYRIGHT
43
+
44
+ `maf_to_fasta` is copyright (C) 2012 Clayton Wheeler.
45
+
46
+ ## SEE ALSO
47
+
48
+ ruby(1)
49
+
50
+ * <https://github.com/csw/bioruby-maf/>
51
+
@@ -0,0 +1,363 @@
1
+ require 'spec_helper'
2
+
3
+ module Bio
4
+ module MAF
5
+
6
+ describe KyotoIndex do
7
# Reports whether the index database holds at least +n+ consecutive
# records whose keys begin with +start+.
#
# The cursor is positioned with +jump(start)+ and then read one key at a
# time. Each key must be fetched inside the loop: reading it only once
# would make the loop count to +n+ on the strength of the first key alone.
#
# @param n [Integer] number of matching records required
# @param start [String] key prefix to look for
# @return [Boolean] true when +n+ matching keys were read in a row
def has_at_least_n_with_prefix(n, start)
  @idx.db.cursor_process do |cur|
    cur.jump(start)
    matched = 0
    while matched < n
      # get_key(true) is expected to return the current key and step the
      # cursor forward, yielding nil once the records are exhausted.
      key = cur.get_key(true)
      break unless key && key.start_with?(start)
      matched += 1
    end
    return matched == n
  end
end
18
+
19
+ describe ".build" do
20
+ it "accepts '%' as a path for an in-memory DB" do
21
+ expect {
22
+ @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
23
+ @idx = KyotoIndex.build(@p, '%')
24
+ @p.f.close
25
+ @idx.close
26
+ }.not_to raise_error
27
+ end
28
+ it "accepts .kct paths"
29
+ it "rejects other paths"
30
+ context "mm8_chr7" do
31
+ before(:each) do
32
+ @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
33
+ @idx = KyotoIndex.build(@p, '%')
34
+ end
35
+ it "uses the first sequence appearing as the reference sequence" do
36
+ @idx.index_sequences.to_a.should == [["mm8.chr7", 0]]
37
+ end
38
+ it "creates 8 index entries" do
39
+ has_at_least_n_with_prefix(8, "\xFF\x00").should be_true
40
+ end
41
+ it "stores the sequence IDs" do
42
+ @idx.db.match_prefix("sequence:").size.should == 1
43
+ end
44
# Checks the ID value stored for the reference sequence. The description
# is distinct from the preceding example, which only counts the
# "sequence:" records, so a failure report identifies which check broke.
it "maps the reference sequence to ID 0" do
  @idx.db.get("sequence:mm8.chr7").should == "0"
end
47
+ describe "loads sequence data correctly" do
48
+ before(:each) { @idx = @idx.reopen }
49
+ it "uses the first sequence appearing as the reference sequence" do
50
+ @idx.index_sequences.to_a.should == [["mm8.chr7", 0]]
51
+ end
52
+ end
53
+ after(:each) do
54
+ @idx.db.close
55
+ end
56
+ end
57
+ end
58
+
59
+ describe ".open" do
60
+ it "opens an existing index successfully" do
61
+ @idx = KyotoIndex.open(TestData + 'mm8_chr7_tiny.kct')
62
+ @idx.db.count.should be > 8
63
+ end
64
+ it "populates #index_sequences" do
65
+ @idx = KyotoIndex.open(TestData + 'mm8_chr7_tiny.kct')
66
+ @idx.index_sequences.size.should be > 0
67
+ @idx.index_sequences['mm8.chr7'].should == 0
68
+ end
69
+ after(:each) do
70
+ @idx.db.close if @idx
71
+ end
72
+ end
73
+
74
+ describe "#find" do
75
+ context "mm8_chr7" do
76
+ before(:each) do
77
+ @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
78
+ @idx = KyotoIndex.build(@p, '%')
79
+ end
80
+
81
+ it "returns a block given a range contained in the block" do
82
+ l = @idx.find([GenomicInterval.zero_based('mm8.chr7',
83
+ 80082334,
84
+ 80082338)],
85
+ @p).to_a
86
+ l.size.should == 1
87
+ l[0].offset.should == 16
88
+ end
89
+
90
+ after(:each) do
91
+ @idx.db.close
92
+ @p.f.close
93
+ end
94
+ end
95
+ end
96
+
97
+ describe "#fetch_list" do
98
+ context "mm8_chr7" do
99
+ before(:each) do
100
+ @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
101
+ @idx = KyotoIndex.build(@p, '%')
102
+ end
103
+ it "returns a block spec given a range contained in the block" do
104
+ l = @idx.fetch_list([GenomicInterval.zero_based('mm8.chr7',
105
+ 80082334,
106
+ 80082338)])
107
+ l.size.should == 1
108
+ l[0][0].should == 16 # block offset
109
+ end
110
+ it "returns a block spec with correct size" do
111
+ l = @idx.fetch_list([GenomicInterval.zero_based('mm8.chr7',
112
+ 80082334,
113
+ 80082338)])
114
+ l.size.should == 1
115
+ l[0][0].should == 16 # block offset
116
+ l[0][1].should == 1087 # block size
117
+ end
118
+ it "returns a block spec given its range exactly" do
119
+ l = @idx.fetch_list([GenomicInterval.zero_based('mm8.chr7',
120
+ 80082334,
121
+ 80082368)])
122
+ l.size.should == 1
123
+ l[0][0].should == 16 # block offset
124
+ end
125
+ it "returns specs for adjoining blocks given a range partially in each" do
126
+ l = @idx.fetch_list([GenomicInterval.zero_based('mm8.chr7',
127
+ 80082360,
128
+ 80082370)])
129
+ l.size.should == 2
130
+ l.collect { |e| e[0] }.should == [16, 1103]
131
+ end
132
+ it "returns a block spec given a range ending in it" do
133
+ l = @idx.fetch_list([GenomicInterval.zero_based('mm8.chr7',
134
+ 80082330,
135
+ 80082339)])
136
+ l.size.should == 1
137
+ l[0][0].should == 16 # block offset
138
+ end
139
+ it "returns no block spec given a zero-based range ending at a block start" do
140
+ l = @idx.fetch_list([GenomicInterval.zero_based('mm8.chr7',
141
+ 80082330,
142
+ 80082334)])
143
+ l.size.should == 0
144
+ end
145
+ it "returns a block spec given a range beginning in it" do
146
+ l = @idx.fetch_list([GenomicInterval.zero_based('mm8.chr7',
147
+ 80083009,
148
+ 80083220)])
149
+ l.size.should == 1
150
+ l[0][0].should == 10113 # block offset
151
+ end
152
+ it "returns no block spec given a range beginning at its end" do
153
+ l = @idx.fetch_list([GenomicInterval.zero_based('mm8.chr7',
154
+ 80083156,
155
+ 80083200)])
156
+ l.size.should == 0
157
+ end
158
+ it "returns specs for all blocks given a range fitting a larger bin" do
159
+ l = @idx.fetch_list([GenomicInterval.zero_based('mm8.chr7',
160
+ 0,
161
+ 80083200)])
162
+ l.size.should == 8
163
+ end
164
# Queries a range lying entirely past the indexed blocks and expects an
# empty fetch list.
it "returns no blocks given a range outside" do
  l = @idx.fetch_list([GenomicInterval.zero_based('mm8.chr7',
                                                  80083200,
                                                  80083300)])
  # Without an expectation this example would pass whatever was returned.
  l.size.should == 0
end
169
+ after(:each) do
170
+ if @idx
171
+ @idx.db.close
172
+ end
173
+ end
174
+ end
175
+ end
176
+
177
+ describe "#overlaps?" do
178
+ before(:each) do
179
+ @idx = KyotoIndex.new('%')
180
+ end
181
# Asks the index whether interval +x+ overlaps interval +y+.
#
# @param x [Array] two-element [start, end] pair, turned into an
#   end-exclusive Range before being handed to the index
# @param y [Array] two-element [start, end] pair, passed as separate
#   start and end arguments
# @return whatever +@idx.overlaps?+ returns
def check_overlap(x, y)
  span = Range.new(x[0], x[1], true)
  @idx.overlaps?(span, y[0], y[1])
end
185
+ it "handles equal intervals" do
186
+ check_overlap([0, 10],
187
+ [0, 10]).should be_true
188
+ end
189
+ it "handles X contains Y" do
190
+ check_overlap([0, 10],
191
+ [0, 9]).should be_true
192
+ check_overlap([0, 10],
193
+ [1, 9]).should be_true
194
+ check_overlap([0, 10],
195
+ [1, 10]).should be_true
196
+ end
197
+ it "handles Y contains X" do
198
+ check_overlap([0, 9],
199
+ [0, 10]).should be_true
200
+ check_overlap([1, 9],
201
+ [0, 10]).should be_true
202
+ check_overlap([1, 10],
203
+ [0, 10]).should be_true
204
+ end
205
+ it "handles partial overlap" do
206
+ check_overlap([0, 9],
207
+ [1, 10]).should be_true
208
+ check_overlap([1, 10],
209
+ [0, 9]).should be_true
210
+ end
211
+ it "handles end cases" do
212
+ check_overlap([0, 10],
213
+ [10, 15]).should be_false
214
+ check_overlap([10, 15],
215
+ [0, 10]).should be_false
216
+ end
217
+ it "handles separated intervals" do
218
+ check_overlap([0, 10], [15, 20]).should be_false
219
+ check_overlap([15, 20], [0, 10]).should be_false
220
+ end
221
+ after(:each) do
222
+ @idx.db.close
223
+ end
224
+ end
225
+
226
+ describe "#entries_for" do
227
+ before(:each) do
228
+ @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
229
+ @block = @p.parse_block
230
+ @idx = KyotoIndex.new('%')
231
+ end
232
+ context "single ref seq" do
233
+ before(:each) do
234
+ @idx.index_sequences = { 'mm8.chr7' => 0 }
235
+ @e = @idx.entries_for(@block)
236
+ end
237
+ it "gives the correct key data" do
238
+ _, seq, bin, i_start, i_end = @e.keys.first.unpack("CCS>L>L>")
239
+ seq.should == 0
240
+ bin.should == 1195
241
+ i_start.should == 80082334
242
+ i_end.should == 80082368
243
+ end
244
+ it "gives the correct offset" do
245
+ b_offset, b_len = @e.values.first.unpack("Q>L>")
246
+ b_offset.should == 16
247
+ end
248
+ it "gives the correct length" do
249
+ b_offset, b_len = @e.values.first.unpack("Q>L>")
250
+ b_len.should == 1087
251
+ end
252
+ end
253
+ after(:each) do
254
+ @p.f.close
255
+ @idx.db.close
256
+ end
257
+ end
258
+
259
+ end
260
+
261
+ describe "#species" do
262
+ before(:each) do
263
+ @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
264
+ @idx = KyotoIndex.build(@p, '%')
265
+ end
266
+ shared_examples "species" do
267
+ it "records the correct number of species" do
268
+ @idx.species.size.should == 11
269
+ end
270
+ it "sets species_max_id correctly" do
271
+ @idx.species_max_id.should == 10
272
+ end
273
+ end
274
+ describe "after building index" do
275
+ include_examples "species"
276
+ it "records species in order" do
277
+ @idx.db["species:mm8"].should == "0"
278
+ end
279
+ end
280
+ describe "after loading index" do
281
+ before(:each) { @idx = @idx.reopen }
282
+ include_examples "species"
283
+ end
284
+ end
285
+
286
+ describe "Filter classes" do
287
+ before(:each) do
288
+ @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
289
+ @idx = KyotoIndex.build(@p, '%')
290
+ end
291
+
292
+ describe AllSpeciesFilter do
293
# Builds a stand-in index entry whose species vector has one bit set for
# each named species.
#
# @param species_l [Array<String>] species names, looked up through
#   +@idx.species.fetch+ (an unknown name raises from +fetch+)
# @return [Array] a two-element [key, value] pair: an empty key and a
#   value packed with KyotoIndex::VAL_FMT, all fields zero except the
#   final one, which carries the species bit vector
def fake_entry_with(species_l)
  species_ids = species_l.map { |name| @idx.species.fetch(name) }
  species_vec = species_ids.inject(0) { |bits, id| bits | (1 << id) }
  packed = [0, 0, 0, 0, species_vec].pack(KyotoIndex::VAL_FMT)
  ['', packed]
end
298
+
299
+ context "with an empty set" do
300
+ before(:each) do
301
+ @filter = AllSpeciesFilter.new([], @idx)
302
+ end
303
+ it "matches anything" do
304
+ e = fake_entry_with(%w(mm8 rn4 oryCun1))
305
+ @filter.match(e).should be_true
306
+ end
307
+ end
308
+ context "with [mm8 rn4]" do
309
+ before(:each) do
310
+ @filter = AllSpeciesFilter.new(%w(mm8 rn4), @idx)
311
+ end
312
+ it "does not match an empty entry" do
313
+ e = fake_entry_with(%w())
314
+ KVHelpers.extract_species_vec(e).should == 0
315
+ @filter.bs.should_not == 0
316
+ @filter.match(e).should be_false
317
+ end
318
+ it "does not match an entry with mm8" do
319
+ e = fake_entry_with(%w(mm8))
320
+ @filter.match(e).should be_false
321
+ end
322
+ it "does not match an entry with mm8 oryCun1" do
323
+ e = fake_entry_with(%w(mm8 oryCun1))
324
+ @filter.match(e).should be_false
325
+ end
326
+ it "matches an entry with mm8 rn4" do
327
+ e = fake_entry_with(%w(mm8 rn4))
328
+ @filter.match(e).should be_true
329
+ end
330
# The expectation below is be_true: an entry carrying both filter species
# plus an additional one is expected to match. The description now says
# so instead of claiming the opposite.
it "matches an entry with mm8 rn4 oryCun1" do
  e = fake_entry_with(%w(mm8 rn4 oryCun1))
  @filter.match(e).should be_true
end
334
+ end
335
+ end # AllSpeciesFilter
336
+
337
+ describe AtLeastNSequencesFilter do
338
# Builds a stand-in index entry that reports +n+ in the fourth packed
# field, which the examples here use as the sequence count.
#
# @param n [Integer] value to store in the fourth field
# @return [Array] a two-element [key, value] pair: an empty key and a
#   value packed with KyotoIndex::VAL_FMT
def fake_entry_with(n)
  packed = [0, 0, 0, n, 0].pack(KyotoIndex::VAL_FMT)
  ['', packed]
end
341
+ context "n = 3" do
342
+ before(:each) do
343
+ @filter = AtLeastNSequencesFilter.new(3, @idx)
344
+ end
345
+ it "does not match 2 sequences" do
346
+ e = fake_entry_with(2)
347
+ @filter.match(e).should be_false
348
+ end
349
+ it "matches 3 sequences" do
350
+ e = fake_entry_with(3)
351
+ @filter.match(e).should be_true
352
+ end
353
+ end
354
+ end # AtLeastNSequencesFilter
355
+
356
+ after(:each) do
357
+ @idx.close
358
+ end
359
+ end # filter classes
360
+
361
+ end # module MAF
362
+
363
+ end # module Bio