bio-maf 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,104 @@
1
+ maf_tile(1) -- synthesize an alignment for a given region
2
+ =========================================================
3
+
4
+ ## SYNOPSIS
5
+
6
+ `maf_tile` [<options>] -i BEGIN:END [-s SPECIES[:NAME] ...] <maf> <index>
7
+
8
+ `maf_tile` [<options>] --bed BED -o BASE [-s SPECIES[:NAME] ...] <maf> <index>
9
+
10
+ ## DESCRIPTION
11
+
12
+ **maf_tile** takes a MAF file with index (generated by maf_index(1)),
13
+ extracts alignment blocks overlapping the given genomic interval, and
14
+ constructs a single alignment block covering the entire interval for
15
+ the specified species. Optionally, any gaps in coverage of the MAF
16
+ file's reference sequence can be filled in from a FASTA sequence file.
17
+
18
+ If a single interval is specified, the output will be written to
19
+ stdout in FASTA format. If the `--output-base` option is specified,
20
+ `_<start>:<end>.fa` will be appended to the given <base> parameter and
21
+ used to construct the output path. If a BED file is specified with
22
+ `--bed`, `--output-base` is also required.
23
+
24
+ Species can be renamed for output by specifying them as SPECIES:NAME;
25
+ the first component will be used to select the species from the MAF
26
+ file, and the second will be used in the FASTA description line for
27
+ output.
28
+
29
+ ## OPTIONS
30
+
31
+ * `-r`, `--reference SEQ`:
32
+ The given FASTA reference sequence file, which may be gzipped, will
32
+ be used to fill in any gaps between alignment blocks.
34
+
35
+ * `-i`, `--interval BEGIN:END`:
36
+ The given zero-based genomic interval will be used to select
37
+ alignment blocks from the MAF file.
38
+
39
+ * `-s`, `--species SPECIES[:NAME]`:
40
+ The given species will be selected for output. If given as
41
+ `species:name`, it will appear in the FASTA output as <name>.
42
+
43
+ * `-b`, `--bed BED`:
44
+ The given BED file will be used to provide a list of intervals to
45
+ process. If present, `--interval` will be ignored and
46
+ `--output-base` must be given as well.
47
+
48
+ * `-o`, `--output-base BASE`:
49
+ The given path will be used as the base name for output files, as
50
+ described above.
51
+
52
+ ## EXAMPLES
53
+
54
+ Generate an alignment of the `hg19`, `petMar1`, and `ornAna1`
55
+ sequences from `chrY.maf` over the interval 14400 to 15000 on the
56
+ reference sequence of the MAF file. Fills in gaps from
57
+ `chrY.refseq.fa.gz`. Writes FASTA output to stdout.
58
+
59
+ $ maf_tile --reference ~/maf/chrY.refseq.fa.gz \
60
+ --interval 14400:15000 \
61
+ -s hg19:human -s petMar1 -s ornAna1 \
62
+ chrY.maf chrY.kct
63
+ >human
64
+ GGGTGACGAAAAGAGCCGA-----[...]
65
+ >petMar1
66
+ gagtgccggggagtgccggggagt[...]
67
+ >ornAna1
68
+ AGGGATCTGGGAATTCTGG-----[...]
69
+
70
+ Write out a FASTA file for each interval in the given BED file,
71
+ prefixed with `/tmp/mm8`, and without filling in data from a reference
72
+ sequence:
73
+
74
+ $ maf_tile --bed /tmp/mm8.bed --output-base /tmp/mm8 \
75
+ -s mm8:mouse -s rn4:rat -s hg18:human \
76
+ mm8_chr7_tiny.maf mm8_chr7_tiny.kct
77
+
78
+ ## FILES
79
+
80
+ The output is generated in FASTA format, with one sequence per
81
+ species.
82
+
83
+ The input <maf> file must be a Multiple Alignment Format file.
84
+
85
+ The <index> must be a MAF index built with maf_index(1).
86
+
87
+ If `--bed` <bed> is specified, its argument must be a BED file. Only
88
+ the second and third columns will be used, to specify the zero-based
89
+ start and end positions of intervals.
90
+
91
+ ## ENVIRONMENT
92
+
93
+ `maf_tile` is a Ruby program and relies on ordinary Ruby environment
94
+ variables.
95
+
96
+ ## COPYRIGHT
97
+
98
+ `maf_tile` is copyright (C) 2012 Clayton Wheeler.
99
+
100
+ ## SEE ALSO
101
+
102
+ maf_index(1), ruby(1)
103
+
104
+ * <https://github.com/csw/bioruby-maf/>
@@ -228,6 +228,7 @@ module Bio
228
228
  @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
229
229
  @block = @p.parse_block
230
230
  @idx = KyotoIndex.new('%')
231
+ @idx.ref_seq = 'mm8.chr7'
231
232
  end
232
233
  context "single ref seq" do
233
234
  before(:each) do
@@ -25,6 +25,99 @@ module Bio
25
25
  it "provides arbitrary parameters"
26
26
  end
27
27
 
28
# Gap detection and removal on an alignment block, exercised against a
# block parsed from the mm8_chr7_tiny.maf fixture restricted to five species.
describe Block do
  # Parse a fresh block for every example: #remove_gaps! changes the
  # block it is called on (text_size goes from 54 to 40 below).
  before(:each) do
    parser = Parser.new(TestData + 'mm8_chr7_tiny.maf')
    parser.sequence_filter = {
      :only_species => %w(mm8 rn4 hg18 canFam2 loxAfr1)
    }
    @block = parser.parse_block
  end

  describe "#find_gaps" do
    it "finds a single 14-base gap" do
      found = @block.find_gaps
      found.size.should == 1
      # Second element is the gap length (14 bases, per the example
      # name); the first is presumably its offset in the block text.
      first_gap = found[0]
      first_gap[0].should == 34
      first_gap[1].should == 14
    end
  end

  describe "#remove_gaps!" do
    it "removes a single 14-base gap" do
      @block.sequences.size.should == 5
      @block.text_size.should == 54
      @block.remove_gaps!
      # 54 columns minus the 14-column gap.
      @block.text_size.should == 40
    end
  end
end
52
+
53
# Specs for Sequence objects built from MAF "s" lines through
# DummyParser#parse_seq_line. Every sample line starts at position 9077
# with a size of 8, so the ranges used below treat 9077...9085 as the
# sequence's extent.
describe Sequence do
  before(:each) do
    @parser = DummyParser.new
  end

  describe "#gapped?" do
    it "is false for sequences with no gaps" do
      line = "s human_unc 9077 8 + 10998 ACAGTATT"
      s = @parser.parse_seq_line(line, nil)
      s.gapped?.should be_false
    end
    it "is true for sequences with gaps" do
      line = "s human_unc 9077 8 + 10998 AC-AGTATT"
      s = @parser.parse_seq_line(line, nil)
      s.gapped?.should be_true
    end
  end

  # #text_range takes a range of sequence positions and is expected to
  # return the matching range of offsets into the (possibly gapped) text.
  describe "#text_range" do
    it "returns 0...text.size for a spanning interval" do
      line = "s human_unc 9077 8 + 10998 ACAGTATT"
      s = @parser.parse_seq_line(line, nil)
      range = s.text_range(9077...(9077 + 8))
      range.should == (0...(s.text.size))
    end
    it "returns 0...text.size for a gapped spanning interval" do
      line = "s human_unc 9077 8 + 10998 AC--AGTATT"
      s = @parser.parse_seq_line(line, nil)
      range = s.text_range(9077...(9077 + 8))
      range.should == (0...(s.text.size))
    end
    it "handles a leading subset" do
      line = "s human_unc 9077 8 + 10998 ACAGTATT"
      s = @parser.parse_seq_line(line, nil)
      range = s.text_range(9077...(9077 + 2))
      range.should == (0...2)
    end
    it "handles a trailing subset" do
      line = "s human_unc 9077 8 + 10998 ACAGTATT"
      s = @parser.parse_seq_line(line, nil)
      range = s.text_range(9079...9085)
      range.should == (2...8)
    end
    it "handles a gap in the middle" do
      line = "s human_unc 9077 8 + 10998 AC--AGTATT"
      s = @parser.parse_seq_line(line, nil)
      # Starts one base in, so the text range starts at offset 1 and
      # runs through the gap columns to the end of the text.
      range = s.text_range(9078...(9077 + 8))
      range.should == (1...(s.text.size))
    end
    it "errors on a range starting before" do
      expect {
        line = "s human_unc 9077 8 + 10998 ACAGTATT"
        s = @parser.parse_seq_line(line, nil)
        # 9076 is one position before the sequence start.
        s.text_range(9076...(9077 + 8))
      }.to raise_error
    end
    it "errors on a range ending after" do
      expect {
        line = "s human_unc 9077 8 + 10998 ACAGTATT"
        s = @parser.parse_seq_line(line, nil)
        # Start exactly at the sequence start so that only the end,
        # one position past the sequence, is out of bounds. A 9076
        # start would make this example pass on the start check alone.
        s.text_range(9077...(9077 + 9))
      }.to raise_error
    end
  end
end
120
+
28
121
  describe ParseContext do
29
122
  it "tracks the last block position"
30
123
  end
@@ -206,6 +299,16 @@ module Bio
206
299
  @p.sequence_filter = { :only_species => %w(mm8 hg18) }
207
300
  @p.parse_block.sequences.size.should == 2
208
301
  end
302
# With an :only_species filter of just mm8 and rn4, the parsed block is
# expected to report itself as filtered.
it "sets filtered? when modified" do
  @p.sequence_filter = { :only_species => %w(mm8 rn4) }
  parsed = @p.parse_block
  parsed.filtered?.should be_true
end
306
# A filter that (per the example name) leaves the block unmodified must
# not mark it as filtered.
it "does not set filtered? when unmodified" do
  # NOTE(review): "hg181" is unusual next to "hg18" -- presumably the
  # fixture really contains a species with that name; confirm.
  species = %w(mm8 rn4 oryCun1 hg18 hg181)
  @p.sequence_filter = { :only_species => species }
  @p.parse_block.filtered?.should be_false
end
209
312
  end
210
313
 
211
314
  context "at end of file" do
@@ -0,0 +1,69 @@
1
+ require 'spec_helper'
2
+
3
+ module Bio::MAF
4
+
5
# Specs for Tiler#runs, which yields [range, object] pairs; the examples
# expect one pair per maximal run of equal array elements.
describe Tiler do

  describe "#runs" do
    # Collects everything Tiler#runs yields for +items+ into an array.
    def runs_of(items)
      Tiler.new.enum_for(:runs, items).to_a
    end

    it "returns a uniform run properly" do
      items = Array.new(10, 'a')
      runs_of(items).should == [[0...10, 'a']]
    end

    it "yields a trailing item" do
      items = Array.new(10, 'a')
      items.fill('b', 8...10)
      runs_of(items).should == [[0...8, 'a'], [8...10, 'b']]
    end

    it "handles mixed contents" do
      # The fills do not overlap, so the runs must equal the fill spec.
      expected = [[0...2, 'a'],
                  [2...3, 'b'],
                  [3...4, 'c'],
                  [4...7, 'd']]
      items = Array.new(7, nil)
      expected.each do |span, value|
        items.fill(value, span)
      end
      runs_of(items).should == expected
    end

    it "handles overwrites" do
      # Later fills overwrite earlier ones: a a b c d d d
      fills = [[0...7, 'a'],
               [2...5, 'b'],
               [3...4, 'c'],
               [4...7, 'd']]
      items = Array.new(7, nil)
      fills.each do |span, value|
        items.fill(value, span)
      end
      runs_of(items).should == [[0...2, 'a'],
                                [2...3, 'b'],
                                [3...4, 'c'],
                                [4...7, 'd']]
    end
  end

end
45
+
46
# Specs for FASTARangeReader, checked against the gap-sp1.fa fixture.
describe FASTARangeReader do
  # NOTE(review): the group is labelled "#read" but every example goes
  # through #read_interval -- confirm which name is intended.
  describe "#read" do
    before(:each) do
      # Path is relative to the working directory, so these examples
      # presumably need to be run from the project root.
      @reader = FASTARangeReader.new('test/data/gap-sp1.fa')
      # Expected sequence against which each read is compared.
      @sequence = 'CCAGGATGCTGGGCTGAGGGCAGTTGTGTCAGGGCGGTCCGGTGCAGGCA'
    end

    # Asserts that reading the zero-based, end-exclusive interval
    # z_start...z_end returns the same slice of the expected sequence.
    def check_range(z_start, z_end)
      expected = @sequence.slice(z_start...z_end)
      @reader.read_interval(z_start, z_end).should == expected
    end

    it "returns the entire sequence" do
      check_range(0, 50)
    end

    it "returns an entire line" do
      check_range(10, 20)
    end

    it "returns arbitrary components" do
      check_range(17, 41)
    end
  end
end
68
+
69
+ end
@@ -0,0 +1,6 @@
1
+ > sp1.chr1
2
+ CCAGGATGCT
3
+ GGGCTGAGGG
4
+ CAGTTGTGTC
5
+ AGGGCGGTCC
6
+ GGTGCAGGCA
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-maf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -62,11 +62,13 @@ dependencies:
62
62
  description: Multiple Alignment Format parser for BioRuby.
63
63
  email: cswh@umich.edu
64
64
  executables:
65
+ - find_overlaps
65
66
  - maf_count
66
67
  - maf_dump_blocks
67
68
  - maf_extract_ranges_count
68
69
  - maf_index
69
70
  - maf_parse_bench
71
+ - maf_tile
70
72
  - maf_to_fasta
71
73
  - maf_write
72
74
  - random_ranges
@@ -76,6 +78,7 @@ extra_rdoc_files:
76
78
  - README.md
77
79
  files:
78
80
  - .document
81
+ - .gitignore
79
82
  - .simplecov
80
83
  - .travis.yml
81
84
  - .yardopts
@@ -90,20 +93,27 @@ files:
90
93
  - benchmarks/read_bench
91
94
  - benchmarks/sort_bench
92
95
  - benchmarks/split_bench
96
+ - bin/find_overlaps
93
97
  - bin/maf_count
94
98
  - bin/maf_dump_blocks
95
99
  - bin/maf_extract_ranges_count
96
100
  - bin/maf_index
97
101
  - bin/maf_parse_bench
102
+ - bin/maf_tile
98
103
  - bin/maf_to_fasta
99
104
  - bin/maf_write
100
105
  - bin/random_ranges
106
+ - bio-maf.gemspec
107
+ - features/gap-filling.feature
108
+ - features/gap-removal.feature
101
109
  - features/maf-indexing.feature
102
110
  - features/maf-output.feature
103
111
  - features/maf-parsing.feature
104
112
  - features/maf-querying.feature
105
113
  - features/maf-to-fasta.feature
106
114
  - features/step_definitions/convert_steps.rb
115
+ - features/step_definitions/gap-filling_steps.rb
116
+ - features/step_definitions/gap_removal_steps.rb
107
117
  - features/step_definitions/index_steps.rb
108
118
  - features/step_definitions/output_steps.rb
109
119
  - features/step_definitions/parse_steps.rb
@@ -115,8 +125,10 @@ files:
115
125
  - lib/bio-maf/maf.rb
116
126
  - lib/bio/maf.rb
117
127
  - lib/bio/maf/index.rb
128
+ - lib/bio/maf/maf.rb
118
129
  - lib/bio/maf/parser.rb
119
130
  - lib/bio/maf/struct.rb
131
+ - lib/bio/maf/tiler.rb
120
132
  - lib/bio/maf/writer.rb
121
133
  - lib/bio/ucsc.rb
122
134
  - lib/bio/ucsc/genomic-interval-bin.rb
@@ -125,17 +137,21 @@ files:
125
137
  - man/maf_index.1
126
138
  - man/maf_index.1.markdown
127
139
  - man/maf_index.1.ronn
140
+ - man/maf_tile.1
141
+ - man/maf_tile.1.ronn
128
142
  - man/maf_to_fasta.1
129
143
  - man/maf_to_fasta.1.ronn
130
144
  - spec/bio/maf/index_spec.rb
131
145
  - spec/bio/maf/parser_spec.rb
132
146
  - spec/bio/maf/struct_spec.rb
147
+ - spec/bio/maf/tiler_spec.rb
133
148
  - spec/spec_helper.rb
134
149
  - test/data/big-block.maf
135
150
  - test/data/chr22_ieq.maf
136
151
  - test/data/chrY-1block.maf
137
152
  - test/data/empty
138
153
  - test/data/empty.db
154
+ - test/data/gap-sp1.fa
139
155
  - test/data/mm8_chr7_tiny.kct
140
156
  - test/data/mm8_chr7_tiny.maf
141
157
  - test/data/mm8_mod_a.maf
@@ -165,7 +181,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
165
181
  version: '0'
166
182
  segments:
167
183
  - 0
168
- hash: -2212073295752934712
184
+ hash: 882426960615932212
169
185
  required_rubygems_version: !ruby/object:Gem::Requirement
170
186
  none: false
171
187
  requirements:
@@ -178,4 +194,43 @@ rubygems_version: 1.8.24
178
194
  signing_key:
179
195
  specification_version: 3
180
196
  summary: MAF parser for BioRuby
181
- test_files: []
197
+ test_files:
198
+ - features/gap-filling.feature
199
+ - features/gap-removal.feature
200
+ - features/maf-indexing.feature
201
+ - features/maf-output.feature
202
+ - features/maf-parsing.feature
203
+ - features/maf-querying.feature
204
+ - features/maf-to-fasta.feature
205
+ - features/step_definitions/convert_steps.rb
206
+ - features/step_definitions/gap-filling_steps.rb
207
+ - features/step_definitions/gap_removal_steps.rb
208
+ - features/step_definitions/index_steps.rb
209
+ - features/step_definitions/output_steps.rb
210
+ - features/step_definitions/parse_steps.rb
211
+ - features/step_definitions/query_steps.rb
212
+ - features/step_definitions/ucsc_bin_steps.rb
213
+ - features/support/env.rb
214
+ - features/ucsc-bins.feature
215
+ - spec/bio/maf/index_spec.rb
216
+ - spec/bio/maf/parser_spec.rb
217
+ - spec/bio/maf/struct_spec.rb
218
+ - spec/bio/maf/tiler_spec.rb
219
+ - spec/spec_helper.rb
220
+ - test/data/big-block.maf
221
+ - test/data/chr22_ieq.maf
222
+ - test/data/chrY-1block.maf
223
+ - test/data/empty
224
+ - test/data/empty.db
225
+ - test/data/gap-sp1.fa
226
+ - test/data/mm8_chr7_tiny.kct
227
+ - test/data/mm8_chr7_tiny.maf
228
+ - test/data/mm8_mod_a.maf
229
+ - test/data/mm8_single.maf
230
+ - test/data/mm8_subset_a.maf
231
+ - test/data/t1-bad1.maf
232
+ - test/data/t1.fasta
233
+ - test/data/t1.maf
234
+ - test/data/t1a.maf
235
+ - test/helper.rb
236
+ - test/test_bio-maf.rb