bio-maf 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +53 -0
- data/DEVELOPMENT.md +29 -0
- data/Gemfile +1 -0
- data/README.md +69 -1
- data/Rakefile +4 -3
- data/bin/find_overlaps +21 -0
- data/bin/maf_tile +103 -0
- data/bio-maf.gemspec +43 -0
- data/features/gap-filling.feature +158 -0
- data/features/gap-removal.feature +50 -0
- data/features/step_definitions/gap-filling_steps.rb +32 -0
- data/features/step_definitions/gap_removal_steps.rb +19 -0
- data/features/step_definitions/parse_steps.rb +2 -1
- data/lib/bio/maf/index.rb +15 -8
- data/lib/bio/maf/maf.rb +267 -0
- data/lib/bio/maf/parser.rb +115 -175
- data/lib/bio/maf/tiler.rb +167 -0
- data/lib/bio/maf.rb +2 -0
- data/man/maf_tile.1 +108 -0
- data/man/maf_tile.1.ronn +104 -0
- data/spec/bio/maf/index_spec.rb +1 -0
- data/spec/bio/maf/parser_spec.rb +103 -0
- data/spec/bio/maf/tiler_spec.rb +69 -0
- data/test/data/gap-sp1.fa +6 -0
- data/test/data/mm8_chr7_tiny.kct +0 -0
- metadata +58 -3
data/man/maf_tile.1.ronn
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
maf_tile(1) -- synthesize an alignment for a given region
|
2
|
+
=========================================================
|
3
|
+
|
4
|
+
## SYNOPSIS
|
5
|
+
|
6
|
+
`maf_tile` [<options>] -i BEGIN:END [-s SPECIES[:NAME] ...] <maf> <index>
|
7
|
+
|
8
|
+
`maf_tile` [<options>] --bed BED -o BASE [-s SPECIES[:NAME] ...] <maf> <index>
|
9
|
+
|
10
|
+
## DESCRIPTION
|
11
|
+
|
12
|
+
**maf_tile** takes a MAF file with index (generated by maf_index(1)),
|
13
|
+
extracts alignment blocks overlapping the given genomic interval, and
|
14
|
+
constructs a single alignment block covering the entire interval for
|
15
|
+
the specified species. Optionally, any gaps in coverage of the MAF
|
16
|
+
file's reference sequence can be filled in from a FASTA sequence file.
|
17
|
+
|
18
|
+
If a single interval is specified, the output will be written to
|
19
|
+
stdout in FASTA format. If the `--output-base` option is specified,
|
20
|
+
`_<start>:<end>.fa` will be appended to the given <base> parameter and
|
21
|
+
used to construct the output path. If a BED file is specified with
|
22
|
+
`--bed`, `--output-base` is also required.
|
23
|
+
|
24
|
+
Species can be renamed for output by specifying them as SPECIES:NAME;
|
25
|
+
the first component will be used to select the species from the MAF
|
26
|
+
file, and the second will be used in the FASTA description line for
|
27
|
+
output.
|
28
|
+
|
29
|
+
## OPTIONS
|
30
|
+
|
31
|
+
* `-r`, `--reference SEQ`:
|
32
|
+
The given FASTA reference sequence file, which may be gzipped, will
|
33
|
+
be used to fill in any gaps between alignment blocks.
|
34
|
+
|
35
|
+
* `-i`, `--interval BEGIN:END`:
|
36
|
+
The given zero-based genomic interval will be used to select
|
37
|
+
alignment blocks from the MAF file.
|
38
|
+
|
39
|
+
* `-s`, `--species SPECIES[:NAME]`:
|
40
|
+
The given species will be selected for output. If given as
|
41
|
+
`species:name`, it will appear in the FASTA output as <name>.
|
42
|
+
|
43
|
+
* `-b`, `--bed BED`:
|
44
|
+
The given BED file will be used to provide a list of intervals to
|
45
|
+
process. If present, `--interval` will be ignored and
|
46
|
+
`--output-base` must be given as well.
|
47
|
+
|
48
|
+
* `-o`, `--output-base BASE`:
|
49
|
+
The given path will be used as the base name for output files, as
|
50
|
+
described above.
|
51
|
+
|
52
|
+
## EXAMPLES
|
53
|
+
|
54
|
+
Generate an alignment of the `hg19`, `petMar1`, and `ornAna1`
|
55
|
+
sequences from `chrY.maf` over the interval 14400 to 15000 on the
|
56
|
+
reference sequence of the MAF file. Fills in gaps from
|
57
|
+
`chrY.refseq.fa.gz`. Writes FASTA output to stdout.
|
58
|
+
|
59
|
+
$ maf_tile --reference ~/maf/chrY.refseq.fa.gz \
|
60
|
+
--interval 14400:15000 \
|
61
|
+
-s hg19:human -s petMar1 -s ornAna1 \
|
62
|
+
chrY.maf chrY.kct
|
63
|
+
>human
|
64
|
+
GGGTGACGAAAAGAGCCGA-----[...]
|
65
|
+
>petMar1
|
66
|
+
gagtgccggggagtgccggggagt[...]
|
67
|
+
>ornAna1
|
68
|
+
AGGGATCTGGGAATTCTGG-----[...]
|
69
|
+
|
70
|
+
Write out a FASTA file for each interval in the given BED file,
|
71
|
+
prefixed with `/tmp/mm8`, and without filling in data from a reference
|
72
|
+
sequence:
|
73
|
+
|
74
|
+
$ maf_tile --bed /tmp/mm8.bed --output-base /tmp/mm8 \
|
75
|
+
-s mm8:mouse -s rn4:rat -s hg18:human \
|
76
|
+
mm8_chr7_tiny.maf mm8_chr7_tiny.kct
|
77
|
+
|
78
|
+
## FILES
|
79
|
+
|
80
|
+
The output is generated in FASTA format, with one sequence per
|
81
|
+
species.
|
82
|
+
|
83
|
+
The input <maf> file must be a Multiple Alignment Format file.
|
84
|
+
|
85
|
+
The <index> must be a MAF index built with maf_index(1).
|
86
|
+
|
87
|
+
If `--bed` <bed> is specified, its argument must be a BED file. Only
|
88
|
+
the second and third columns will be used, to specify the zero-based
|
89
|
+
start and end positions of intervals.
|
90
|
+
|
91
|
+
## ENVIRONMENT
|
92
|
+
|
93
|
+
`maf_tile` is a Ruby program and relies on ordinary Ruby environment
|
94
|
+
variables.
|
95
|
+
|
96
|
+
## COPYRIGHT
|
97
|
+
|
98
|
+
`maf_tile` is copyright (C) 2012 Clayton Wheeler.
|
99
|
+
|
100
|
+
## SEE ALSO
|
101
|
+
|
102
|
+
maf_index(1), ruby(1)
|
103
|
+
|
104
|
+
* <https://github.com/csw/bioruby-maf/>
|
data/spec/bio/maf/index_spec.rb
CHANGED
data/spec/bio/maf/parser_spec.rb
CHANGED
@@ -25,6 +25,99 @@ module Bio
|
|
25
25
|
it "provides arbitrary parameters"
|
26
26
|
end
|
27
27
|
|
28
|
+
describe Block do
|
29
|
+
describe "#find_gaps" do
|
30
|
+
it "finds a single 14-base gap" do
|
31
|
+
p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
|
32
|
+
p.sequence_filter = { :only_species => %w(mm8 rn4 hg18 canFam2 loxAfr1) }
|
33
|
+
block = p.parse_block
|
34
|
+
gaps = block.find_gaps
|
35
|
+
gaps.size.should == 1
|
36
|
+
gaps[0][0].should == 34
|
37
|
+
gaps[0][1].should == 14
|
38
|
+
end
|
39
|
+
end
|
40
|
+
describe "#remove_gaps!" do
|
41
|
+
it "removes a single 14-base gap" do
|
42
|
+
p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
|
43
|
+
p.sequence_filter = { :only_species => %w(mm8 rn4 hg18 canFam2 loxAfr1) }
|
44
|
+
block = p.parse_block
|
45
|
+
block.sequences.size.should == 5
|
46
|
+
block.text_size.should == 54
|
47
|
+
block.remove_gaps!
|
48
|
+
block.text_size.should == 40
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
describe Sequence do
|
54
|
+
before(:each) do
|
55
|
+
@parser = DummyParser.new
|
56
|
+
end
|
57
|
+
|
58
|
+
describe "#gapped?" do
|
59
|
+
it "is false for sequences with no gaps" do
|
60
|
+
line = "s human_unc 9077 8 + 10998 ACAGTATT"
|
61
|
+
s = @parser.parse_seq_line(line, nil)
|
62
|
+
s.gapped?.should be_false
|
63
|
+
end
|
64
|
+
it "is true for sequences with gaps" do
|
65
|
+
line = "s human_unc 9077 8 + 10998 AC-AGTATT"
|
66
|
+
s = @parser.parse_seq_line(line, nil)
|
67
|
+
s.gapped?.should be_true
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
describe "#text_range" do
|
72
|
+
it "returns 0...text.size for a spanning interval" do
|
73
|
+
line = "s human_unc 9077 8 + 10998 ACAGTATT"
|
74
|
+
s = @parser.parse_seq_line(line, nil)
|
75
|
+
range = s.text_range(9077...(9077 + 8))
|
76
|
+
range.should == (0...(s.text.size))
|
77
|
+
end
|
78
|
+
it "returns 0...text.size for a gapped spanning interval" do
|
79
|
+
line = "s human_unc 9077 8 + 10998 AC--AGTATT"
|
80
|
+
s = @parser.parse_seq_line(line, nil)
|
81
|
+
range = s.text_range(9077...(9077 + 8))
|
82
|
+
range.should == (0...(s.text.size))
|
83
|
+
end
|
84
|
+
it "handles a leading subset" do
|
85
|
+
line = "s human_unc 9077 8 + 10998 ACAGTATT"
|
86
|
+
s = @parser.parse_seq_line(line, nil)
|
87
|
+
range = s.text_range(9077...(9077 + 2))
|
88
|
+
range.should == (0...2)
|
89
|
+
end
|
90
|
+
it "handles a trailing subset" do
|
91
|
+
line = "s human_unc 9077 8 + 10998 ACAGTATT"
|
92
|
+
s = @parser.parse_seq_line(line, nil)
|
93
|
+
range = s.text_range(9079...9085)
|
94
|
+
range.should == (2...8)
|
95
|
+
end
|
96
|
+
it "handles a gap in the middle" do
|
97
|
+
line = "s human_unc 9077 8 + 10998 AC--AGTATT"
|
98
|
+
s = @parser.parse_seq_line(line, nil)
|
99
|
+
range = s.text_range(9078...(9077 + 8))
|
100
|
+
range.should == (1...(s.text.size))
|
101
|
+
end
|
102
|
+
it "errors on a range starting before" do
|
103
|
+
expect {
|
104
|
+
line = "s human_unc 9077 8 + 10998 ACAGTATT"
|
105
|
+
s = @parser.parse_seq_line(line, nil)
|
106
|
+
range = s.text_range(9076...(9077 + 8))
|
107
|
+
}.to raise_error
|
108
|
+
end
|
109
|
+
it "errors on a range ending after" do
|
110
|
+
expect {
|
111
|
+
line = "s human_unc 9077 8 + 10998 ACAGTATT"
|
112
|
+
s = @parser.parse_seq_line(line, nil)
|
113
|
+
range = s.text_range(9076...(9077 + 9))
|
114
|
+
}.to raise_error
|
115
|
+
end
|
116
|
+
|
117
|
+
end
|
118
|
+
|
119
|
+
end
|
120
|
+
|
28
121
|
describe ParseContext do
|
29
122
|
it "tracks the last block position"
|
30
123
|
end
|
@@ -206,6 +299,16 @@ module Bio
|
|
206
299
|
@p.sequence_filter = { :only_species => %w(mm8 hg18) }
|
207
300
|
@p.parse_block.sequences.size.should == 2
|
208
301
|
end
|
302
|
+
it "sets filtered? when modified" do
|
303
|
+
@p.sequence_filter = { :only_species => %w(mm8 rn4) }
|
304
|
+
@p.parse_block.filtered?.should be_true
|
305
|
+
end
|
306
|
+
it "does not set filtered? when unmodified" do
|
307
|
+
@p.sequence_filter = {
|
308
|
+
:only_species => %w(mm8 rn4 oryCun1 hg18 hg181)
|
309
|
+
}
|
310
|
+
@p.parse_block.filtered?.should be_false
|
311
|
+
end
|
209
312
|
end
|
210
313
|
|
211
314
|
context "at end of file" do
|
data/spec/bio/maf/tiler_spec.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module Bio::MAF
|
4
|
+
|
5
|
+
describe Tiler do
|
6
|
+
|
7
|
+
describe "#runs" do
|
8
|
+
it "returns a uniform run properly" do
|
9
|
+
a = Array.new(10, 'a')
|
10
|
+
runs = Tiler.new.enum_for(:runs, a).to_a
|
11
|
+
runs.should == [[0...10, 'a']]
|
12
|
+
end
|
13
|
+
it "yields a trailing item" do
|
14
|
+
a = Array.new(10, 'a')
|
15
|
+
a.fill('b', 8...10)
|
16
|
+
runs = Tiler.new.enum_for(:runs, a).to_a
|
17
|
+
runs.should == [[0...8, 'a'], [8...10, 'b']]
|
18
|
+
end
|
19
|
+
it "handles mixed contents" do
|
20
|
+
spec = [[0...2, 'a'],
|
21
|
+
[2...3, 'b'],
|
22
|
+
[3...4, 'c'],
|
23
|
+
[4...7, 'd']]
|
24
|
+
a = Array.new(7, nil)
|
25
|
+
spec.each { |range, obj| a.fill(obj, range) }
|
26
|
+
runs = Tiler.new.enum_for(:runs, a).to_a
|
27
|
+
runs.should == spec
|
28
|
+
end
|
29
|
+
it "handles overwrites" do
|
30
|
+
spec = [[0...7, 'a'],
|
31
|
+
[2...5, 'b'],
|
32
|
+
[3...4, 'c'],
|
33
|
+
[4...7, 'd']]
|
34
|
+
a = Array.new(7, nil)
|
35
|
+
spec.each { |range, obj| a.fill(obj, range) }
|
36
|
+
runs = Tiler.new.enum_for(:runs, a).to_a
|
37
|
+
runs.should == [[0...2, 'a'],
|
38
|
+
[2...3, 'b'],
|
39
|
+
[3...4, 'c'],
|
40
|
+
[4...7, 'd']]
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
end
|
45
|
+
|
46
|
+
describe FASTARangeReader do
|
47
|
+
describe "#read" do
|
48
|
+
before(:each) do
|
49
|
+
@r = FASTARangeReader.new('test/data/gap-sp1.fa')
|
50
|
+
@s = 'CCAGGATGCTGGGCTGAGGGCAGTTGTGTCAGGGCGGTCCGGTGCAGGCA'
|
51
|
+
end
|
52
|
+
|
53
|
+
def check_range(z_start, z_end)
|
54
|
+
@r.read_interval(z_start, z_end).should == @s.slice(z_start...z_end)
|
55
|
+
end
|
56
|
+
|
57
|
+
it "returns the entire sequence" do
|
58
|
+
check_range(0, 50)
|
59
|
+
end
|
60
|
+
it "returns an entire line" do
|
61
|
+
check_range(10, 20)
|
62
|
+
end
|
63
|
+
it "returns arbitrary components" do
|
64
|
+
check_range(17, 41)
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
end
|
data/test/data/mm8_chr7_tiny.kct
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-maf
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -62,11 +62,13 @@ dependencies:
|
|
62
62
|
description: Multiple Alignment Format parser for BioRuby.
|
63
63
|
email: cswh@umich.edu
|
64
64
|
executables:
|
65
|
+
- find_overlaps
|
65
66
|
- maf_count
|
66
67
|
- maf_dump_blocks
|
67
68
|
- maf_extract_ranges_count
|
68
69
|
- maf_index
|
69
70
|
- maf_parse_bench
|
71
|
+
- maf_tile
|
70
72
|
- maf_to_fasta
|
71
73
|
- maf_write
|
72
74
|
- random_ranges
|
@@ -76,6 +78,7 @@ extra_rdoc_files:
|
|
76
78
|
- README.md
|
77
79
|
files:
|
78
80
|
- .document
|
81
|
+
- .gitignore
|
79
82
|
- .simplecov
|
80
83
|
- .travis.yml
|
81
84
|
- .yardopts
|
@@ -90,20 +93,27 @@ files:
|
|
90
93
|
- benchmarks/read_bench
|
91
94
|
- benchmarks/sort_bench
|
92
95
|
- benchmarks/split_bench
|
96
|
+
- bin/find_overlaps
|
93
97
|
- bin/maf_count
|
94
98
|
- bin/maf_dump_blocks
|
95
99
|
- bin/maf_extract_ranges_count
|
96
100
|
- bin/maf_index
|
97
101
|
- bin/maf_parse_bench
|
102
|
+
- bin/maf_tile
|
98
103
|
- bin/maf_to_fasta
|
99
104
|
- bin/maf_write
|
100
105
|
- bin/random_ranges
|
106
|
+
- bio-maf.gemspec
|
107
|
+
- features/gap-filling.feature
|
108
|
+
- features/gap-removal.feature
|
101
109
|
- features/maf-indexing.feature
|
102
110
|
- features/maf-output.feature
|
103
111
|
- features/maf-parsing.feature
|
104
112
|
- features/maf-querying.feature
|
105
113
|
- features/maf-to-fasta.feature
|
106
114
|
- features/step_definitions/convert_steps.rb
|
115
|
+
- features/step_definitions/gap-filling_steps.rb
|
116
|
+
- features/step_definitions/gap_removal_steps.rb
|
107
117
|
- features/step_definitions/index_steps.rb
|
108
118
|
- features/step_definitions/output_steps.rb
|
109
119
|
- features/step_definitions/parse_steps.rb
|
@@ -115,8 +125,10 @@ files:
|
|
115
125
|
- lib/bio-maf/maf.rb
|
116
126
|
- lib/bio/maf.rb
|
117
127
|
- lib/bio/maf/index.rb
|
128
|
+
- lib/bio/maf/maf.rb
|
118
129
|
- lib/bio/maf/parser.rb
|
119
130
|
- lib/bio/maf/struct.rb
|
131
|
+
- lib/bio/maf/tiler.rb
|
120
132
|
- lib/bio/maf/writer.rb
|
121
133
|
- lib/bio/ucsc.rb
|
122
134
|
- lib/bio/ucsc/genomic-interval-bin.rb
|
@@ -125,17 +137,21 @@ files:
|
|
125
137
|
- man/maf_index.1
|
126
138
|
- man/maf_index.1.markdown
|
127
139
|
- man/maf_index.1.ronn
|
140
|
+
- man/maf_tile.1
|
141
|
+
- man/maf_tile.1.ronn
|
128
142
|
- man/maf_to_fasta.1
|
129
143
|
- man/maf_to_fasta.1.ronn
|
130
144
|
- spec/bio/maf/index_spec.rb
|
131
145
|
- spec/bio/maf/parser_spec.rb
|
132
146
|
- spec/bio/maf/struct_spec.rb
|
147
|
+
- spec/bio/maf/tiler_spec.rb
|
133
148
|
- spec/spec_helper.rb
|
134
149
|
- test/data/big-block.maf
|
135
150
|
- test/data/chr22_ieq.maf
|
136
151
|
- test/data/chrY-1block.maf
|
137
152
|
- test/data/empty
|
138
153
|
- test/data/empty.db
|
154
|
+
- test/data/gap-sp1.fa
|
139
155
|
- test/data/mm8_chr7_tiny.kct
|
140
156
|
- test/data/mm8_chr7_tiny.maf
|
141
157
|
- test/data/mm8_mod_a.maf
|
@@ -165,7 +181,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
165
181
|
version: '0'
|
166
182
|
segments:
|
167
183
|
- 0
|
168
|
-
hash:
|
184
|
+
hash: 882426960615932212
|
169
185
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
170
186
|
none: false
|
171
187
|
requirements:
|
@@ -178,4 +194,43 @@ rubygems_version: 1.8.24
|
|
178
194
|
signing_key:
|
179
195
|
specification_version: 3
|
180
196
|
summary: MAF parser for BioRuby
|
181
|
-
test_files:
|
197
|
+
test_files:
|
198
|
+
- features/gap-filling.feature
|
199
|
+
- features/gap-removal.feature
|
200
|
+
- features/maf-indexing.feature
|
201
|
+
- features/maf-output.feature
|
202
|
+
- features/maf-parsing.feature
|
203
|
+
- features/maf-querying.feature
|
204
|
+
- features/maf-to-fasta.feature
|
205
|
+
- features/step_definitions/convert_steps.rb
|
206
|
+
- features/step_definitions/gap-filling_steps.rb
|
207
|
+
- features/step_definitions/gap_removal_steps.rb
|
208
|
+
- features/step_definitions/index_steps.rb
|
209
|
+
- features/step_definitions/output_steps.rb
|
210
|
+
- features/step_definitions/parse_steps.rb
|
211
|
+
- features/step_definitions/query_steps.rb
|
212
|
+
- features/step_definitions/ucsc_bin_steps.rb
|
213
|
+
- features/support/env.rb
|
214
|
+
- features/ucsc-bins.feature
|
215
|
+
- spec/bio/maf/index_spec.rb
|
216
|
+
- spec/bio/maf/parser_spec.rb
|
217
|
+
- spec/bio/maf/struct_spec.rb
|
218
|
+
- spec/bio/maf/tiler_spec.rb
|
219
|
+
- spec/spec_helper.rb
|
220
|
+
- test/data/big-block.maf
|
221
|
+
- test/data/chr22_ieq.maf
|
222
|
+
- test/data/chrY-1block.maf
|
223
|
+
- test/data/empty
|
224
|
+
- test/data/empty.db
|
225
|
+
- test/data/gap-sp1.fa
|
226
|
+
- test/data/mm8_chr7_tiny.kct
|
227
|
+
- test/data/mm8_chr7_tiny.maf
|
228
|
+
- test/data/mm8_mod_a.maf
|
229
|
+
- test/data/mm8_single.maf
|
230
|
+
- test/data/mm8_subset_a.maf
|
231
|
+
- test/data/t1-bad1.maf
|
232
|
+
- test/data/t1.fasta
|
233
|
+
- test/data/t1.maf
|
234
|
+
- test/data/t1a.maf
|
235
|
+
- test/helper.rb
|
236
|
+
- test/test_bio-maf.rb
|