bio-maf 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +53 -0
- data/DEVELOPMENT.md +29 -0
- data/Gemfile +1 -0
- data/README.md +69 -1
- data/Rakefile +4 -3
- data/bin/find_overlaps +21 -0
- data/bin/maf_tile +103 -0
- data/bio-maf.gemspec +43 -0
- data/features/gap-filling.feature +158 -0
- data/features/gap-removal.feature +50 -0
- data/features/step_definitions/gap-filling_steps.rb +32 -0
- data/features/step_definitions/gap_removal_steps.rb +19 -0
- data/features/step_definitions/parse_steps.rb +2 -1
- data/lib/bio/maf/index.rb +15 -8
- data/lib/bio/maf/maf.rb +267 -0
- data/lib/bio/maf/parser.rb +115 -175
- data/lib/bio/maf/tiler.rb +167 -0
- data/lib/bio/maf.rb +2 -0
- data/man/maf_tile.1 +108 -0
- data/man/maf_tile.1.ronn +104 -0
- data/spec/bio/maf/index_spec.rb +1 -0
- data/spec/bio/maf/parser_spec.rb +103 -0
- data/spec/bio/maf/tiler_spec.rb +69 -0
- data/test/data/gap-sp1.fa +6 -0
- data/test/data/mm8_chr7_tiny.kct +0 -0
- metadata +58 -3
data/man/maf_tile.1.ronn
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
maf_tile(1) -- synthesize an alignment for a given region
|
2
|
+
=========================================================
|
3
|
+
|
4
|
+
## SYNOPSIS
|
5
|
+
|
6
|
+
`maf_tile` [<options>] -i BEGIN:END [-s SPECIES[:NAME] ...] <maf> <index>
|
7
|
+
|
8
|
+
`maf_tile` [<options>] --bed BED -o BASE [-s SPECIES[:NAME] ...] <maf> <index>
|
9
|
+
|
10
|
+
## DESCRIPTION
|
11
|
+
|
12
|
+
**maf_tile** takes a MAF file with index (generated by maf_index(1)),
|
13
|
+
extracts alignment blocks overlapping the given genomic interval, and
|
14
|
+
constructs a single alignment block covering the entire interval for
|
15
|
+
the specified species. Optionally, any gaps in coverage of the MAF
|
16
|
+
file's reference sequence can be filled in from a FASTA sequence file.
|
17
|
+
|
18
|
+
If a single interval is specified, the output will be written to
|
19
|
+
stdout in FASTA format. If the `--output-base` option is specified,
|
20
|
+
`_<start>:<end>.fa` will be appended to the given <base> parameter and
|
21
|
+
used to construct the output path. If a BED file is specified with
|
22
|
+
`--bed`, `--output-base` is also required.
|
23
|
+
|
24
|
+
Species can be renamed for output by specifying them as SPECIES:NAME;
|
25
|
+
the first component will be used to select the species from the MAF
|
26
|
+
file, and the second will be used in the FASTA description line for
|
27
|
+
output.
|
28
|
+
|
29
|
+
## OPTIONS
|
30
|
+
|
31
|
+
* `-r`, `--reference SEQ`:
|
32
|
+
The FASTA reference sequence file given, which may be gzipped, will
|
33
|
+
be used to fill in any gaps between alignment blocks.
|
34
|
+
|
35
|
+
* `-i`, `--interval BEGIN:END`:
|
36
|
+
The given zero-based genomic interval will be used to select
|
37
|
+
alignment blocks from the MAF file.
|
38
|
+
|
39
|
+
* `-s`, `--species SPECIES[:NAME]`:
|
40
|
+
The given species will be selected for output. If given as
|
41
|
+
`species:name`, it will appear in the FASTA output as <name>.
|
42
|
+
|
43
|
+
* `-b`, `--bed BED`:
|
44
|
+
The given BED file will be used to provide a list of intervals to
|
45
|
+
process. If present, `--interval` will be ignored and
|
46
|
+
`--output-base` must be given as well.
|
47
|
+
|
48
|
+
* `-o`, `--output-base BASE`:
|
49
|
+
The given path will be used as the base name for output files, as
|
50
|
+
described above.
|
51
|
+
|
52
|
+
## EXAMPLES
|
53
|
+
|
54
|
+
Generate an alignment of the `hg19`, `petMar1`, and `ornAna1`
|
55
|
+
sequences from `chrY.maf` over the interval 14400 to 15000 on the
|
56
|
+
reference sequence of the MAF file. Fills in gaps from
|
57
|
+
`chrY.refseq.fa.gz`. Writes FASTA output to stdout.
|
58
|
+
|
59
|
+
$ maf_tile --reference ~/maf/chrY.refseq.fa.gz \
|
60
|
+
--interval 14400:15000 \
|
61
|
+
-s hg19:human -s petMar1 -s ornAna1 \
|
62
|
+
chrY.maf chrY.kct
|
63
|
+
>human
|
64
|
+
GGGTGACGAAAAGAGCCGA-----[...]
|
65
|
+
>petMar1
|
66
|
+
gagtgccggggagtgccggggagt[...]
|
67
|
+
>ornAna1
|
68
|
+
AGGGATCTGGGAATTCTGG-----[...]
|
69
|
+
|
70
|
+
Write out a FASTA file for each interval in the given BED file,
|
71
|
+
prefixed with `/tmp/mm8`, and without filling in data from a reference
|
72
|
+
sequence:
|
73
|
+
|
74
|
+
$ maf_tile --bed /tmp/mm8.bed --output-base /tmp/mm8 \
|
75
|
+
-s mm8:mouse -s rn4:rat -s hg18:human \
|
76
|
+
mm8_chr7_tiny.maf mm8_chr7_tiny.kct
|
77
|
+
|
78
|
+
## FILES
|
79
|
+
|
80
|
+
The output is generated in FASTA format, with one sequence per
|
81
|
+
species.
|
82
|
+
|
83
|
+
The input <maf> file must be a Multiple Alignment Format file.
|
84
|
+
|
85
|
+
The <index> must be a MAF index built with maf_index(1).
|
86
|
+
|
87
|
+
If `--bed` <bed> is specified, its argument must be a BED file. Only
|
88
|
+
the second and third columns will be used, to specify the zero-based
|
89
|
+
start and end positions of intervals.
|
90
|
+
|
91
|
+
## ENVIRONMENT
|
92
|
+
|
93
|
+
`maf_tile` is a Ruby program and relies on ordinary Ruby environment
|
94
|
+
variables.
|
95
|
+
|
96
|
+
## COPYRIGHT
|
97
|
+
|
98
|
+
`maf_tile` is copyright (C) 2012 Clayton Wheeler.
|
99
|
+
|
100
|
+
## SEE ALSO
|
101
|
+
|
102
|
+
maf_index(1), ruby(1)
|
103
|
+
|
104
|
+
* <https://github.com/csw/bioruby-maf/>
|
data/spec/bio/maf/index_spec.rb
CHANGED
data/spec/bio/maf/parser_spec.rb
CHANGED
@@ -25,6 +25,99 @@ module Bio
|
|
25
25
|
it "provides arbitrary parameters"
|
26
26
|
end
|
27
27
|
|
28
|
+
describe Block do
|
29
|
+
describe "#find_gaps" do
|
30
|
+
it "finds a single 14-base gap" do
|
31
|
+
p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
|
32
|
+
p.sequence_filter = { :only_species => %w(mm8 rn4 hg18 canFam2 loxAfr1) }
|
33
|
+
block = p.parse_block
|
34
|
+
gaps = block.find_gaps
|
35
|
+
gaps.size.should == 1
|
36
|
+
gaps[0][0].should == 34
|
37
|
+
gaps[0][1].should == 14
|
38
|
+
end
|
39
|
+
end
|
40
|
+
describe "#remove_gaps!" do
|
41
|
+
it "removes a single 14-base gap" do
|
42
|
+
p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
|
43
|
+
p.sequence_filter = { :only_species => %w(mm8 rn4 hg18 canFam2 loxAfr1) }
|
44
|
+
block = p.parse_block
|
45
|
+
block.sequences.size.should == 5
|
46
|
+
block.text_size.should == 54
|
47
|
+
block.remove_gaps!
|
48
|
+
block.text_size.should == 40
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
describe Sequence do
|
54
|
+
before(:each) do
|
55
|
+
@parser = DummyParser.new
|
56
|
+
end
|
57
|
+
|
58
|
+
describe "#gapped?" do
|
59
|
+
it "is false for sequences with no gaps" do
|
60
|
+
line = "s human_unc 9077 8 + 10998 ACAGTATT"
|
61
|
+
s = @parser.parse_seq_line(line, nil)
|
62
|
+
s.gapped?.should be_false
|
63
|
+
end
|
64
|
+
it "is true for sequences with gaps" do
|
65
|
+
line = "s human_unc 9077 8 + 10998 AC-AGTATT"
|
66
|
+
s = @parser.parse_seq_line(line, nil)
|
67
|
+
s.gapped?.should be_true
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
describe "#text_range" do
|
72
|
+
it "returns 0...text.size for a spanning interval" do
|
73
|
+
line = "s human_unc 9077 8 + 10998 ACAGTATT"
|
74
|
+
s = @parser.parse_seq_line(line, nil)
|
75
|
+
range = s.text_range(9077...(9077 + 8))
|
76
|
+
range.should == (0...(s.text.size))
|
77
|
+
end
|
78
|
+
it "returns 0...text.size for a gapped spanning interval" do
|
79
|
+
line = "s human_unc 9077 8 + 10998 AC--AGTATT"
|
80
|
+
s = @parser.parse_seq_line(line, nil)
|
81
|
+
range = s.text_range(9077...(9077 + 8))
|
82
|
+
range.should == (0...(s.text.size))
|
83
|
+
end
|
84
|
+
it "handles a leading subset" do
|
85
|
+
line = "s human_unc 9077 8 + 10998 ACAGTATT"
|
86
|
+
s = @parser.parse_seq_line(line, nil)
|
87
|
+
range = s.text_range(9077...(9077 + 2))
|
88
|
+
range.should == (0...2)
|
89
|
+
end
|
90
|
+
it "handles a trailing subset" do
|
91
|
+
line = "s human_unc 9077 8 + 10998 ACAGTATT"
|
92
|
+
s = @parser.parse_seq_line(line, nil)
|
93
|
+
range = s.text_range(9079...9085)
|
94
|
+
range.should == (2...8)
|
95
|
+
end
|
96
|
+
it "handles a gap in the middle" do
|
97
|
+
line = "s human_unc 9077 8 + 10998 AC--AGTATT"
|
98
|
+
s = @parser.parse_seq_line(line, nil)
|
99
|
+
range = s.text_range(9078...(9077 + 8))
|
100
|
+
range.should == (1...(s.text.size))
|
101
|
+
end
|
102
|
+
it "errors on a range starting before" do
|
103
|
+
expect {
|
104
|
+
line = "s human_unc 9077 8 + 10998 ACAGTATT"
|
105
|
+
s = @parser.parse_seq_line(line, nil)
|
106
|
+
range = s.text_range(9076...(9077 + 8))
|
107
|
+
}.to raise_error
|
108
|
+
end
|
109
|
+
it "errors on a range ending after" do
|
110
|
+
expect {
|
111
|
+
line = "s human_unc 9077 8 + 10998 ACAGTATT"
|
112
|
+
s = @parser.parse_seq_line(line, nil)
|
113
|
+
range = s.text_range(9077...(9077 + 9))
|
114
|
+
}.to raise_error
|
115
|
+
end
|
116
|
+
|
117
|
+
end
|
118
|
+
|
119
|
+
end
|
120
|
+
|
28
121
|
describe ParseContext do
|
29
122
|
it "tracks the last block position"
|
30
123
|
end
|
@@ -206,6 +299,16 @@ module Bio
|
|
206
299
|
@p.sequence_filter = { :only_species => %w(mm8 hg18) }
|
207
300
|
@p.parse_block.sequences.size.should == 2
|
208
301
|
end
|
302
|
+
it "sets filtered? when modified" do
|
303
|
+
@p.sequence_filter = { :only_species => %w(mm8 rn4) }
|
304
|
+
@p.parse_block.filtered?.should be_true
|
305
|
+
end
|
306
|
+
it "does not set filtered? when unmodified" do
|
307
|
+
@p.sequence_filter = {
|
308
|
+
:only_species => %w(mm8 rn4 oryCun1 hg18 hg181)
|
309
|
+
}
|
310
|
+
@p.parse_block.filtered?.should be_false
|
311
|
+
end
|
209
312
|
end
|
210
313
|
|
211
314
|
context "at end of file" do
|
data/spec/bio/maf/tiler_spec.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module Bio::MAF
|
4
|
+
|
5
|
+
describe Tiler do
|
6
|
+
|
7
|
+
describe "#runs" do
|
8
|
+
it "returns a uniform run properly" do
|
9
|
+
a = Array.new(10, 'a')
|
10
|
+
runs = Tiler.new.enum_for(:runs, a).to_a
|
11
|
+
runs.should == [[0...10, 'a']]
|
12
|
+
end
|
13
|
+
it "yields a trailing item" do
|
14
|
+
a = Array.new(10, 'a')
|
15
|
+
a.fill('b', 8...10)
|
16
|
+
runs = Tiler.new.enum_for(:runs, a).to_a
|
17
|
+
runs.should == [[0...8, 'a'], [8...10, 'b']]
|
18
|
+
end
|
19
|
+
it "handles mixed contents" do
|
20
|
+
spec = [[0...2, 'a'],
|
21
|
+
[2...3, 'b'],
|
22
|
+
[3...4, 'c'],
|
23
|
+
[4...7, 'd']]
|
24
|
+
a = Array.new(7, nil)
|
25
|
+
spec.each { |range, obj| a.fill(obj, range) }
|
26
|
+
runs = Tiler.new.enum_for(:runs, a).to_a
|
27
|
+
runs.should == spec
|
28
|
+
end
|
29
|
+
it "handles overwrites" do
|
30
|
+
spec = [[0...7, 'a'],
|
31
|
+
[2...5, 'b'],
|
32
|
+
[3...4, 'c'],
|
33
|
+
[4...7, 'd']]
|
34
|
+
a = Array.new(7, nil)
|
35
|
+
spec.each { |range, obj| a.fill(obj, range) }
|
36
|
+
runs = Tiler.new.enum_for(:runs, a).to_a
|
37
|
+
runs.should == [[0...2, 'a'],
|
38
|
+
[2...3, 'b'],
|
39
|
+
[3...4, 'c'],
|
40
|
+
[4...7, 'd']]
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
end
|
45
|
+
|
46
|
+
describe FASTARangeReader do
|
47
|
+
describe "#read_interval" do
|
48
|
+
before(:each) do
|
49
|
+
@r = FASTARangeReader.new('test/data/gap-sp1.fa')
|
50
|
+
@s = 'CCAGGATGCTGGGCTGAGGGCAGTTGTGTCAGGGCGGTCCGGTGCAGGCA'
|
51
|
+
end
|
52
|
+
|
53
|
+
def check_range(z_start, z_end)
|
54
|
+
@r.read_interval(z_start, z_end).should == @s.slice(z_start...z_end)
|
55
|
+
end
|
56
|
+
|
57
|
+
it "returns the entire sequence" do
|
58
|
+
check_range(0, 50)
|
59
|
+
end
|
60
|
+
it "returns an entire line" do
|
61
|
+
check_range(10, 20)
|
62
|
+
end
|
63
|
+
it "returns arbitrary components" do
|
64
|
+
check_range(17, 41)
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
end
|
data/test/data/mm8_chr7_tiny.kct
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-maf
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -62,11 +62,13 @@ dependencies:
|
|
62
62
|
description: Multiple Alignment Format parser for BioRuby.
|
63
63
|
email: cswh@umich.edu
|
64
64
|
executables:
|
65
|
+
- find_overlaps
|
65
66
|
- maf_count
|
66
67
|
- maf_dump_blocks
|
67
68
|
- maf_extract_ranges_count
|
68
69
|
- maf_index
|
69
70
|
- maf_parse_bench
|
71
|
+
- maf_tile
|
70
72
|
- maf_to_fasta
|
71
73
|
- maf_write
|
72
74
|
- random_ranges
|
@@ -76,6 +78,7 @@ extra_rdoc_files:
|
|
76
78
|
- README.md
|
77
79
|
files:
|
78
80
|
- .document
|
81
|
+
- .gitignore
|
79
82
|
- .simplecov
|
80
83
|
- .travis.yml
|
81
84
|
- .yardopts
|
@@ -90,20 +93,27 @@ files:
|
|
90
93
|
- benchmarks/read_bench
|
91
94
|
- benchmarks/sort_bench
|
92
95
|
- benchmarks/split_bench
|
96
|
+
- bin/find_overlaps
|
93
97
|
- bin/maf_count
|
94
98
|
- bin/maf_dump_blocks
|
95
99
|
- bin/maf_extract_ranges_count
|
96
100
|
- bin/maf_index
|
97
101
|
- bin/maf_parse_bench
|
102
|
+
- bin/maf_tile
|
98
103
|
- bin/maf_to_fasta
|
99
104
|
- bin/maf_write
|
100
105
|
- bin/random_ranges
|
106
|
+
- bio-maf.gemspec
|
107
|
+
- features/gap-filling.feature
|
108
|
+
- features/gap-removal.feature
|
101
109
|
- features/maf-indexing.feature
|
102
110
|
- features/maf-output.feature
|
103
111
|
- features/maf-parsing.feature
|
104
112
|
- features/maf-querying.feature
|
105
113
|
- features/maf-to-fasta.feature
|
106
114
|
- features/step_definitions/convert_steps.rb
|
115
|
+
- features/step_definitions/gap-filling_steps.rb
|
116
|
+
- features/step_definitions/gap_removal_steps.rb
|
107
117
|
- features/step_definitions/index_steps.rb
|
108
118
|
- features/step_definitions/output_steps.rb
|
109
119
|
- features/step_definitions/parse_steps.rb
|
@@ -115,8 +125,10 @@ files:
|
|
115
125
|
- lib/bio-maf/maf.rb
|
116
126
|
- lib/bio/maf.rb
|
117
127
|
- lib/bio/maf/index.rb
|
128
|
+
- lib/bio/maf/maf.rb
|
118
129
|
- lib/bio/maf/parser.rb
|
119
130
|
- lib/bio/maf/struct.rb
|
131
|
+
- lib/bio/maf/tiler.rb
|
120
132
|
- lib/bio/maf/writer.rb
|
121
133
|
- lib/bio/ucsc.rb
|
122
134
|
- lib/bio/ucsc/genomic-interval-bin.rb
|
@@ -125,17 +137,21 @@ files:
|
|
125
137
|
- man/maf_index.1
|
126
138
|
- man/maf_index.1.markdown
|
127
139
|
- man/maf_index.1.ronn
|
140
|
+
- man/maf_tile.1
|
141
|
+
- man/maf_tile.1.ronn
|
128
142
|
- man/maf_to_fasta.1
|
129
143
|
- man/maf_to_fasta.1.ronn
|
130
144
|
- spec/bio/maf/index_spec.rb
|
131
145
|
- spec/bio/maf/parser_spec.rb
|
132
146
|
- spec/bio/maf/struct_spec.rb
|
147
|
+
- spec/bio/maf/tiler_spec.rb
|
133
148
|
- spec/spec_helper.rb
|
134
149
|
- test/data/big-block.maf
|
135
150
|
- test/data/chr22_ieq.maf
|
136
151
|
- test/data/chrY-1block.maf
|
137
152
|
- test/data/empty
|
138
153
|
- test/data/empty.db
|
154
|
+
- test/data/gap-sp1.fa
|
139
155
|
- test/data/mm8_chr7_tiny.kct
|
140
156
|
- test/data/mm8_chr7_tiny.maf
|
141
157
|
- test/data/mm8_mod_a.maf
|
@@ -165,7 +181,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
165
181
|
version: '0'
|
166
182
|
segments:
|
167
183
|
- 0
|
168
|
-
hash:
|
184
|
+
hash: 882426960615932212
|
169
185
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
170
186
|
none: false
|
171
187
|
requirements:
|
@@ -178,4 +194,43 @@ rubygems_version: 1.8.24
|
|
178
194
|
signing_key:
|
179
195
|
specification_version: 3
|
180
196
|
summary: MAF parser for BioRuby
|
181
|
-
test_files: []
|
197
|
+
test_files:
|
198
|
+
- features/gap-filling.feature
|
199
|
+
- features/gap-removal.feature
|
200
|
+
- features/maf-indexing.feature
|
201
|
+
- features/maf-output.feature
|
202
|
+
- features/maf-parsing.feature
|
203
|
+
- features/maf-querying.feature
|
204
|
+
- features/maf-to-fasta.feature
|
205
|
+
- features/step_definitions/convert_steps.rb
|
206
|
+
- features/step_definitions/gap-filling_steps.rb
|
207
|
+
- features/step_definitions/gap_removal_steps.rb
|
208
|
+
- features/step_definitions/index_steps.rb
|
209
|
+
- features/step_definitions/output_steps.rb
|
210
|
+
- features/step_definitions/parse_steps.rb
|
211
|
+
- features/step_definitions/query_steps.rb
|
212
|
+
- features/step_definitions/ucsc_bin_steps.rb
|
213
|
+
- features/support/env.rb
|
214
|
+
- features/ucsc-bins.feature
|
215
|
+
- spec/bio/maf/index_spec.rb
|
216
|
+
- spec/bio/maf/parser_spec.rb
|
217
|
+
- spec/bio/maf/struct_spec.rb
|
218
|
+
- spec/bio/maf/tiler_spec.rb
|
219
|
+
- spec/spec_helper.rb
|
220
|
+
- test/data/big-block.maf
|
221
|
+
- test/data/chr22_ieq.maf
|
222
|
+
- test/data/chrY-1block.maf
|
223
|
+
- test/data/empty
|
224
|
+
- test/data/empty.db
|
225
|
+
- test/data/gap-sp1.fa
|
226
|
+
- test/data/mm8_chr7_tiny.kct
|
227
|
+
- test/data/mm8_chr7_tiny.maf
|
228
|
+
- test/data/mm8_mod_a.maf
|
229
|
+
- test/data/mm8_single.maf
|
230
|
+
- test/data/mm8_subset_a.maf
|
231
|
+
- test/data/t1-bad1.maf
|
232
|
+
- test/data/t1.fasta
|
233
|
+
- test/data/t1.maf
|
234
|
+
- test/data/t1a.maf
|
235
|
+
- test/helper.rb
|
236
|
+
- test/test_bio-maf.rb
|