bio-maf 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,104 @@
1
+ maf_tile(1) -- synthesize an alignment for a given region
2
+ =========================================================
3
+
4
+ ## SYNOPSIS
5
+
6
+ `maf_tile` [<options>] -i BEGIN:END [-s SPECIES[:NAME] ...] <maf> <index>
7
+
8
+ `maf_tile` [<options>] --bed BED -o BASE [-s SPECIES[:NAME] ...] <maf> <index>
9
+
10
+ ## DESCRIPTION
11
+
12
+ **maf_tile** takes a MAF file with index (generated by maf_index(1)),
13
+ extracts alignment blocks overlapping the given genomic interval, and
14
+ constructs a single alignment block covering the entire interval for
15
+ the specified species. Optionally, any gaps in coverage of the MAF
16
+ file's reference sequence can be filled in from a FASTA sequence file.
17
+
18
+ If a single interval is specified, the output will be written to
19
+ stdout in FASTA format. If the `--output-base` option is specified,
20
+ the output path for each interval will be constructed by appending
21
+ `_<start>:<end>.fa` to the given <base>. If a BED file is specified with
22
+ `--bed`, `--output-base` is also required.
23
+
24
+ Species can be renamed for output by specifying them as SPECIES:NAME;
25
+ the first component will be used to select the species from the MAF
26
+ file, and the second will be used in the FASTA description line for
27
+ output.
28
+
29
+ ## OPTIONS
30
+
31
+ * `-r`, `--reference SEQ`:
32
+ The FASTA reference sequence file given, which may be gzipped, will
33
+ be used to fill in any gaps between alignment blocks.
34
+
35
+ * `-i`, `--interval BEGIN:END`:
36
+ The given zero-based genomic interval will be used to select
37
+ alignment blocks from the MAF file.
38
+
39
+ * `-s`, `--species SPECIES[:NAME]`:
40
+ The given species will be selected for output. If given as
41
+ `species:name`, it will appear in the FASTA output as <name>. This option may be repeated to select several species.
42
+
43
+ * `-b`, `--bed BED`:
44
+ The given BED file will be used to provide a list of intervals to
45
+ process. If present, `--interval` will be ignored and
46
+ `--output-base` must be given as well.
47
+
48
+ * `-o`, `--output-base BASE`:
49
+ The given path will be used as the base name for output files, as
50
+ described above.
51
+
52
+ ## EXAMPLES
53
+
54
+ Generate an alignment of the `hg19`, `petMar1`, and `ornAna1`
55
+ sequences from `chrY.maf` over the interval 14400 to 15000 on the
56
+ reference sequence of the MAF file, filling in gaps from
57
+ `chrY.refseq.fa.gz` and writing FASTA output to stdout.
58
+
59
+ $ maf_tile --reference ~/maf/chrY.refseq.fa.gz \
60
+ --interval 14400:15000 \
61
+ -s hg19:human -s petMar1 -s ornAna1 \
62
+ chrY.maf chrY.kct
63
+ >human
64
+ GGGTGACGAAAAGAGCCGA-----[...]
65
+ >petMar1
66
+ gagtgccggggagtgccggggagt[...]
67
+ >ornAna1
68
+ AGGGATCTGGGAATTCTGG-----[...]
69
+
70
+ Write out one FASTA file for each interval in the given BED file,
71
+ using `/tmp/mm8` as the output path prefix and without filling in gaps
72
+ from a reference sequence:
73
+
74
+ $ maf_tile --bed /tmp/mm8.bed --output-base /tmp/mm8 \
75
+ -s mm8:mouse -s rn4:rat -s hg18:human \
76
+ mm8_chr7_tiny.maf mm8_chr7_tiny.kct
77
+
78
+ ## FILES
79
+
80
+ The output is generated in FASTA format, with one sequence per
81
+ species.
82
+
83
+ The input <maf> file must be a Multiple Alignment Format file.
84
+
85
+ The <index> must be a MAF index built with maf_index(1).
86
+
87
+ If `--bed` <bed> is specified, its argument must be a BED file. Only
88
+ the second and third columns will be used, to specify the zero-based
89
+ start and end positions of intervals.
90
+
91
+ ## ENVIRONMENT
92
+
93
+ `maf_tile` is a Ruby program and relies on ordinary Ruby environment
94
+ variables.
95
+
96
+ ## COPYRIGHT
97
+
98
+ `maf_tile` is copyright (C) 2012 Clayton Wheeler.
99
+
100
+ ## SEE ALSO
101
+
102
+ maf_index(1), ruby(1)
103
+
104
+ * <https://github.com/csw/bioruby-maf/>
@@ -228,6 +228,7 @@ module Bio
228
228
  @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
229
229
  @block = @p.parse_block
230
230
  @idx = KyotoIndex.new('%')
231
+ @idx.ref_seq = 'mm8.chr7'
231
232
  end
232
233
  context "single ref seq" do
233
234
  before(:each) do
@@ -25,6 +25,99 @@ module Bio
25
25
  it "provides arbitrary parameters"
26
26
  end
27
27
 
28
+ describe Block do
29
+ describe "#find_gaps" do
30
+ it "finds a single 14-base gap" do
31
+ p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
32
+ p.sequence_filter = { :only_species => %w(mm8 rn4 hg18 canFam2 loxAfr1) }
33
+ block = p.parse_block
34
+ gaps = block.find_gaps
35
+ gaps.size.should == 1
36
+ gaps[0][0].should == 34
37
+ gaps[0][1].should == 14
38
+ end
39
+ end
40
+ describe "#remove_gaps!" do
41
+ it "removes a single 14-base gap" do
42
+ p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
43
+ p.sequence_filter = { :only_species => %w(mm8 rn4 hg18 canFam2 loxAfr1) }
44
+ block = p.parse_block
45
+ block.sequences.size.should == 5
46
+ block.text_size.should == 54
47
+ block.remove_gaps!
48
+ block.text_size.should == 40
49
+ end
50
+ end
51
+ end
52
+
53
+ describe Sequence do
54
+ before(:each) do
55
+ @parser = DummyParser.new
56
+ end
57
+
58
+ describe "#gapped?" do
59
+ it "is false for sequences with no gaps" do
60
+ line = "s human_unc 9077 8 + 10998 ACAGTATT"
61
+ s = @parser.parse_seq_line(line, nil)
62
+ s.gapped?.should be_false
63
+ end
64
+ it "is true for sequences with gaps" do
65
+ line = "s human_unc 9077 8 + 10998 AC-AGTATT"
66
+ s = @parser.parse_seq_line(line, nil)
67
+ s.gapped?.should be_true
68
+ end
69
+ end
70
+
71
+ describe "#text_range" do
72
+ it "returns 0...text.size for a spanning interval" do
73
+ line = "s human_unc 9077 8 + 10998 ACAGTATT"
74
+ s = @parser.parse_seq_line(line, nil)
75
+ range = s.text_range(9077...(9077 + 8))
76
+ range.should == (0...(s.text.size))
77
+ end
78
+ it "returns 0...text.size for a gapped spanning interval" do
79
+ line = "s human_unc 9077 8 + 10998 AC--AGTATT"
80
+ s = @parser.parse_seq_line(line, nil)
81
+ range = s.text_range(9077...(9077 + 8))
82
+ range.should == (0...(s.text.size))
83
+ end
84
+ it "handles a leading subset" do
85
+ line = "s human_unc 9077 8 + 10998 ACAGTATT"
86
+ s = @parser.parse_seq_line(line, nil)
87
+ range = s.text_range(9077...(9077 + 2))
88
+ range.should == (0...2)
89
+ end
90
+ it "handles a trailing subset" do
91
+ line = "s human_unc 9077 8 + 10998 ACAGTATT"
92
+ s = @parser.parse_seq_line(line, nil)
93
+ range = s.text_range(9079...9085)
94
+ range.should == (2...8)
95
+ end
96
+ it "handles a gap in the middle" do
97
+ line = "s human_unc 9077 8 + 10998 AC--AGTATT"
98
+ s = @parser.parse_seq_line(line, nil)
99
+ range = s.text_range(9078...(9077 + 8))
100
+ range.should == (1...(s.text.size))
101
+ end
102
+ it "errors on a range starting before" do
103
+ expect {
104
+ line = "s human_unc 9077 8 + 10998 ACAGTATT"
105
+ s = @parser.parse_seq_line(line, nil)
106
+ range = s.text_range(9076...(9077 + 8))
107
+ }.to raise_error
108
+ end
109
+ it "errors on a range ending after" do
110
+ expect {
111
+ line = "s human_unc 9077 8 + 10998 ACAGTATT"
112
+ s = @parser.parse_seq_line(line, nil)
113
+ range = s.text_range(9077...(9077 + 9))
114
+ }.to raise_error
115
+ end
116
+
117
+ end
118
+
119
+ end
120
+
28
121
  describe ParseContext do
29
122
  it "tracks the last block position"
30
123
  end
@@ -206,6 +299,16 @@ module Bio
206
299
  @p.sequence_filter = { :only_species => %w(mm8 hg18) }
207
300
  @p.parse_block.sequences.size.should == 2
208
301
  end
302
+ it "sets filtered? when modified" do
303
+ @p.sequence_filter = { :only_species => %w(mm8 rn4) }
304
+ @p.parse_block.filtered?.should be_true
305
+ end
306
+ it "does not set filtered? when unmodified" do
307
+ @p.sequence_filter = {
308
+ :only_species => %w(mm8 rn4 oryCun1 hg18 hg181)
309
+ }
310
+ @p.parse_block.filtered?.should be_false
311
+ end
209
312
  end
210
313
 
211
314
  context "at end of file" do
@@ -0,0 +1,69 @@
1
+ require 'spec_helper'
2
+
3
+ module Bio::MAF
4
+
5
+ describe Tiler do
6
+
7
+ describe "#runs" do
8
+ it "returns a uniform run properly" do
9
+ a = Array.new(10, 'a')
10
+ runs = Tiler.new.enum_for(:runs, a).to_a
11
+ runs.should == [[0...10, 'a']]
12
+ end
13
+ it "yields a trailing item" do
14
+ a = Array.new(10, 'a')
15
+ a.fill('b', 8...10)
16
+ runs = Tiler.new.enum_for(:runs, a).to_a
17
+ runs.should == [[0...8, 'a'], [8...10, 'b']]
18
+ end
19
+ it "handles mixed contents" do
20
+ spec = [[0...2, 'a'],
21
+ [2...3, 'b'],
22
+ [3...4, 'c'],
23
+ [4...7, 'd']]
24
+ a = Array.new(7, nil)
25
+ spec.each { |range, obj| a.fill(obj, range) }
26
+ runs = Tiler.new.enum_for(:runs, a).to_a
27
+ runs.should == spec
28
+ end
29
+ it "handles overwrites" do
30
+ spec = [[0...7, 'a'],
31
+ [2...5, 'b'],
32
+ [3...4, 'c'],
33
+ [4...7, 'd']]
34
+ a = Array.new(7, nil)
35
+ spec.each { |range, obj| a.fill(obj, range) }
36
+ runs = Tiler.new.enum_for(:runs, a).to_a
37
+ runs.should == [[0...2, 'a'],
38
+ [2...3, 'b'],
39
+ [3...4, 'c'],
40
+ [4...7, 'd']]
41
+ end
42
+ end
43
+
44
+ end
45
+
46
+ describe FASTARangeReader do
47
+ describe "#read" do
48
+ before(:each) do
49
+ @r = FASTARangeReader.new('test/data/gap-sp1.fa')
50
+ @s = 'CCAGGATGCTGGGCTGAGGGCAGTTGTGTCAGGGCGGTCCGGTGCAGGCA'
51
+ end
52
+
53
+ def check_range(z_start, z_end)
54
+ @r.read_interval(z_start, z_end).should == @s.slice(z_start...z_end)
55
+ end
56
+
57
+ it "returns the entire sequence" do
58
+ check_range(0, 50)
59
+ end
60
+ it "returns an entire line" do
61
+ check_range(10, 20)
62
+ end
63
+ it "returns arbitrary components" do
64
+ check_range(17, 41)
65
+ end
66
+ end
67
+ end
68
+
69
+ end
@@ -0,0 +1,6 @@
1
+ > sp1.chr1
2
+ CCAGGATGCT
3
+ GGGCTGAGGG
4
+ CAGTTGTGTC
5
+ AGGGCGGTCC
6
+ GGTGCAGGCA
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-maf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -62,11 +62,13 @@ dependencies:
62
62
  description: Multiple Alignment Format parser for BioRuby.
63
63
  email: cswh@umich.edu
64
64
  executables:
65
+ - find_overlaps
65
66
  - maf_count
66
67
  - maf_dump_blocks
67
68
  - maf_extract_ranges_count
68
69
  - maf_index
69
70
  - maf_parse_bench
71
+ - maf_tile
70
72
  - maf_to_fasta
71
73
  - maf_write
72
74
  - random_ranges
@@ -76,6 +78,7 @@ extra_rdoc_files:
76
78
  - README.md
77
79
  files:
78
80
  - .document
81
+ - .gitignore
79
82
  - .simplecov
80
83
  - .travis.yml
81
84
  - .yardopts
@@ -90,20 +93,27 @@ files:
90
93
  - benchmarks/read_bench
91
94
  - benchmarks/sort_bench
92
95
  - benchmarks/split_bench
96
+ - bin/find_overlaps
93
97
  - bin/maf_count
94
98
  - bin/maf_dump_blocks
95
99
  - bin/maf_extract_ranges_count
96
100
  - bin/maf_index
97
101
  - bin/maf_parse_bench
102
+ - bin/maf_tile
98
103
  - bin/maf_to_fasta
99
104
  - bin/maf_write
100
105
  - bin/random_ranges
106
+ - bio-maf.gemspec
107
+ - features/gap-filling.feature
108
+ - features/gap-removal.feature
101
109
  - features/maf-indexing.feature
102
110
  - features/maf-output.feature
103
111
  - features/maf-parsing.feature
104
112
  - features/maf-querying.feature
105
113
  - features/maf-to-fasta.feature
106
114
  - features/step_definitions/convert_steps.rb
115
+ - features/step_definitions/gap-filling_steps.rb
116
+ - features/step_definitions/gap_removal_steps.rb
107
117
  - features/step_definitions/index_steps.rb
108
118
  - features/step_definitions/output_steps.rb
109
119
  - features/step_definitions/parse_steps.rb
@@ -115,8 +125,10 @@ files:
115
125
  - lib/bio-maf/maf.rb
116
126
  - lib/bio/maf.rb
117
127
  - lib/bio/maf/index.rb
128
+ - lib/bio/maf/maf.rb
118
129
  - lib/bio/maf/parser.rb
119
130
  - lib/bio/maf/struct.rb
131
+ - lib/bio/maf/tiler.rb
120
132
  - lib/bio/maf/writer.rb
121
133
  - lib/bio/ucsc.rb
122
134
  - lib/bio/ucsc/genomic-interval-bin.rb
@@ -125,17 +137,21 @@ files:
125
137
  - man/maf_index.1
126
138
  - man/maf_index.1.markdown
127
139
  - man/maf_index.1.ronn
140
+ - man/maf_tile.1
141
+ - man/maf_tile.1.ronn
128
142
  - man/maf_to_fasta.1
129
143
  - man/maf_to_fasta.1.ronn
130
144
  - spec/bio/maf/index_spec.rb
131
145
  - spec/bio/maf/parser_spec.rb
132
146
  - spec/bio/maf/struct_spec.rb
147
+ - spec/bio/maf/tiler_spec.rb
133
148
  - spec/spec_helper.rb
134
149
  - test/data/big-block.maf
135
150
  - test/data/chr22_ieq.maf
136
151
  - test/data/chrY-1block.maf
137
152
  - test/data/empty
138
153
  - test/data/empty.db
154
+ - test/data/gap-sp1.fa
139
155
  - test/data/mm8_chr7_tiny.kct
140
156
  - test/data/mm8_chr7_tiny.maf
141
157
  - test/data/mm8_mod_a.maf
@@ -165,7 +181,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
165
181
  version: '0'
166
182
  segments:
167
183
  - 0
168
- hash: -2212073295752934712
184
+ hash: 882426960615932212
169
185
  required_rubygems_version: !ruby/object:Gem::Requirement
170
186
  none: false
171
187
  requirements:
@@ -178,4 +194,43 @@ rubygems_version: 1.8.24
178
194
  signing_key:
179
195
  specification_version: 3
180
196
  summary: MAF parser for BioRuby
181
- test_files: []
197
+ test_files:
198
+ - features/gap-filling.feature
199
+ - features/gap-removal.feature
200
+ - features/maf-indexing.feature
201
+ - features/maf-output.feature
202
+ - features/maf-parsing.feature
203
+ - features/maf-querying.feature
204
+ - features/maf-to-fasta.feature
205
+ - features/step_definitions/convert_steps.rb
206
+ - features/step_definitions/gap-filling_steps.rb
207
+ - features/step_definitions/gap_removal_steps.rb
208
+ - features/step_definitions/index_steps.rb
209
+ - features/step_definitions/output_steps.rb
210
+ - features/step_definitions/parse_steps.rb
211
+ - features/step_definitions/query_steps.rb
212
+ - features/step_definitions/ucsc_bin_steps.rb
213
+ - features/support/env.rb
214
+ - features/ucsc-bins.feature
215
+ - spec/bio/maf/index_spec.rb
216
+ - spec/bio/maf/parser_spec.rb
217
+ - spec/bio/maf/struct_spec.rb
218
+ - spec/bio/maf/tiler_spec.rb
219
+ - spec/spec_helper.rb
220
+ - test/data/big-block.maf
221
+ - test/data/chr22_ieq.maf
222
+ - test/data/chrY-1block.maf
223
+ - test/data/empty
224
+ - test/data/empty.db
225
+ - test/data/gap-sp1.fa
226
+ - test/data/mm8_chr7_tiny.kct
227
+ - test/data/mm8_chr7_tiny.maf
228
+ - test/data/mm8_mod_a.maf
229
+ - test/data/mm8_single.maf
230
+ - test/data/mm8_subset_a.maf
231
+ - test/data/t1-bad1.maf
232
+ - test/data/t1.fasta
233
+ - test/data/t1.maf
234
+ - test/data/t1a.maf
235
+ - test/helper.rb
236
+ - test/test_bio-maf.rb