bio-maf 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +1 -0
- data/README.md +147 -113
- data/bin/maf_count +0 -1
- data/bin/maf_dump_blocks +0 -1
- data/bin/maf_extract +177 -0
- data/bin/maf_index +15 -8
- data/bin/maf_tile +2 -0
- data/bin/maf_to_fasta +4 -7
- data/bio-maf.gemspec +3 -4
- data/features/maf-indexing.feature +21 -1
- data/features/step_definitions/convert_steps.rb +2 -7
- data/features/step_definitions/index_steps.rb +4 -0
- data/lib/bio-maf.rb +5 -0
- data/lib/bio/maf/index.rb +33 -23
- data/lib/bio/maf/maf.rb +10 -7
- data/lib/bio/maf/parser.rb +37 -15
- data/lib/bio/maf/tiler.rb +60 -8
- data/lib/bio/maf/writer.rb +26 -0
- data/man/maf_extract.1 +159 -0
- data/man/maf_extract.1.ronn +175 -0
- data/man/maf_index.1 +21 -10
- data/man/maf_index.1.ronn +14 -7
- data/man/maf_tile.1 +12 -0
- data/man/maf_tile.1.ronn +9 -0
- data/spec/bio/maf/index_spec.rb +23 -0
- metadata +15 -11
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -81,43 +81,53 @@ create one with [maf_index(1)][], like so:
|
|
81
81
|
|
82
82
|
|
83
83
|
$ maf_index test/data/mm8_chr7_tiny.maf /tmp/mm8_chr7_tiny.kct
|
84
|
-
|
85
|
-
Or programmatically:
|
86
84
|
|
87
|
-
|
88
|
-
|
89
|
-
|
85
|
+
To index all sequences for searching, not just the reference sequence:
|
86
|
+
|
87
|
+
$ maf_index --all test/data/mm8_chr7_tiny.maf /tmp/mm8_chr7_tiny.kct
|
88
|
+
|
89
|
+
To build an index programmatically:
|
90
|
+
|
91
|
+
```ruby
|
92
|
+
require 'bio-maf'
|
93
|
+
parser = Bio::MAF::Parser.new("test/data/mm8_chr7_tiny.maf")
|
94
|
+
idx = Bio::MAF::KyotoIndex.build(parser, "/tmp/mm8_chr7_tiny.kct", false)
|
95
|
+
```
|
90
96
|
|
91
97
|
### Extract blocks from an indexed MAF file, by genomic interval
|
92
98
|
|
93
99
|
Refer to [`mm8_chr7_tiny.maf`](https://github.com/csw/bioruby-maf/blob/master/test/data/mm8_chr7_tiny.maf).
|
94
100
|
|
95
|
-
|
96
|
-
|
101
|
+
```ruby
|
102
|
+
require 'bio-maf'
|
103
|
+
access = Bio::MAF::Access.maf_dir('test/data')
|
97
104
|
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
105
|
+
q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
|
106
|
+
access.find(q) do |block|
|
107
|
+
ref_seq = block.sequences[0]
|
108
|
+
puts "Matched block at #{ref_seq.start}, #{ref_seq.size} bases"
|
109
|
+
end
|
103
110
|
|
104
|
-
|
105
|
-
|
111
|
+
# => Matched block at 80082592, 121 bases
|
112
|
+
# => Matched block at 80082713, 54 bases
|
113
|
+
```
|
106
114
|
|
107
115
|
Or, equivalently, one can work with a specific MAF file and index directly:
|
108
116
|
|
109
|
-
|
110
|
-
|
111
|
-
|
117
|
+
```ruby
|
118
|
+
require 'bio-maf'
|
119
|
+
parser = Bio::MAF::Parser.new('test/data/mm8_chr7_tiny.maf')
|
120
|
+
idx = Bio::MAF::KyotoIndex.open('test/data/mm8_chr7_tiny.kct')
|
112
121
|
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
122
|
+
q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
|
123
|
+
idx.find(q, parser).each do |block|
|
124
|
+
ref_seq = block.sequences[0]
|
125
|
+
puts "Matched block at #{ref_seq.start}, #{ref_seq.size} bases"
|
126
|
+
end
|
118
127
|
|
119
|
-
|
120
|
-
|
128
|
+
# => Matched block at 80082592, 121 bases
|
129
|
+
# => Matched block at 80082713, 54 bases
|
130
|
+
```
|
121
131
|
|
122
132
|
### Extract alignment blocks truncated to a given interval
|
123
133
|
|
@@ -125,25 +135,29 @@ Given a genomic interval of interest, one can also extract only the
|
|
125
135
|
subsets of blocks that intersect with that interval, using the
|
126
136
|
`#slice` method like so:
|
127
137
|
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
138
|
+
```ruby
|
139
|
+
require 'bio-maf'
|
140
|
+
access = Bio::MAF::Access.maf_dir('test/data')
|
141
|
+
int = Bio::GenomicInterval.zero_based('mm8.chr7', 80082350, 80082380)
|
142
|
+
blocks = access.slice(int).to_a
|
143
|
+
puts "Got #{blocks.size} blocks, first #{blocks.first.ref_seq.size} base pairs."
|
144
|
+
# => Got 2 blocks, first 18 base pairs.
|
145
|
+
```
|
134
146
|
|
135
147
|
### Filter species returned in alignment blocks
|
136
148
|
|
137
|
-
|
138
|
-
|
149
|
+
```ruby
|
150
|
+
require 'bio-maf'
|
151
|
+
access = Bio::MAF::Access.maf_dir('test/data')
|
139
152
|
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
153
|
+
access.sequence_filter = { :only_species => %w(hg18 mm8 rheMac2) }
|
154
|
+
q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
|
155
|
+
blocks = access.find(q)
|
156
|
+
block = blocks.first
|
157
|
+
puts "Block has #{block.sequences.size} sequences."
|
145
158
|
|
146
|
-
|
159
|
+
# => Block has 3 sequences.
|
160
|
+
```
|
147
161
|
|
148
162
|
### Extract blocks matching certain conditions
|
149
163
|
|
@@ -154,68 +168,80 @@ See also the [Cucumber feature][] and [step definitions][] for this.
|
|
154
168
|
|
155
169
|
#### Match only blocks with all specified species
|
156
170
|
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
171
|
+
```ruby
|
172
|
+
access = Bio::MAF::Access.maf_dir('test/data')
|
173
|
+
q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082471, 80082730)]
|
174
|
+
access.block_filter = { :with_all_species => %w(panTro2 loxAfr1) }
|
175
|
+
n_blocks = access.find(q).count
|
176
|
+
# => 1
|
177
|
+
```
|
162
178
|
|
163
179
|
#### Match only blocks with a certain number of sequences
|
164
180
|
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
181
|
+
```ruby
|
182
|
+
access = Bio::MAF::Access.maf_dir('test/data')
|
183
|
+
q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082767, 80083008)]
|
184
|
+
access.block_filter = { :at_least_n_sequences => 6 }
|
185
|
+
n_blocks = access.find(q).count
|
186
|
+
# => 1
|
187
|
+
```
|
170
188
|
|
171
189
|
#### Match only blocks within a text size range
|
172
190
|
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
191
|
+
```ruby
|
192
|
+
access = Bio::MAF::Access.maf_dir('test/data')
|
193
|
+
q = [Bio::GenomicInterval.zero_based('mm8.chr7', 0, 80100000)]
|
194
|
+
access.block_filter = { :min_size => 72, :max_size => 160 }
|
195
|
+
n_blocks = access.find(q).count
|
196
|
+
# => 3
|
197
|
+
```
|
178
198
|
|
179
199
|
### Process each block in a MAF file
|
180
200
|
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
201
|
+
```ruby
|
202
|
+
require 'bio-maf'
|
203
|
+
p = Bio::MAF::Parser.new('test/data/mm8_chr7_tiny.maf')
|
204
|
+
puts "MAF version: #{p.header.version}"
|
205
|
+
# => MAF version: 1
|
185
206
|
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
207
|
+
p.each_block do |block|
|
208
|
+
block.sequences.each do |seq|
|
209
|
+
do_something(seq)
|
210
|
+
end
|
211
|
+
end
|
212
|
+
```
|
191
213
|
|
192
214
|
### Parse empty ('e') lines
|
193
215
|
|
194
216
|
Refer to [`chr22_ieq.maf`](https://github.com/csw/bioruby-maf/blob/master/test/data/chr22_ieq.maf).
|
195
217
|
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
218
|
+
```ruby
|
219
|
+
require 'bio-maf'
|
220
|
+
p = Bio::MAF::Parser.new('test/data/chr22_ieq.maf',
|
221
|
+
:parse_empty => false)
|
222
|
+
block = p.parse_block
|
223
|
+
block.sequences.size
|
224
|
+
# => 3
|
225
|
+
|
226
|
+
p = Bio::MAF::Parser.new('test/data/chr22_ieq.maf',
|
227
|
+
:parse_empty => true)
|
228
|
+
block = p.parse_block
|
229
|
+
block.sequences.size
|
230
|
+
# => 4
|
231
|
+
block.sequences.find { |s| s.empty? }
|
232
|
+
# => #<Bio::MAF::EmptySequence:0x007fe1f39882d0
|
233
|
+
# @source="turTru1.scaffold_109008", @start=25049,
|
234
|
+
# @size=1601, @strand=:+, @src_size=50103, @text=nil,
|
235
|
+
# @status="I">
|
236
|
+
```
|
213
237
|
|
214
238
|
Such options can also be set on a Bio::MAF::Access object:
|
215
239
|
|
216
|
-
|
217
|
-
|
218
|
-
|
240
|
+
```ruby
|
241
|
+
require 'bio-maf'
|
242
|
+
access = Bio::MAF::Access.maf_dir('test/data')
|
243
|
+
access.parse_options[:parse_empty] = true
|
244
|
+
```
|
219
245
|
|
220
246
|
### Remove gaps from parsed blocks
|
221
247
|
|
@@ -225,9 +251,11 @@ gaps may be left where there was an insertion present only in
|
|
225
251
|
sequences that were filtered out. Such gaps can be removed by setting
|
226
252
|
the `:remove_gaps` parser option:
|
227
253
|
|
228
|
-
|
229
|
-
|
230
|
-
|
254
|
+
```ruby
|
255
|
+
require 'bio-maf'
|
256
|
+
access = Bio::MAF::Access.maf_dir('test/data')
|
257
|
+
access.parse_options[:remove_gaps] = true
|
258
|
+
```
|
231
259
|
|
232
260
|
### Join blocks together after filtering
|
233
261
|
|
@@ -235,9 +263,11 @@ Similarly, filtering out species may remove a species which had caused
|
|
235
263
|
two adjacent alignment blocks to be split. By enabling the
|
236
264
|
`:join_blocks` parser option, such blocks can be joined together:
|
237
265
|
|
238
|
-
|
239
|
-
|
240
|
-
|
266
|
+
```ruby
|
267
|
+
require 'bio-maf'
|
268
|
+
access = Bio::MAF::Access.maf_dir('test/data')
|
269
|
+
access.parse_options[:join_blocks] = true
|
270
|
+
```
|
241
271
|
|
242
272
|
See the [Cucumber feature][] for more details.
|
243
273
|
|
@@ -254,14 +284,16 @@ more.
|
|
254
284
|
[Bio::BioAlignment::Alignment]: http://rdoc.info/gems/bio-alignment/Bio/BioAlignment/Alignment
|
255
285
|
[bio-alignment]: https://github.com/pjotrp/bioruby-alignment
|
256
286
|
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
287
|
+
```ruby
|
288
|
+
require 'bio-maf'
|
289
|
+
access = Bio::MAF::Access.maf_dir('test/data')
|
290
|
+
access.parse_options[:as_bio_alignment] = true
|
291
|
+
q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
|
292
|
+
access.find(q) do |aln|
|
293
|
+
col = aln.columns[3]
|
294
|
+
puts "bases in column 3: #{col}"
|
295
|
+
end
|
296
|
+
```
|
265
297
|
|
266
298
|
### Tile blocks together over an interval
|
267
299
|
|
@@ -276,23 +308,25 @@ man page.
|
|
276
308
|
|
277
309
|
[feature]: https://github.com/csw/bioruby-maf/blob/master/features/tiling.feature
|
278
310
|
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
311
|
+
```ruby
|
312
|
+
require 'bio-maf'
|
313
|
+
access = Bio::MAF::Access.maf_dir('test/data')
|
314
|
+
interval = Bio::GenomicInterval.zero_based('mm8.chr7',
|
315
|
+
80082334,
|
316
|
+
80082468)
|
317
|
+
access.tile(interval) do |tiler|
|
318
|
+
# reference is optional
|
319
|
+
tiler.reference = 'reference.fa.gz'
|
320
|
+
tiler.species = %w(mm8 rn4 hg18)
|
321
|
+
# species_map is optional
|
322
|
+
tiler.species_map = {
|
323
|
+
'mm8' => 'mouse',
|
324
|
+
'rn4' => 'rat',
|
325
|
+
'hg18' => 'human'
|
326
|
+
}
|
327
|
+
tiler.write_fasta($stdout)
|
328
|
+
end
|
329
|
+
```
|
296
330
|
|
297
331
|
### Command line tools
|
298
332
|
|
data/bin/maf_count
CHANGED
data/bin/maf_dump_blocks
CHANGED
data/bin/maf_extract
ADDED
@@ -0,0 +1,177 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'bio-maf'
|
4
|
+
require 'optparse'
|
5
|
+
require 'ostruct'
|
6
|
+
|
7
|
+
include Bio::MAF
|
8
|
+
|
9
|
+
# Command-line settings, pre-populated with their defaults. The option
# handlers registered on the parser overwrite or extend these values.
#   mode          - :intersect or :slice extraction
#   format        - :maf or :fasta output
#   seq_filter    - sequence filter hash, empty when unused
#   block_filter  - block filter hash, empty when unused
#   parse_options - options handed through to the MAF access layer
options = OpenStruct.new(:mode => :intersect,
                         :format => :maf,
                         :seq_filter => {},
                         :block_filter => {},
                         :parse_options => {})
|
15
|
+
|
16
|
+
# Expand a list argument given on the command line into an array of
# strings.
#
# spec - either "@PATH", naming a file whose whitespace-separated
#        tokens make up the list, or a comma-separated list such as
#        "hg18,mm8,rheMac2".
#
# Whitespace around comma-separated entries is stripped and empty
# entries are dropped, so "hg18, mm8" and "hg18,,mm8" both yield
# ["hg18", "mm8"] rather than entries that can never match a name.
#
# Returns an Array of Strings. Raises the usual File.read errors
# (e.g. Errno::ENOENT) when an @PATH file cannot be read.
def handle_list_spec(spec)
  # \A anchors at the start of the whole string; ^ would also match
  # after an embedded newline.
  if spec =~ /\A@(.+)/
    File.read(Regexp.last_match(1)).split
  else
    spec.split(',').map(&:strip).reject(&:empty?)
  end
end
|
23
|
+
|
24
|
+
# Parse a SEQ:START:END command-line argument into a zero-based
# genomic interval.
#
# int - String of exactly three colon-separated fields: a non-empty
#       sequence name followed by integer start and end coordinates,
#       e.g. "mm8.chr7:80082592:80082766".
#
# Returns whatever Bio::GenomicInterval.zero_based returns for the
# parsed fields.
# Raises OptionParser::InvalidArgument when the argument is malformed.
# Previously String#to_i silently turned missing or non-numeric
# coordinates into 0, producing a bogus interval instead of an error.
def handle_interval_spec(int)
  parts = int.split(':')
  if parts.size != 3 || parts[0].empty?
    raise OptionParser::InvalidArgument,
          "expected SEQ:START:END, got #{int.inspect}"
  end
  seq, start_text, end_text = parts
  begin
    # Integer() rejects non-numeric text; base 10 stops a leading zero
    # from being read as octal.
    start_pos = Integer(start_text, 10)
    end_pos = Integer(end_text, 10)
  rescue ArgumentError
    raise OptionParser::InvalidArgument,
          "START and END must be integers, got #{int.inspect}"
  end
  Bio::GenomicInterval.zero_based(seq, start_pos, end_pos)
end
|
28
|
+
|
29
|
+
# Build the command-line parser. It is kept in the global $op so that
# the usage helper can print the generated help text.
$op = OptionParser.new
$op.banner = "Usage: maf_extract (-m MAF [-i INDEX] | -d MAFDIR) [options]"

$op.separator ""
$op.separator "MAF source options (either --maf or --maf-dir must be given):"
$op.on("-m", "--maf MAF", "MAF file") { |path| options.maf = path }
$op.on("-i", "--index INDEX", "MAF index") { |path| options.idx = path }
$op.on("-d", "--maf-dir DIR", "MAF directory") { |path| options.maf_dir = path }

$op.separator ""
$op.separator "Extraction options:"
$op.on("--mode MODE", [:intersect, :slice],
       "Extraction mode; 'intersect' to match ",
       "blocks intersecting the given region,",
       "or 'slice' to extract subsets covering ",
       "given regions") { |selected| options.mode = selected }
$op.on("--bed BED", "Use intervals from the given BED file") do |path|
  options.bed = path
end
$op.on("--interval SEQ:START:END", "Zero-based genomic interval to match") do |text|
  options.interval = handle_interval_spec(text)
end

$op.separator ""
$op.separator "Output options:"
$op.on("-f", "--format FMT", [:maf, :fasta], "Output format") do |selected|
  options.format = selected
end
$op.on("-o", "--output OUT", "Write output to file OUT") do |path|
  options.out_path = path
end

$op.separator ""
$op.separator "Filtering options:"
$op.on("--only-species SPECIES",
       "Filter out all but the species in the",
       "given comma-separated list",
       "(or @FILE to read from a file)") do |list|
  options.seq_filter[:only_species] = handle_list_spec(list)
end
$op.on("--with-all-species SPECIES",
       "Only match blocks with all the given",
       "species, comma-separated",
       "(or @FILE to read from a file)") do |list|
  options.block_filter[:with_all_species] = handle_list_spec(list)
end
# Integer-valued block filters: [switch, description, filter key].
[
  ["--min-sequences N", "Match only blocks with at least N sequences", :at_least_n_sequences],
  ["--min-text-size N", "Match only blocks with minimum text size N", :min_size],
  ["--max-text-size N", "Match only blocks with maximum text size N", :max_size]
].each do |switch, description, key|
  $op.on(switch, Integer, description) { |count| options.block_filter[key] = count }
end

$op.separator ""
$op.separator "Block processing options:"
# Boolean flags that simply switch on a parse option:
# [switch, description lines, parse option key].
[
  ["--join-blocks", ["Join blocks if appropriate after filtering", "out sequences"], :join_blocks],
  ["--remove-gaps", ["Remove gaps after filtering out sequences"], :remove_gaps],
  ["--parse-extended", ["Parse 'extended' MAF data (i, q lines)"], :parse_extended],
  ["--parse-empty", ["Parse empty (e) lines of MAF data"], :parse_empty]
].each do |switch, description, key|
  $op.on(switch, *description) { options.parse_options[key] = true }
end

$op.separator ""
$op.separator "Logging options:"
Bio::MAF.handle_logging_options($op)

# Consume the recognised switches from ARGV, then set up logging.
$op.parse!(ARGV)
Bio::Log::CLI.configure('bio-maf')
|
113
|
+
|
114
|
+
# Report a command-line usage error and terminate.
#
# msg - the error message; it is written to standard error, followed
#       by the contents of $op (the option parser's help text).
#
# Does not return: exits the process with status 2.
def usage(msg)
  $stderr.puts(msg)
  $stderr.puts($op)
  exit(2)
end
|
119
|
+
|
120
|
+
# Open the MAF source: a single MAF file (optionally with an explicit
# index) or a directory of MAF files. One of the two is required.
if options.maf
  access = Access.file(options.maf, options.idx, options.parse_options)
elsif options.maf_dir
  access = Access.maf_dir(options.maf_dir, options.parse_options)
else
  usage "Must supply --maf or --maf-dir!"
end

outf = nil
begin
  access.sequence_filter = options.seq_filter unless options.seq_filter.empty?
  access.block_filter = options.block_filter unless options.block_filter.empty?

  # Resolve the query intervals before touching the output path, so
  # that a usage error cannot truncate an existing output file.
  if options.bed
    intervals = read_bed_intervals(options.bed)
  elsif options.interval
    intervals = [options.interval]
  else
    usage "Must supply --interval or --bed!"
  end

  outf = options.out_path ? File.open(options.out_path, 'w') : $stdout

  case options.format
  when :maf
    writer = Writer.new(outf)
  when :fasta
    writer = FASTAWriter.new(outf)
  else
    # Interpolate options.format: a bare `format` here would call
    # Kernel#format with no arguments and raise ArgumentError instead
    # of reporting the offending value.
    raise "unsupported output format #{options.format}!"
  end

  # TODO: provide access to original MAF header?
  if options.format == :maf
    writer.write_header(Header.default)
  end

  case options.mode
  when :intersect
    # Write every block that intersects any of the intervals.
    access.find(intervals) do |block|
      writer.write_block(block)
    end
  when :slice
    # Write the blocks produced by slicing to each interval in turn.
    # TODO: multiple files if intervals.size > 1?
    intervals.each do |interval|
      access.slice(interval) do |block|
        writer.write_block(block)
      end
    end
  else
    raise "Unsupported mode #{options.mode}!"
  end

ensure
  begin
    # Close a file we opened ourselves; never close $stdout.
    outf.close if outf && !outf.equal?($stdout)
  ensure
    access.close
  end
end
|