bio-maf 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +1 -0
- data/README.md +147 -113
- data/bin/maf_count +0 -1
- data/bin/maf_dump_blocks +0 -1
- data/bin/maf_extract +177 -0
- data/bin/maf_index +15 -8
- data/bin/maf_tile +2 -0
- data/bin/maf_to_fasta +4 -7
- data/bio-maf.gemspec +3 -4
- data/features/maf-indexing.feature +21 -1
- data/features/step_definitions/convert_steps.rb +2 -7
- data/features/step_definitions/index_steps.rb +4 -0
- data/lib/bio-maf.rb +5 -0
- data/lib/bio/maf/index.rb +33 -23
- data/lib/bio/maf/maf.rb +10 -7
- data/lib/bio/maf/parser.rb +37 -15
- data/lib/bio/maf/tiler.rb +60 -8
- data/lib/bio/maf/writer.rb +26 -0
- data/man/maf_extract.1 +159 -0
- data/man/maf_extract.1.ronn +175 -0
- data/man/maf_index.1 +21 -10
- data/man/maf_index.1.ronn +14 -7
- data/man/maf_tile.1 +12 -0
- data/man/maf_tile.1.ronn +9 -0
- data/spec/bio/maf/index_spec.rb +23 -0
- metadata +15 -11
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -81,43 +81,53 @@ create one with [maf_index(1)][], like so:
|
|
81
81
|
|
82
82
|
|
83
83
|
$ maf_index test/data/mm8_chr7_tiny.maf /tmp/mm8_chr7_tiny.kct
|
84
|
-
|
85
|
-
Or programmatically:
|
86
84
|
|
87
|
-
|
88
|
-
|
89
|
-
|
85
|
+
To index all sequences for searching, not just the reference sequence:
|
86
|
+
|
87
|
+
$ maf_index --all test/data/mm8_chr7_tiny.maf /tmp/mm8_chr7_tiny.kct
|
88
|
+
|
89
|
+
To build an index programmatically:
|
90
|
+
|
91
|
+
```ruby
|
92
|
+
require 'bio-maf'
|
93
|
+
parser = Bio::MAF::Parser.new("test/data/mm8_chr7_tiny.maf")
|
94
|
+
idx = Bio::MAF::KyotoIndex.build(parser, "/tmp/mm8_chr7_tiny.kct", false)
|
95
|
+
```
|
90
96
|
|
91
97
|
### Extract blocks from an indexed MAF file, by genomic interval
|
92
98
|
|
93
99
|
Refer to [`mm8_chr7_tiny.maf`](https://github.com/csw/bioruby-maf/blob/master/test/data/mm8_chr7_tiny.maf).
|
94
100
|
|
95
|
-
|
96
|
-
|
101
|
+
```ruby
|
102
|
+
require 'bio-maf'
|
103
|
+
access = Bio::MAF::Access.maf_dir('test/data')
|
97
104
|
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
105
|
+
q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
|
106
|
+
access.find(q) do |block|
|
107
|
+
ref_seq = block.sequences[0]
|
108
|
+
puts "Matched block at #{ref_seq.start}, #{ref_seq.size} bases"
|
109
|
+
end
|
103
110
|
|
104
|
-
|
105
|
-
|
111
|
+
# => Matched block at 80082592, 121 bases
|
112
|
+
# => Matched block at 80082713, 54 bases
|
113
|
+
```
|
106
114
|
|
107
115
|
Or, equivalently, one can work with a specific MAF file and index directly:
|
108
116
|
|
109
|
-
|
110
|
-
|
111
|
-
|
117
|
+
```ruby
|
118
|
+
require 'bio-maf'
|
119
|
+
parser = Bio::MAF::Parser.new('test/data/mm8_chr7_tiny.maf')
|
120
|
+
idx = Bio::MAF::KyotoIndex.open('test/data/mm8_chr7_tiny.kct')
|
112
121
|
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
122
|
+
q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
|
123
|
+
idx.find(q, parser).each do |block|
|
124
|
+
ref_seq = block.sequences[0]
|
125
|
+
puts "Matched block at #{ref_seq.start}, #{ref_seq.size} bases"
|
126
|
+
end
|
118
127
|
|
119
|
-
|
120
|
-
|
128
|
+
# => Matched block at 80082592, 121 bases
|
129
|
+
# => Matched block at 80082713, 54 bases
|
130
|
+
```
|
121
131
|
|
122
132
|
### Extract alignment blocks truncated to a given interval
|
123
133
|
|
@@ -125,25 +135,29 @@ Given a genomic interval of interest, one can also extract only the
|
|
125
135
|
subsets of blocks that intersect with that interval, using the
|
126
136
|
`#slice` method like so:
|
127
137
|
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
138
|
+
```ruby
|
139
|
+
require 'bio-maf'
|
140
|
+
access = Bio::MAF::Access.maf_dir('test/data')
|
141
|
+
int = Bio::GenomicInterval.zero_based('mm8.chr7', 80082350, 80082380)
|
142
|
+
blocks = access.slice(int).to_a
|
143
|
+
puts "Got #{blocks.size} blocks, first #{blocks.first.ref_seq.size} base pairs."
|
144
|
+
# => Got 2 blocks, first 18 base pairs.
|
145
|
+
```
|
134
146
|
|
135
147
|
### Filter species returned in alignment blocks
|
136
148
|
|
137
|
-
|
138
|
-
|
149
|
+
```ruby
|
150
|
+
require 'bio-maf'
|
151
|
+
access = Bio::MAF::Access.maf_dir('test/data')
|
139
152
|
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
153
|
+
access.sequence_filter = { :only_species => %w(hg18 mm8 rheMac2) }
|
154
|
+
q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
|
155
|
+
blocks = access.find(q)
|
156
|
+
block = blocks.first
|
157
|
+
puts "Block has #{block.sequences.size} sequences."
|
145
158
|
|
146
|
-
|
159
|
+
# => Block has 3 sequences.
|
160
|
+
```
|
147
161
|
|
148
162
|
### Extract blocks matching certain conditions
|
149
163
|
|
@@ -154,68 +168,80 @@ See also the [Cucumber feature][] and [step definitions][] for this.
|
|
154
168
|
|
155
169
|
#### Match only blocks with all specified species
|
156
170
|
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
171
|
+
```ruby
|
172
|
+
access = Bio::MAF::Access.maf_dir('test/data')
|
173
|
+
q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082471, 80082730)]
|
174
|
+
access.block_filter = { :with_all_species => %w(panTro2 loxAfr1) }
|
175
|
+
n_blocks = access.find(q).count
|
176
|
+
# => 1
|
177
|
+
```
|
162
178
|
|
163
179
|
#### Match only blocks with a certain number of sequences
|
164
180
|
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
181
|
+
```ruby
|
182
|
+
access = Bio::MAF::Access.maf_dir('test/data')
|
183
|
+
q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082767, 80083008)]
|
184
|
+
access.block_filter = { :at_least_n_sequences => 6 }
|
185
|
+
n_blocks = access.find(q).count
|
186
|
+
# => 1
|
187
|
+
```
|
170
188
|
|
171
189
|
#### Match only blocks within a text size range
|
172
190
|
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
191
|
+
```ruby
|
192
|
+
access = Bio::MAF::Access.maf_dir('test/data')
|
193
|
+
q = [Bio::GenomicInterval.zero_based('mm8.chr7', 0, 80100000)]
|
194
|
+
access.block_filter = { :min_size => 72, :max_size => 160 }
|
195
|
+
n_blocks = access.find(q).count
|
196
|
+
# => 3
|
197
|
+
```
|
178
198
|
|
179
199
|
### Process each block in a MAF file
|
180
200
|
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
201
|
+
```ruby
|
202
|
+
require 'bio-maf'
|
203
|
+
p = Bio::MAF::Parser.new('test/data/mm8_chr7_tiny.maf')
|
204
|
+
puts "MAF version: #{p.header.version}"
|
205
|
+
# => MAF version: 1
|
185
206
|
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
207
|
+
p.each_block do |block|
|
208
|
+
block.sequences.each do |seq|
|
209
|
+
do_something(seq)
|
210
|
+
end
|
211
|
+
end
|
212
|
+
```
|
191
213
|
|
192
214
|
### Parse empty ('e') lines
|
193
215
|
|
194
216
|
Refer to [`chr22_ieq.maf`](https://github.com/csw/bioruby-maf/blob/master/test/data/chr22_ieq.maf).
|
195
217
|
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
218
|
+
```ruby
|
219
|
+
require 'bio-maf'
|
220
|
+
p = Bio::MAF::Parser.new('test/data/chr22_ieq.maf',
|
221
|
+
:parse_empty => false)
|
222
|
+
block = p.parse_block
|
223
|
+
block.sequences.size
|
224
|
+
# => 3
|
225
|
+
|
226
|
+
p = Bio::MAF::Parser.new('test/data/chr22_ieq.maf',
|
227
|
+
:parse_empty => true)
|
228
|
+
block = p.parse_block
|
229
|
+
block.sequences.size
|
230
|
+
# => 4
|
231
|
+
block.sequences.find { |s| s.empty? }
|
232
|
+
# => #<Bio::MAF::EmptySequence:0x007fe1f39882d0
|
233
|
+
# @source="turTru1.scaffold_109008", @start=25049,
|
234
|
+
# @size=1601, @strand=:+, @src_size=50103, @text=nil,
|
235
|
+
# @status="I">
|
236
|
+
```
|
213
237
|
|
214
238
|
Such options can also be set on a Bio::MAF::Access object:
|
215
239
|
|
216
|
-
|
217
|
-
|
218
|
-
|
240
|
+
```ruby
|
241
|
+
require 'bio-maf'
|
242
|
+
access = Bio::MAF::Access.maf_dir('test/data')
|
243
|
+
access.parse_options[:parse_empty] = true
|
244
|
+
```
|
219
245
|
|
220
246
|
### Remove gaps from parsed blocks
|
221
247
|
|
@@ -225,9 +251,11 @@ gaps may be left where there was an insertion present only in
|
|
225
251
|
sequences that were filtered out. Such gaps can be removed by setting
|
226
252
|
the `:remove_gaps` parser option:
|
227
253
|
|
228
|
-
|
229
|
-
|
230
|
-
|
254
|
+
```ruby
|
255
|
+
require 'bio-maf'
|
256
|
+
access = Bio::MAF::Access.maf_dir('test/data')
|
257
|
+
access.parse_options[:remove_gaps] = true
|
258
|
+
```
|
231
259
|
|
232
260
|
### Join blocks after filtering together
|
233
261
|
|
@@ -235,9 +263,11 @@ Similarly, filtering out species may remove a species which had caused
|
|
235
263
|
two adjacent alignment blocks to be split. By enabling the
|
236
264
|
`:join_blocks` parser option, such blocks can be joined together:
|
237
265
|
|
238
|
-
|
239
|
-
|
240
|
-
|
266
|
+
```ruby
|
267
|
+
require 'bio-maf'
|
268
|
+
access = Bio::MAF::Access.maf_dir('test/data')
|
269
|
+
access.parse_options[:join_blocks] = true
|
270
|
+
```
|
241
271
|
|
242
272
|
See the [Cucumber feature][] for more details.
|
243
273
|
|
@@ -254,14 +284,16 @@ more.
|
|
254
284
|
[Bio::BioAlignment::Alignment]: http://rdoc.info/gems/bio-alignment/Bio/BioAlignment/Alignment
|
255
285
|
[bio-alignment]: https://github.com/pjotrp/bioruby-alignment
|
256
286
|
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
287
|
+
```ruby
|
288
|
+
require 'bio-maf'
|
289
|
+
access = Bio::MAF::Access.maf_dir('test/data')
|
290
|
+
access.parse_options[:as_bio_alignment] = true
|
291
|
+
q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
|
292
|
+
access.find(q) do |aln|
|
293
|
+
col = aln.columns[3]
|
294
|
+
puts "bases in column 3: #{col}"
|
295
|
+
end
|
296
|
+
```
|
265
297
|
|
266
298
|
### Tile blocks together over an interval
|
267
299
|
|
@@ -276,23 +308,25 @@ man page.
|
|
276
308
|
|
277
309
|
[feature]: https://github.com/csw/bioruby-maf/blob/master/features/tiling.feature
|
278
310
|
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
311
|
+
```ruby
|
312
|
+
require 'bio-maf'
|
313
|
+
access = Bio::MAF::Access.maf_dir('test/data')
|
314
|
+
interval = Bio::GenomicInterval.zero_based('mm8.chr7',
|
315
|
+
80082334,
|
316
|
+
80082468)
|
317
|
+
access.tile(interval) do |tiler|
|
318
|
+
# reference is optional
|
319
|
+
tiler.reference = 'reference.fa.gz'
|
320
|
+
tiler.species = %w(mm8 rn4 hg18)
|
321
|
+
# species_map is optional
|
322
|
+
tiler.species_map = {
|
323
|
+
'mm8' => 'mouse',
|
324
|
+
'rn4' => 'rat',
|
325
|
+
'hg18' => 'human'
|
326
|
+
}
|
327
|
+
tiler.write_fasta($stdout)
|
328
|
+
end
|
329
|
+
```
|
296
330
|
|
297
331
|
### Command line tools
|
298
332
|
|
data/bin/maf_count
CHANGED
data/bin/maf_dump_blocks
CHANGED
data/bin/maf_extract
ADDED
@@ -0,0 +1,177 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'bio-maf'
|
4
|
+
require 'optparse'
|
5
|
+
require 'ostruct'
|
6
|
+
|
7
|
+
include Bio::MAF
|
8
|
+
|
9
|
+
# Settings collected from the command line, pre-populated with defaults.
# Extraction defaults to :intersect mode with MAF output; the two filter
# hashes and the parser options start out empty.
options = OpenStruct.new(
  :mode          => :intersect,
  :format        => :maf,
  :seq_filter    => {},
  :block_filter  => {},
  :parse_options => {}
)
|
15
|
+
|
16
|
+
# Expand a list-valued option into an array of names.
#
# A value of the form "@FILE" reads the list from FILE, splitting its
# contents on whitespace. Any other value is treated as a comma-separated
# list given inline.
#
# @param spec [String] either "@FILE" or "name1,name2,..."
# @return [Array<String>] the list entries, in the order given
# @raise [SystemCallError] if "@FILE" names a file that cannot be read
def handle_list_spec(spec)
  if spec.start_with?('@') && spec.size > 1
    File.read(spec[1..-1]).split
  else
    # Strip each entry and drop blanks so that "hg18, mm8" and "hg18,,mm8"
    # do not produce names such as " mm8" or "" that can never match.
    spec.split(',').map { |entry| entry.strip }.reject { |entry| entry.empty? }
  end
end
|
23
|
+
|
24
|
+
# Parse a SEQ:START:END option value into a zero-based genomic interval.
#
# @param int [String] interval spec, e.g. "mm8.chr7:80082592:80082766"
# @return [Bio::GenomicInterval] the result of
#   Bio::GenomicInterval.zero_based(SEQ, START, END)
# @raise [OptionParser::InvalidArgument] unless the spec has exactly three
#   colon-separated fields, a non-empty sequence name, and START and END
#   written as non-negative decimal integers
def handle_interval_spec(int)
  # A limit of -1 keeps trailing empty fields, so "chr7:100:" is seen as
  # three fields (and rejected) rather than silently shortened to two.
  fields = int.split(':', -1)
  seq, first, last = fields
  well_formed = fields.size == 3 &&
                !seq.empty? &&
                [first, last].all? { |coord| coord =~ /\A\d+\z/ }
  unless well_formed
    # String#to_i would turn a missing or malformed coordinate into 0 and
    # quietly query the wrong region, so reject the value instead.
    raise OptionParser::InvalidArgument, "#{int} (expected SEQ:START:END)"
  end
  Bio::GenomicInterval.zero_based(seq, first.to_i, last.to_i)
end
|
28
|
+
|
29
|
+
# Build the command-line parser. It is stored in a global because the
# usage helper prints it as the help text.
$op = OptionParser.new do |o|
  o.banner = "Usage: maf_extract (-m MAF [-i INDEX] | -d MAFDIR) [options]"

  o.separator ""
  o.separator "MAF source options (either --maf or --maf-dir must be given):"
  o.on("-m", "--maf MAF", "MAF file") { |path| options.maf = path }
  o.on("-i", "--index INDEX", "MAF index") { |path| options.idx = path }
  o.on("-d", "--maf-dir DIR", "MAF directory") { |path| options.maf_dir = path }

  o.separator ""
  o.separator "Extraction options:"
  o.on("--mode MODE", [:intersect, :slice],
       "Extraction mode; 'intersect' to match ",
       "blocks intersecting the given region,",
       "or 'slice' to extract subsets covering ",
       "given regions") { |chosen| options.mode = chosen }
  o.on("--bed BED", "Use intervals from the given BED file") { |path| options.bed = path }
  o.on("--interval SEQ:START:END", "Zero-based genomic interval to match") do |text|
    options.interval = handle_interval_spec(text)
  end

  o.separator ""
  o.separator "Output options:"
  o.on("-f", "--format FMT", [:maf, :fasta], "Output format") { |chosen| options.format = chosen }
  o.on("-o", "--output OUT", "Write output to file OUT") { |path| options.out_path = path }

  o.separator ""
  o.separator "Filtering options:"
  o.on("--only-species SPECIES",
       "Filter out all but the species in the",
       "given comma-separated list",
       "(or @FILE to read from a file)") do |list|
    options.seq_filter[:only_species] = handle_list_spec(list)
  end
  o.on("--with-all-species SPECIES",
       "Only match blocks with all the given",
       "species, comma-separated",
       "(or @FILE to read from a file)") do |list|
    options.block_filter[:with_all_species] = handle_list_spec(list)
  end
  # Integer-valued block filters: [switch, block_filter key, help text].
  [
    ["--min-sequences N", :at_least_n_sequences, "Match only blocks with at least N sequences"],
    ["--min-text-size N", :min_size,             "Match only blocks with minimum text size N"],
    ["--max-text-size N", :max_size,             "Match only blocks with maximum text size N"]
  ].each do |switch, key, help|
    o.on(switch, Integer, help) { |count| options.block_filter[key] = count }
  end

  o.separator ""
  o.separator "Block processing options:"
  # Boolean parser switches: [switch, parse_options key, help lines].
  [
    ["--join-blocks",    :join_blocks,    ["Join blocks if appropriate after filtering", "out sequences"]],
    ["--remove-gaps",    :remove_gaps,    ["Remove gaps after filtering out sequences"]],
    ["--parse-extended", :parse_extended, ["Parse 'extended' MAF data (i, q lines)"]],
    ["--parse-empty",    :parse_empty,    ["Parse empty (e) lines of MAF data"]]
  ].each do |switch, key, help|
    o.on(switch, *help) { options.parse_options[key] = true }
  end

  o.separator ""
  o.separator "Logging options:"
  Bio::MAF.handle_logging_options(o)
end

# Consume the recognised switches from ARGV, then set up logging.
$op.parse!(ARGV)
Bio::Log::CLI.configure('bio-maf')
|
113
|
+
|
114
|
+
# Report a command-line usage error and terminate.
#
# Writes msg and then the option parser held in $op to standard error,
# and exits the process with status 2.
#
# @param msg [String] explanation of what was wrong with the invocation
def usage(msg)
  $stderr.puts(msg, $op)
  exit(2)
end
|
119
|
+
|
120
|
+
# Open the MAF source: a single file (with optional index) or a directory.
if options.maf
  access = Access.file(options.maf, options.idx, options.parse_options)
elsif options.maf_dir
  access = Access.maf_dir(options.maf_dir, options.parse_options)
else
  usage "Must supply --maf or --maf-dir!"
end

outf = nil
begin
  access.sequence_filter = options.seq_filter unless options.seq_filter.empty?
  access.block_filter = options.block_filter unless options.block_filter.empty?

  # Resolve the query intervals before touching the output path, so that a
  # usage error does not create or truncate the output file.
  # NOTE(review): read_bed_intervals is not defined in this script;
  # presumably it is supplied by the included Bio::MAF module - confirm.
  if options.bed
    intervals = read_bed_intervals(options.bed)
  elsif options.interval
    intervals = [options.interval]
  else
    usage "Must supply --interval or --bed!"
  end

  outf = options.out_path ? File.open(options.out_path, 'w') : $stdout

  case options.format
  when :maf
    writer = Writer.new(outf)
  when :fasta
    writer = FASTAWriter.new(outf)
  else
    # Interpolate options.format: a bare `format` here would call
    # Kernel#format with no arguments and raise an unrelated ArgumentError.
    raise "unsupported output format #{options.format}!"
  end

  # TODO: provide access to original MAF header?
  if options.format == :maf
    writer.write_header(Header.default)
  end

  case options.mode
  when :intersect
    access.find(intervals) do |block|
      writer.write_block(block)
    end
  when :slice
    # TODO: multiple files if intervals.size > 1?
    intervals.each do |interval|
      access.slice(interval) do |block|
        writer.write_block(block)
      end
    end
  else
    raise "Unsupported mode #{options.mode}!"
  end

ensure
  begin
    # Close only a file this script opened; $stdout is left alone.
    outf.close if outf && options.out_path
  ensure
    access.close
  end
end
|