bio-maf 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile CHANGED
@@ -13,6 +13,7 @@ group :development do
13
13
  gem "redcarpet", "~> 2.1.1", :platforms => :mri
14
14
  gem "ronn", "~> 0.7.3", :platforms => :mri
15
15
  gem "sinatra", "~> 1.3.2" # for ronn --server
16
+ gem "jruby-openssl", ">= 0.7", :platforms => :jruby
16
17
  end
17
18
 
18
19
  group :test do
data/README.md CHANGED
@@ -81,43 +81,53 @@ create one with [maf_index(1)][], like so:
81
81
 
82
82
 
83
83
  $ maf_index test/data/mm8_chr7_tiny.maf /tmp/mm8_chr7_tiny.kct
84
-
85
- Or programmatically:
86
84
 
87
- require 'bio-maf'
88
- parser = Bio::MAF::Parser.new("test/data/mm8_chr7_tiny.maf")
89
- idx = Bio::MAF::KyotoIndex.build(parser, "/tmp/mm8_chr7_tiny.kct")
85
+ To index all sequences for searching, not just the reference sequence:
86
+
87
+ $ maf_index --all test/data/mm8_chr7_tiny.maf /tmp/mm8_chr7_tiny.kct
88
+
89
+ To build an index programmatically:
90
+
91
+ ```ruby
92
+ require 'bio-maf'
93
+ parser = Bio::MAF::Parser.new("test/data/mm8_chr7_tiny.maf")
94
+ idx = Bio::MAF::KyotoIndex.build(parser, "/tmp/mm8_chr7_tiny.kct", false)
95
+ ```
90
96
 
91
97
  ### Extract blocks from an indexed MAF file, by genomic interval
92
98
 
93
99
  Refer to [`mm8_chr7_tiny.maf`](https://github.com/csw/bioruby-maf/blob/master/test/data/mm8_chr7_tiny.maf).
94
100
 
95
- require 'bio-maf'
96
- access = Bio::MAF::Access.maf_dir('test/data')
101
+ ```ruby
102
+ require 'bio-maf'
103
+ access = Bio::MAF::Access.maf_dir('test/data')
97
104
 
98
- q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
99
- access.find(q) do |block|
100
- ref_seq = block.sequences[0]
101
- puts "Matched block at #{ref_seq.start}, #{ref_seq.size} bases"
102
- end
105
+ q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
106
+ access.find(q) do |block|
107
+ ref_seq = block.sequences[0]
108
+ puts "Matched block at #{ref_seq.start}, #{ref_seq.size} bases"
109
+ end
103
110
 
104
- # => Matched block at 80082592, 121 bases
105
- # => Matched block at 80082713, 54 bases
111
+ # => Matched block at 80082592, 121 bases
112
+ # => Matched block at 80082713, 54 bases
113
+ ```
106
114
 
107
115
  Or, equivalently, one can work with a specific MAF file and index directly:
108
116
 
109
- require 'bio-maf'
110
- parser = Bio::MAF::Parser.new('test/data/mm8_chr7_tiny.maf')
111
- idx = Bio::MAF::KyotoIndex.open('test/data/mm8_chr7_tiny.kct')
117
+ ```ruby
118
+ require 'bio-maf'
119
+ parser = Bio::MAF::Parser.new('test/data/mm8_chr7_tiny.maf')
120
+ idx = Bio::MAF::KyotoIndex.open('test/data/mm8_chr7_tiny.kct')
112
121
 
113
- q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
114
- idx.find(q, parser).each do |block|
115
- ref_seq = block.sequences[0]
116
- puts "Matched block at #{ref_seq.start}, #{ref_seq.size} bases"
117
- end
122
+ q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
123
+ idx.find(q, parser).each do |block|
124
+ ref_seq = block.sequences[0]
125
+ puts "Matched block at #{ref_seq.start}, #{ref_seq.size} bases"
126
+ end
118
127
 
119
- # => Matched block at 80082592, 121 bases
120
- # => Matched block at 80082713, 54 bases
128
+ # => Matched block at 80082592, 121 bases
129
+ # => Matched block at 80082713, 54 bases
130
+ ```
121
131
 
122
132
  ### Extract alignment blocks truncated to a given interval
123
133
 
@@ -125,25 +135,29 @@ Given a genomic interval of interest, one can also extract only the
125
135
  subsets of blocks that intersect with that interval, using the
126
136
  `#slice` method like so:
127
137
 
128
- require 'bio-maf'
129
- access = Bio::MAF::Access.maf_dir('test/data')
130
- int = Bio::GenomicInterval.zero_based('mm8.chr7', 80082350, 80082380)
131
- blocks = access.slice(int).to_a
132
- puts "Got #{blocks.size} blocks, first #{blocks.first.ref_seq.size} base pairs."
133
- # => Got 2 blocks, first 18 base pairs.
138
+ ```ruby
139
+ require 'bio-maf'
140
+ access = Bio::MAF::Access.maf_dir('test/data')
141
+ int = Bio::GenomicInterval.zero_based('mm8.chr7', 80082350, 80082380)
142
+ blocks = access.slice(int).to_a
143
+ puts "Got #{blocks.size} blocks, first #{blocks.first.ref_seq.size} base pairs."
144
+ # => Got 2 blocks, first 18 base pairs.
145
+ ```
134
146
 
135
147
  ### Filter species returned in alignment blocks
136
148
 
137
- require 'bio-maf'
138
- access = Bio::MAF::Access.maf_dir('test/data')
149
+ ```ruby
150
+ require 'bio-maf'
151
+ access = Bio::MAF::Access.maf_dir('test/data')
139
152
 
140
- access.sequence_filter = { :only_species => %w(hg18 mm8 rheMac2) }
141
- q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
142
- blocks = access.find(q)
143
- block = blocks.first
144
- puts "Block has #{block.sequences.size} sequences."
153
+ access.sequence_filter = { :only_species => %w(hg18 mm8 rheMac2) }
154
+ q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
155
+ blocks = access.find(q)
156
+ block = blocks.first
157
+ puts "Block has #{block.sequences.size} sequences."
145
158
 
146
- # => Block has 3 sequences.
159
+ # => Block has 3 sequences.
160
+ ```
147
161
 
148
162
  ### Extract blocks matching certain conditions
149
163
 
@@ -154,68 +168,80 @@ See also the [Cucumber feature][] and [step definitions][] for this.
154
168
 
155
169
  #### Match only blocks with all specified species
156
170
 
157
- access = Bio::MAF::Access.maf_dir('test/data')
158
- q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082471, 80082730)]
159
- access.block_filter = { :with_all_species => %w(panTro2 loxAfr1) }
160
- n_blocks = access.find(q).count
161
- # => 1
171
+ ```ruby
172
+ access = Bio::MAF::Access.maf_dir('test/data')
173
+ q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082471, 80082730)]
174
+ access.block_filter = { :with_all_species => %w(panTro2 loxAfr1) }
175
+ n_blocks = access.find(q).count
176
+ # => 1
177
+ ```
162
178
 
163
179
  #### Match only blocks with a certain number of sequences
164
180
 
165
- access = Bio::MAF::Access.maf_dir('test/data')
166
- q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082767, 80083008)]
167
- access.block_filter = { :at_least_n_sequences => 6 }
168
- n_blocks = access.find(q).count
169
- # => 1
181
+ ```ruby
182
+ access = Bio::MAF::Access.maf_dir('test/data')
183
+ q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082767, 80083008)]
184
+ access.block_filter = { :at_least_n_sequences => 6 }
185
+ n_blocks = access.find(q).count
186
+ # => 1
187
+ ```
170
188
 
171
189
  #### Match only blocks within a text size range
172
190
 
173
- access = Bio::MAF::Access.maf_dir('test/data')
174
- q = [Bio::GenomicInterval.zero_based('mm8.chr7', 0, 80100000)]
175
- access.block_filter = { :min_size => 72, :max_size => 160 }
176
- n_blocks = access.find(q).count
177
- # => 3
191
+ ```ruby
192
+ access = Bio::MAF::Access.maf_dir('test/data')
193
+ q = [Bio::GenomicInterval.zero_based('mm8.chr7', 0, 80100000)]
194
+ access.block_filter = { :min_size => 72, :max_size => 160 }
195
+ n_blocks = access.find(q).count
196
+ # => 3
197
+ ```
178
198
 
179
199
  ### Process each block in a MAF file
180
200
 
181
- require 'bio-maf'
182
- p = Bio::MAF::Parser.new('test/data/mm8_chr7_tiny.maf')
183
- puts "MAF version: #{p.header.version}"
184
- # => MAF version: 1
201
+ ```ruby
202
+ require 'bio-maf'
203
+ p = Bio::MAF::Parser.new('test/data/mm8_chr7_tiny.maf')
204
+ puts "MAF version: #{p.header.version}"
205
+ # => MAF version: 1
185
206
 
186
- p.each_block do |block|
187
- block.sequences.each do |seq|
188
- do_something(seq)
189
- end
190
- end
207
+ p.each_block do |block|
208
+ block.sequences.each do |seq|
209
+ do_something(seq)
210
+ end
211
+ end
212
+ ```
191
213
 
192
214
  ### Parse empty ('e') lines
193
215
 
194
216
  Refer to [`chr22_ieq.maf`](https://github.com/csw/bioruby-maf/blob/master/test/data/chr22_ieq.maf).
195
217
 
196
- require 'bio-maf'
197
- p = Bio::MAF::Parser.new('test/data/chr22_ieq.maf',
198
- :parse_empty => false)
199
- block = p.parse_block
200
- block.sequences.size
201
- # => 3
202
-
203
- p = Bio::MAF::Parser.new('test/data/chr22_ieq.maf',
204
- :parse_empty => true)
205
- block = p.parse_block
206
- block.sequences.size
207
- # => 4
208
- block.sequences.find { |s| s.empty? }
209
- # => #<Bio::MAF::EmptySequence:0x007fe1f39882d0
210
- # @source="turTru1.scaffold_109008", @start=25049,
211
- # @size=1601, @strand=:+, @src_size=50103, @text=nil,
212
- # @status="I">
218
+ ```ruby
219
+ require 'bio-maf'
220
+ p = Bio::MAF::Parser.new('test/data/chr22_ieq.maf',
221
+ :parse_empty => false)
222
+ block = p.parse_block
223
+ block.sequences.size
224
+ # => 3
225
+
226
+ p = Bio::MAF::Parser.new('test/data/chr22_ieq.maf',
227
+ :parse_empty => true)
228
+ block = p.parse_block
229
+ block.sequences.size
230
+ # => 4
231
+ block.sequences.find { |s| s.empty? }
232
+ # => #<Bio::MAF::EmptySequence:0x007fe1f39882d0
233
+ # @source="turTru1.scaffold_109008", @start=25049,
234
+ # @size=1601, @strand=:+, @src_size=50103, @text=nil,
235
+ # @status="I">
236
+ ```
213
237
 
214
238
  Such options can also be set on a Bio::MAF::Access object:
215
239
 
216
- require 'bio-maf'
217
- access = Bio::MAF::Access.maf_dir('test/data')
218
- access.parse_options[:parse_empty] = true
240
+ ```ruby
241
+ require 'bio-maf'
242
+ access = Bio::MAF::Access.maf_dir('test/data')
243
+ access.parse_options[:parse_empty] = true
244
+ ```
219
245
 
220
246
  ### Remove gaps from parsed blocks
221
247
 
@@ -225,9 +251,11 @@ gaps may be left where there was an insertion present only in
225
251
  sequences that were filtered out. Such gaps can be removed by setting
226
252
  the `:remove_gaps` parser option:
227
253
 
228
- require 'bio-maf'
229
- access = Bio::MAF::Access.maf_dir('test/data')
230
- access.parse_options[:remove_gaps] = true
254
+ ```ruby
255
+ require 'bio-maf'
256
+ access = Bio::MAF::Access.maf_dir('test/data')
257
+ access.parse_options[:remove_gaps] = true
258
+ ```
231
259
 
232
260
  ### Join blocks after filtering together
233
261
 
@@ -235,9 +263,11 @@ Similarly, filtering out species may remove a species which had caused
235
263
  two adjacent alignment blocks to be split. By enabling the
236
264
  `:join_blocks` parser option, such blocks can be joined together:
237
265
 
238
- require 'bio-maf'
239
- access = Bio::MAF::Access.maf_dir('test/data')
240
- access.parse_options[:join_blocks] = true
266
+ ```ruby
267
+ require 'bio-maf'
268
+ access = Bio::MAF::Access.maf_dir('test/data')
269
+ access.parse_options[:join_blocks] = true
270
+ ```
241
271
 
242
272
  See the [Cucumber feature][] for more details.
243
273
 
@@ -254,14 +284,16 @@ more.
254
284
  [Bio::BioAlignment::Alignment]: http://rdoc.info/gems/bio-alignment/Bio/BioAlignment/Alignment
255
285
  [bio-alignment]: https://github.com/pjotrp/bioruby-alignment
256
286
 
257
- require 'bio-maf'
258
- access = Bio::MAF::Access.maf_dir('test/data')
259
- access.parse_options[:as_bio_alignment] = true
260
- q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
261
- access.find(q) do |aln|
262
- col = aln.columns[3]
263
- puts "bases in column 3: #{col}"
264
- end
287
+ ```ruby
288
+ require 'bio-maf'
289
+ access = Bio::MAF::Access.maf_dir('test/data')
290
+ access.parse_options[:as_bio_alignment] = true
291
+ q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
292
+ access.find(q) do |aln|
293
+ col = aln.columns[3]
294
+ puts "bases in column 3: #{col}"
295
+ end
296
+ ```
265
297
 
266
298
  ### Tile blocks together over an interval
267
299
 
@@ -276,23 +308,25 @@ man page.
276
308
 
277
309
  [feature]: https://github.com/csw/bioruby-maf/blob/master/features/tiling.feature
278
310
 
279
- require 'bio-maf'
280
- access = Bio::MAF::Access.maf_dir('test/data')
281
- interval = Bio::GenomicInterval.zero_based('mm8.chr7',
282
- 80082334,
283
- 80082468)
284
- access.tile(interval) do |tiler|
285
- # reference is optional
286
- tiler.reference = 'reference.fa.gz'
287
- tiler.species = %w(mm8 rn4 hg18)
288
- # species_map is optional
289
- tiler.species_map = {
290
- 'mm8' => 'mouse',
291
- 'rn4' => 'rat',
292
- 'hg18' => 'human'
293
- }
294
- tiler.write_fasta($stdout)
295
- end
311
+ ```ruby
312
+ require 'bio-maf'
313
+ access = Bio::MAF::Access.maf_dir('test/data')
314
+ interval = Bio::GenomicInterval.zero_based('mm8.chr7',
315
+ 80082334,
316
+ 80082468)
317
+ access.tile(interval) do |tiler|
318
+ # reference is optional
319
+ tiler.reference = 'reference.fa.gz'
320
+ tiler.species = %w(mm8 rn4 hg18)
321
+ # species_map is optional
322
+ tiler.species_map = {
323
+ 'mm8' => 'mouse',
324
+ 'rn4' => 'rat',
325
+ 'hg18' => 'human'
326
+ }
327
+ tiler.write_fasta($stdout)
328
+ end
329
+ ```
296
330
 
297
331
  ### Command line tools
298
332
 
data/bin/maf_count CHANGED
@@ -1,7 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require 'bio-maf'
4
- require 'bigbio'
5
4
  require 'optparse'
6
5
  require 'ostruct'
7
6
 
data/bin/maf_dump_blocks CHANGED
@@ -1,7 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require 'bio-maf'
4
- require 'bigbio'
5
4
  require 'optparse'
6
5
  require 'ostruct'
7
6
 
data/bin/maf_extract ADDED
@@ -0,0 +1,177 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bio-maf'
4
+ require 'optparse'
5
+ require 'ostruct'
6
+
7
+ include Bio::MAF
8
+
9
# Command-line settings, pre-populated with their defaults. The option
# parser callbacks overwrite or fill in these fields.
options = OpenStruct.new(
  :mode          => :intersect,
  :format        => :maf,
  :seq_filter    => {},
  :block_filter  => {},
  :parse_options => {}
)
15
+
16
# Expand a list-valued option into an array of names.
#
# @param spec [String] either "@PATH", naming a file whose contents are
#   split on whitespace, or a comma-separated list given inline
# @return [Array<String>] the entries; inline entries have surrounding
#   whitespace removed and blank entries dropped
def handle_list_spec(spec)
  # \A rather than ^: only an "@" at the very start of the argument
  # selects file mode.
  if (file_ref = /\A@(.+)/.match(spec))
    File.read(file_ref[1]).split
  else
    # "hg18, mm8" must yield "mm8", not " mm8", or the name never matches.
    spec.split(',').map(&:strip).reject(&:empty?)
  end
end
23
+
24
# Parse a SEQ:START:END option value into a zero-based genomic interval.
#
# @param int [String] e.g. "mm8.chr7:80082592:80082766"
# @return [Bio::GenomicInterval] whatever Bio::GenomicInterval.zero_based
#   returns for the parsed sequence name, start and end
# @raise [OptionParser::InvalidArgument] unless the value consists of a
#   non-empty sequence name followed by two decimal integers
def handle_interval_spec(int)
  # limit -1 keeps trailing empty fields, so "chr7:1:" is seen as malformed
  fields = int.split(':', -1)
  raise OptionParser::InvalidArgument, int unless fields.size == 3 && !fields[0].empty?

  begin
    # Integer(…, 10) rejects non-numeric text instead of silently reading it
    # as 0 the way String#to_i does, and keeps "010" decimal.
    first = Integer(fields[1], 10)
    last = Integer(fields[2], 10)
  rescue ArgumentError
    raise OptionParser::InvalidArgument, int
  end

  Bio::GenomicInterval.zero_based(fields[0], first, last)
end
28
+
29
# Command-line interface definition. The parser is stored in the global
# $op so that the option summary can be printed when reporting a usage
# error. Each callback records its value in `options`.
$op = OptionParser.new do |o|
  o.banner = "Usage: maf_extract (-m MAF [-i INDEX] | -d MAFDIR) [options]"

  o.separator ""
  o.separator "MAF source options (either --maf or --maf-dir must be given):"
  o.on("-m", "--maf MAF", "MAF file") { |path| options.maf = path }
  o.on("-i", "--index INDEX", "MAF index") { |path| options.idx = path }
  o.on("-d", "--maf-dir DIR", "MAF directory") { |path| options.maf_dir = path }

  o.separator ""
  o.separator "Extraction options:"
  o.on("--mode MODE", [:intersect, :slice],
       "Extraction mode; 'intersect' to match ",
       "blocks intersecting the given region,",
       "or 'slice' to extract subsets covering ",
       "given regions") { |chosen| options.mode = chosen }
  o.on("--bed BED", "Use intervals from the given BED file") { |path| options.bed = path }
  o.on("--interval SEQ:START:END", "Zero-based genomic interval to match") do |text|
    options.interval = handle_interval_spec(text)
  end

  o.separator ""
  o.separator "Output options:"
  o.on("-f", "--format FMT", [:maf, :fasta], "Output format") { |chosen| options.format = chosen }
  o.on("-o", "--output OUT", "Write output to file OUT") { |path| options.out_path = path }

  o.separator ""
  o.separator "Filtering options:"
  o.on("--only-species SPECIES",
       "Filter out all but the species in the",
       "given comma-separated list",
       "(or @FILE to read from a file)") do |list|
    options.seq_filter[:only_species] = handle_list_spec(list)
  end
  o.on("--with-all-species SPECIES",
       "Only match blocks with all the given",
       "species, comma-separated",
       "(or @FILE to read from a file)") do |list|
    options.block_filter[:with_all_species] = handle_list_spec(list)
  end
  o.on("--min-sequences N", Integer,
       "Match only blocks with at least N sequences") do |count|
    options.block_filter[:at_least_n_sequences] = count
  end
  o.on("--min-text-size N", Integer,
       "Match only blocks with minimum text size N") do |size|
    options.block_filter[:min_size] = size
  end
  o.on("--max-text-size N", Integer,
       "Match only blocks with maximum text size N") do |size|
    options.block_filter[:max_size] = size
  end

  o.separator ""
  o.separator "Block processing options:"
  o.on("--join-blocks",
       "Join blocks if appropriate after filtering",
       "out sequences") { options.parse_options[:join_blocks] = true }
  o.on("--remove-gaps", "Remove gaps after filtering out sequences") do
    options.parse_options[:remove_gaps] = true
  end
  o.on("--parse-extended", "Parse 'extended' MAF data (i, q lines)") do
    options.parse_options[:parse_extended] = true
  end
  o.on("--parse-empty", "Parse empty (e) lines of MAF data") do
    options.parse_options[:parse_empty] = true
  end

  o.separator ""
  o.separator "Logging options:"
  Bio::MAF.handle_logging_options(o)
end

# Consume the recognised switches from ARGV, then set up logging.
$op.parse!(ARGV)
Bio::Log::CLI.configure('bio-maf')
113
+
114
# Report a command-line error: print +msg+ followed by the option summary
# held in $op to standard error, then terminate with exit status 2.
def usage(msg)
  $stderr.puts(msg)
  $stderr.puts($op)
  exit(2)
end
119
+
120
# Open the MAF source. A single file (optionally with an explicit index)
# is checked first, then a MAF directory; with neither, report a usage
# error and exit.
if options.maf
  access = Access.file(options.maf, options.idx, options.parse_options)
elsif options.maf_dir
  access = Access.maf_dir(options.maf_dir, options.parse_options)
else
  usage "Must supply --maf or --maf-dir!"
end

begin
  access.sequence_filter = options.seq_filter unless options.seq_filter.empty?
  access.block_filter = options.block_filter unless options.block_filter.empty?

  # Resolve the query intervals before touching the output path, so that a
  # usage error does not create or truncate the output file.
  if options.bed
    intervals = read_bed_intervals(options.bed)
  elsif options.interval
    intervals = [options.interval]
  else
    usage "Must supply --interval or --bed!"
  end

  outf = options.out_path ? File.open(options.out_path, 'w') : $stdout

  case options.format
  when :maf
    writer = Writer.new(outf)
  when :fasta
    writer = FASTAWriter.new(outf)
  else
    # Interpolate the selected format; a bare `format` here would call
    # Kernel#format with no arguments and raise ArgumentError instead.
    raise "unsupported output format #{options.format}!"
  end

  # TODO: provide access to original MAF header?
  if options.format == :maf
    writer.write_header(Header.default)
  end

  case options.mode
  when :intersect
    # Emit every block found for the given intervals.
    access.find(intervals) do |block|
      writer.write_block(block)
    end
  when :slice
    # Emit the blocks yielded by slicing each interval in turn.
    # TODO: multiple files if intervals.size > 1?
    intervals.each do |interval|
      access.slice(interval) do |block|
        writer.write_block(block)
      end
    end
  else
    raise "Unsupported mode #{options.mode}!"
  end

ensure
  # Close the output only when this script opened it; never close $stdout.
  outf.close if options.out_path && outf
  access.close
end