bio-maf 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile CHANGED
@@ -13,6 +13,7 @@ group :development do
13
13
  gem "redcarpet", "~> 2.1.1", :platforms => :mri
14
14
  gem "ronn", "~> 0.7.3", :platforms => :mri
15
15
  gem "sinatra", "~> 1.3.2" # for ronn --server
16
+ gem "jruby-openssl", ">= 0.7", :platforms => :jruby
16
17
  end
17
18
 
18
19
  group :test do
data/README.md CHANGED
@@ -81,43 +81,53 @@ create one with [maf_index(1)][], like so:
81
81
 
82
82
 
83
83
  $ maf_index test/data/mm8_chr7_tiny.maf /tmp/mm8_chr7_tiny.kct
84
-
85
- Or programmatically:
86
84
 
87
- require 'bio-maf'
88
- parser = Bio::MAF::Parser.new("test/data/mm8_chr7_tiny.maf")
89
- idx = Bio::MAF::KyotoIndex.build(parser, "/tmp/mm8_chr7_tiny.kct")
85
+ To index all sequences for searching, not just the reference sequence:
86
+
87
+ $ maf_index --all test/data/mm8_chr7_tiny.maf /tmp/mm8_chr7_tiny.kct
88
+
89
+ To build an index programmatically:
90
+
91
+ ```ruby
92
+ require 'bio-maf'
93
+ parser = Bio::MAF::Parser.new("test/data/mm8_chr7_tiny.maf")
94
+ idx = Bio::MAF::KyotoIndex.build(parser, "/tmp/mm8_chr7_tiny.kct", false)
95
+ ```
90
96
 
91
97
  ### Extract blocks from an indexed MAF file, by genomic interval
92
98
 
93
99
  Refer to [`mm8_chr7_tiny.maf`](https://github.com/csw/bioruby-maf/blob/master/test/data/mm8_chr7_tiny.maf).
94
100
 
95
- require 'bio-maf'
96
- access = Bio::MAF::Access.maf_dir('test/data')
101
+ ```ruby
102
+ require 'bio-maf'
103
+ access = Bio::MAF::Access.maf_dir('test/data')
97
104
 
98
- q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
99
- access.find(q) do |block|
100
- ref_seq = block.sequences[0]
101
- puts "Matched block at #{ref_seq.start}, #{ref_seq.size} bases"
102
- end
105
+ q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
106
+ access.find(q) do |block|
107
+ ref_seq = block.sequences[0]
108
+ puts "Matched block at #{ref_seq.start}, #{ref_seq.size} bases"
109
+ end
103
110
 
104
- # => Matched block at 80082592, 121 bases
105
- # => Matched block at 80082713, 54 bases
111
+ # => Matched block at 80082592, 121 bases
112
+ # => Matched block at 80082713, 54 bases
113
+ ```
106
114
 
107
115
  Or, equivalently, one can work with a specific MAF file and index directly:
108
116
 
109
- require 'bio-maf'
110
- parser = Bio::MAF::Parser.new('test/data/mm8_chr7_tiny.maf')
111
- idx = Bio::MAF::KyotoIndex.open('test/data/mm8_chr7_tiny.kct')
117
+ ```ruby
118
+ require 'bio-maf'
119
+ parser = Bio::MAF::Parser.new('test/data/mm8_chr7_tiny.maf')
120
+ idx = Bio::MAF::KyotoIndex.open('test/data/mm8_chr7_tiny.kct')
112
121
 
113
- q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
114
- idx.find(q, parser).each do |block|
115
- ref_seq = block.sequences[0]
116
- puts "Matched block at #{ref_seq.start}, #{ref_seq.size} bases"
117
- end
122
+ q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
123
+ idx.find(q, parser).each do |block|
124
+ ref_seq = block.sequences[0]
125
+ puts "Matched block at #{ref_seq.start}, #{ref_seq.size} bases"
126
+ end
118
127
 
119
- # => Matched block at 80082592, 121 bases
120
- # => Matched block at 80082713, 54 bases
128
+ # => Matched block at 80082592, 121 bases
129
+ # => Matched block at 80082713, 54 bases
130
+ ```
121
131
 
122
132
  ### Extract alignment blocks truncated to a given interval
123
133
 
@@ -125,25 +135,29 @@ Given a genomic interval of interest, one can also extract only the
125
135
  subsets of blocks that intersect with that interval, using the
126
136
  `#slice` method like so:
127
137
 
128
- require 'bio-maf'
129
- access = Bio::MAF::Access.maf_dir('test/data')
130
- int = Bio::GenomicInterval.zero_based('mm8.chr7', 80082350, 80082380)
131
- blocks = access.slice(int).to_a
132
- puts "Got #{blocks.size} blocks, first #{blocks.first.ref_seq.size} base pairs."
133
- # => Got 2 blocks, first 18 base pairs.
138
+ ```ruby
139
+ require 'bio-maf'
140
+ access = Bio::MAF::Access.maf_dir('test/data')
141
+ int = Bio::GenomicInterval.zero_based('mm8.chr7', 80082350, 80082380)
142
+ blocks = access.slice(int).to_a
143
+ puts "Got #{blocks.size} blocks, first #{blocks.first.ref_seq.size} base pairs."
144
+ # => Got 2 blocks, first 18 base pairs.
145
+ ```
134
146
 
135
147
  ### Filter species returned in alignment blocks
136
148
 
137
- require 'bio-maf'
138
- access = Bio::MAF::Access.maf_dir('test/data')
149
+ ```ruby
150
+ require 'bio-maf'
151
+ access = Bio::MAF::Access.maf_dir('test/data')
139
152
 
140
- access.sequence_filter = { :only_species => %w(hg18 mm8 rheMac2) }
141
- q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
142
- blocks = access.find(q)
143
- block = blocks.first
144
- puts "Block has #{block.sequences.size} sequences."
153
+ access.sequence_filter = { :only_species => %w(hg18 mm8 rheMac2) }
154
+ q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
155
+ blocks = access.find(q)
156
+ block = blocks.first
157
+ puts "Block has #{block.sequences.size} sequences."
145
158
 
146
- # => Block has 3 sequences.
159
+ # => Block has 3 sequences.
160
+ ```
147
161
 
148
162
  ### Extract blocks matching certain conditions
149
163
 
@@ -154,68 +168,80 @@ See also the [Cucumber feature][] and [step definitions][] for this.
154
168
 
155
169
  #### Match only blocks with all specified species
156
170
 
157
- access = Bio::MAF::Access.maf_dir('test/data')
158
- q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082471, 80082730)]
159
- access.block_filter = { :with_all_species => %w(panTro2 loxAfr1) }
160
- n_blocks = access.find(q).count
161
- # => 1
171
+ ```ruby
172
+ access = Bio::MAF::Access.maf_dir('test/data')
173
+ q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082471, 80082730)]
174
+ access.block_filter = { :with_all_species => %w(panTro2 loxAfr1) }
175
+ n_blocks = access.find(q).count
176
+ # => 1
177
+ ```
162
178
 
163
179
  #### Match only blocks with a certain number of sequences
164
180
 
165
- access = Bio::MAF::Access.maf_dir('test/data')
166
- q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082767, 80083008)]
167
- access.block_filter = { :at_least_n_sequences => 6 }
168
- n_blocks = access.find(q).count
169
- # => 1
181
+ ```ruby
182
+ access = Bio::MAF::Access.maf_dir('test/data')
183
+ q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082767, 80083008)]
184
+ access.block_filter = { :at_least_n_sequences => 6 }
185
+ n_blocks = access.find(q).count
186
+ # => 1
187
+ ```
170
188
 
171
189
  #### Match only blocks within a text size range
172
190
 
173
- access = Bio::MAF::Access.maf_dir('test/data')
174
- q = [Bio::GenomicInterval.zero_based('mm8.chr7', 0, 80100000)]
175
- access.block_filter = { :min_size => 72, :max_size => 160 }
176
- n_blocks = access.find(q).count
177
- # => 3
191
+ ```ruby
192
+ access = Bio::MAF::Access.maf_dir('test/data')
193
+ q = [Bio::GenomicInterval.zero_based('mm8.chr7', 0, 80100000)]
194
+ access.block_filter = { :min_size => 72, :max_size => 160 }
195
+ n_blocks = access.find(q).count
196
+ # => 3
197
+ ```
178
198
 
179
199
  ### Process each block in a MAF file
180
200
 
181
- require 'bio-maf'
182
- p = Bio::MAF::Parser.new('test/data/mm8_chr7_tiny.maf')
183
- puts "MAF version: #{p.header.version}"
184
- # => MAF version: 1
201
+ ```ruby
202
+ require 'bio-maf'
203
+ p = Bio::MAF::Parser.new('test/data/mm8_chr7_tiny.maf')
204
+ puts "MAF version: #{p.header.version}"
205
+ # => MAF version: 1
185
206
 
186
- p.each_block do |block|
187
- block.sequences.each do |seq|
188
- do_something(seq)
189
- end
190
- end
207
+ p.each_block do |block|
208
+ block.sequences.each do |seq|
209
+ do_something(seq)
210
+ end
211
+ end
212
+ ```
191
213
 
192
214
  ### Parse empty ('e') lines
193
215
 
194
216
  Refer to [`chr22_ieq.maf`](https://github.com/csw/bioruby-maf/blob/master/test/data/chr22_ieq.maf).
195
217
 
196
- require 'bio-maf'
197
- p = Bio::MAF::Parser.new('test/data/chr22_ieq.maf',
198
- :parse_empty => false)
199
- block = p.parse_block
200
- block.sequences.size
201
- # => 3
202
-
203
- p = Bio::MAF::Parser.new('test/data/chr22_ieq.maf',
204
- :parse_empty => true)
205
- block = p.parse_block
206
- block.sequences.size
207
- # => 4
208
- block.sequences.find { |s| s.empty? }
209
- # => #<Bio::MAF::EmptySequence:0x007fe1f39882d0
210
- # @source="turTru1.scaffold_109008", @start=25049,
211
- # @size=1601, @strand=:+, @src_size=50103, @text=nil,
212
- # @status="I">
218
+ ```ruby
219
+ require 'bio-maf'
220
+ p = Bio::MAF::Parser.new('test/data/chr22_ieq.maf',
221
+ :parse_empty => false)
222
+ block = p.parse_block
223
+ block.sequences.size
224
+ # => 3
225
+
226
+ p = Bio::MAF::Parser.new('test/data/chr22_ieq.maf',
227
+ :parse_empty => true)
228
+ block = p.parse_block
229
+ block.sequences.size
230
+ # => 4
231
+ block.sequences.find { |s| s.empty? }
232
+ # => #<Bio::MAF::EmptySequence:0x007fe1f39882d0
233
+ # @source="turTru1.scaffold_109008", @start=25049,
234
+ # @size=1601, @strand=:+, @src_size=50103, @text=nil,
235
+ # @status="I">
236
+ ```
213
237
 
214
238
  Such options can also be set on a Bio::MAF::Access object:
215
239
 
216
- require 'bio-maf'
217
- access = Bio::MAF::Access.maf_dir('test/data')
218
- access.parse_options[:parse_empty] = true
240
+ ```ruby
241
+ require 'bio-maf'
242
+ access = Bio::MAF::Access.maf_dir('test/data')
243
+ access.parse_options[:parse_empty] = true
244
+ ```
219
245
 
220
246
  ### Remove gaps from parsed blocks
221
247
 
@@ -225,9 +251,11 @@ gaps may be left where there was an insertion present only in
225
251
  sequences that were filtered out. Such gaps can be removed by setting
226
252
  the `:remove_gaps` parser option:
227
253
 
228
- require 'bio-maf'
229
- access = Bio::MAF::Access.maf_dir('test/data')
230
- access.parse_options[:remove_gaps] = true
254
+ ```ruby
255
+ require 'bio-maf'
256
+ access = Bio::MAF::Access.maf_dir('test/data')
257
+ access.parse_options[:remove_gaps] = true
258
+ ```
231
259
 
232
260
  ### Join blocks after filtering together
233
261
 
@@ -235,9 +263,11 @@ Similarly, filtering out species may remove a species which had caused
235
263
  two adjacent alignment blocks to be split. By enabling the
236
264
  `:join_blocks` parser option, such blocks can be joined together:
237
265
 
238
- require 'bio-maf'
239
- access = Bio::MAF::Access.maf_dir('test/data')
240
- access.parse_options[:join_blocks] = true
266
+ ```ruby
267
+ require 'bio-maf'
268
+ access = Bio::MAF::Access.maf_dir('test/data')
269
+ access.parse_options[:join_blocks] = true
270
+ ```
241
271
 
242
272
  See the [Cucumber feature][] for more details.
243
273
 
@@ -254,14 +284,16 @@ more.
254
284
  [Bio::BioAlignment::Alignment]: http://rdoc.info/gems/bio-alignment/Bio/BioAlignment/Alignment
255
285
  [bio-alignment]: https://github.com/pjotrp/bioruby-alignment
256
286
 
257
- require 'bio-maf'
258
- access = Bio::MAF::Access.maf_dir('test/data')
259
- access.parse_options[:as_bio_alignment] = true
260
- q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
261
- access.find(q) do |aln|
262
- col = aln.columns[3]
263
- puts "bases in column 3: #{col}"
264
- end
287
+ ```ruby
288
+ require 'bio-maf'
289
+ access = Bio::MAF::Access.maf_dir('test/data')
290
+ access.parse_options[:as_bio_alignment] = true
291
+ q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
292
+ access.find(q) do |aln|
293
+ col = aln.columns[3]
294
+ puts "bases in column 3: #{col}"
295
+ end
296
+ ```
265
297
 
266
298
  ### Tile blocks together over an interval
267
299
 
@@ -276,23 +308,25 @@ man page.
276
308
 
277
309
  [feature]: https://github.com/csw/bioruby-maf/blob/master/features/tiling.feature
278
310
 
279
- require 'bio-maf'
280
- access = Bio::MAF::Access.maf_dir('test/data')
281
- interval = Bio::GenomicInterval.zero_based('mm8.chr7',
282
- 80082334,
283
- 80082468)
284
- access.tile(interval) do |tiler|
285
- # reference is optional
286
- tiler.reference = 'reference.fa.gz'
287
- tiler.species = %w(mm8 rn4 hg18)
288
- # species_map is optional
289
- tiler.species_map = {
290
- 'mm8' => 'mouse',
291
- 'rn4' => 'rat',
292
- 'hg18' => 'human'
293
- }
294
- tiler.write_fasta($stdout)
295
- end
311
+ ```ruby
312
+ require 'bio-maf'
313
+ access = Bio::MAF::Access.maf_dir('test/data')
314
+ interval = Bio::GenomicInterval.zero_based('mm8.chr7',
315
+ 80082334,
316
+ 80082468)
317
+ access.tile(interval) do |tiler|
318
+ # reference is optional
319
+ tiler.reference = 'reference.fa.gz'
320
+ tiler.species = %w(mm8 rn4 hg18)
321
+ # species_map is optional
322
+ tiler.species_map = {
323
+ 'mm8' => 'mouse',
324
+ 'rn4' => 'rat',
325
+ 'hg18' => 'human'
326
+ }
327
+ tiler.write_fasta($stdout)
328
+ end
329
+ ```
296
330
 
297
331
  ### Command line tools
298
332
 
data/bin/maf_count CHANGED
@@ -1,7 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require 'bio-maf'
4
- require 'bigbio'
5
4
  require 'optparse'
6
5
  require 'ostruct'
7
6
 
data/bin/maf_dump_blocks CHANGED
@@ -1,7 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require 'bio-maf'
4
- require 'bigbio'
5
4
  require 'optparse'
6
5
  require 'ostruct'
7
6
 
data/bin/maf_extract ADDED
@@ -0,0 +1,177 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bio-maf'
4
+ require 'optparse'
5
+ require 'ostruct'
6
+
7
+ include Bio::MAF
8
+
9
# Default settings for the run. The option handlers registered on the
# parser overwrite :mode and :format and add entries to the three hashes.
# Each hash is a distinct object so that filters and parse options never
# share state.
options = OpenStruct.new(
  :mode          => :intersect,
  :format        => :maf,
  :seq_filter    => {},
  :block_filter  => {},
  :parse_options => {}
)
15
+
16
# Expand a list-valued command-line argument.
#
# @param spec [String] either a comma-separated list ("hg18,mm8") or
#   "@PATH", naming a file whose whitespace-separated words are the entries
# @return [Array<String>] the entries; for the comma-separated form,
#   surrounding whitespace is stripped and empty items are dropped
# @raise [SystemCallError] if an "@PATH" file cannot be read
def handle_list_spec(spec)
  # \A anchors at the start of the whole argument (^ would also match
  # after an embedded newline); explicit match data avoids the $1 global.
  file_ref = /\A@(.+)/.match(spec)
  if file_ref
    File.read(file_ref[1]).split
  else
    # "hg18, mm8" must yield "mm8", not " mm8", or it can never match.
    spec.split(',').map(&:strip).reject(&:empty?)
  end
end
23
+
24
# Parse a "SEQ:START:END" argument into a zero-based genomic interval.
#
# @param int [String] e.g. "mm8.chr7:80082592:80082766"
# @return the result of Bio::GenomicInterval.zero_based(SEQ, START, END)
# @raise [OptionParser::InvalidArgument] unless the argument has exactly
#   three colon-separated fields, a non-empty SEQ, and decimal integer
#   START and END
def handle_interval_spec(int)
  seq, start_text, end_text, *extra = int.split(':')
  if seq.nil? || seq.empty? || end_text.nil? || !extra.empty?
    raise OptionParser::InvalidArgument, "expected SEQ:START:END, got '#{int}'"
  end
  begin
    # Integer() rejects non-numeric text, which String#to_i would silently
    # read as 0; base 10 keeps a leading zero from meaning octal.
    start_pos = Integer(start_text, 10)
    end_pos = Integer(end_text, 10)
  rescue ArgumentError
    raise OptionParser::InvalidArgument,
          "START and END must be integers, got '#{int}'"
  end
  Bio::GenomicInterval.zero_based(seq, start_pos, end_pos)
end
28
+
29
# Command-line definition for maf_extract. The parser is kept in the
# global $op so that the usage helper can print the option summary.
# Each handler records its value on the `options` struct.
$op = OptionParser.new do |opts|
  opts.banner = "Usage: maf_extract (-m MAF [-i INDEX] | -d MAFDIR) [options]"

  opts.separator ""
  opts.separator "MAF source options (either --maf or --maf-dir must be given):"
  opts.on("-m", "--maf MAF", "MAF file") { |maf| options.maf = maf }
  opts.on("-i", "--index INDEX", "MAF index") { |idx| options.idx = idx }
  opts.on("-d", "--maf-dir DIR", "MAF directory") { |dir| options.maf_dir = dir }

  opts.separator ""
  opts.separator "Extraction options:"
  # The array restricts accepted values; OptionParser rejects anything else.
  opts.on("--mode MODE", [:intersect, :slice],
          "Extraction mode; 'intersect' to match ",
          "blocks intersecting the given region,",
          "or 'slice' to extract subsets covering ",
          "given regions") do |mode|
    options.mode = mode
  end
  opts.on("--bed BED", "Use intervals from the given BED file") do |bed|
    options.bed = bed
  end
  opts.on("--interval SEQ:START:END", "Zero-based genomic interval to match") do |int|
    options.interval = handle_interval_spec(int)
  end

  opts.separator ""
  opts.separator "Output options:"
  opts.on("-f", "--format FMT", [:maf, :fasta], "Output format") do |fmt|
    options.format = fmt
  end
  opts.on("-o", "--output OUT", "Write output to file OUT") do |out|
    options.out_path = out
  end

  opts.separator ""
  opts.separator "Filtering options:"
  opts.on("--only-species SPECIES",
          "Filter out all but the species in the",
          "given comma-separated list",
          "(or @FILE to read from a file)") do |spec|
    options.seq_filter[:only_species] = handle_list_spec(spec)
  end
  opts.on("--with-all-species SPECIES",
          "Only match blocks with all the given",
          "species, comma-separated",
          "(or @FILE to read from a file)") do |spec|
    options.block_filter[:with_all_species] = handle_list_spec(spec)
  end
  opts.on("--min-sequences N", Integer,
          "Match only blocks with at least N sequences") do |n|
    options.block_filter[:at_least_n_sequences] = n
  end
  opts.on("--min-text-size N", Integer,
          "Match only blocks with minimum text size N") do |n|
    options.block_filter[:min_size] = n
  end
  opts.on("--max-text-size N", Integer,
          "Match only blocks with maximum text size N") do |n|
    options.block_filter[:max_size] = n
  end

  opts.separator ""
  opts.separator "Block processing options:"
  opts.on("--join-blocks",
          "Join blocks if appropriate after filtering",
          "out sequences") do
    options.parse_options[:join_blocks] = true
  end
  opts.on("--remove-gaps", "Remove gaps after filtering out sequences") do
    options.parse_options[:remove_gaps] = true
  end
  opts.on("--parse-extended", "Parse 'extended' MAF data (i, q lines)") do
    options.parse_options[:parse_extended] = true
  end
  opts.on("--parse-empty", "Parse empty (e) lines of MAF data") do
    options.parse_options[:parse_empty] = true
  end

  opts.separator ""
  opts.separator "Logging options:"
  Bio::MAF::handle_logging_options(opts)
end

# parse! consumes the recognised switches from ARGV in place.
$op.parse!(ARGV)
Bio::Log::CLI.configure('bio-maf')
113
+
114
# Report a command-line usage error and terminate the script.
#
# Writes +msg+, then the option summary held in the global $op, to
# standard error and exits with status 2. Does not return.
#
# @param msg [String] explanation of what was wrong with the invocation
def usage(msg)
  $stderr.puts(msg, $op)
  exit 2
end
119
+
120
# Open the MAF source selected on the command line; a single file (with
# its optional index) takes precedence over a MAF directory.
access =
  if options.maf
    Access.file(options.maf, options.idx, options.parse_options)
  elsif options.maf_dir
    Access.maf_dir(options.maf_dir, options.parse_options)
  else
    usage "Must supply --maf or --maf-dir!"
  end

outf = nil
begin
  access.sequence_filter = options.seq_filter unless options.seq_filter.empty?
  access.block_filter = options.block_filter unless options.block_filter.empty?

  # Resolve the query intervals before touching the output file, so that
  # a usage error does not create or truncate it.
  if options.bed
    intervals = read_bed_intervals(options.bed)
  elsif options.interval
    intervals = [options.interval]
  else
    usage "Must supply --interval or --bed!"
  end

  outf = options.out_path ? File.open(options.out_path, 'w') : $stdout

  case options.format
  when :maf
    writer = Writer.new(outf)
  when :fasta
    writer = FASTAWriter.new(outf)
  else
    # options.format, not a bare `format`: the latter is Kernel#format,
    # which raises ArgumentError when called without arguments.
    raise "unsupported output format #{options.format}!"
  end

  # TODO: provide access to original MAF header?
  if options.format == :maf
    writer.write_header(Header.default)
  end

  case options.mode
  when :intersect
    access.find(intervals) do |block|
      writer.write_block(block)
    end
  when :slice
    # TODO: multiple files if intervals.size > 1?
    intervals.each do |interval|
      access.slice(interval) do |block|
        writer.write_block(block)
      end
    end
  else
    raise "Unsupported mode #{options.mode}!"
  end

ensure
  begin
    # Close only a file this script opened; $stdout is left alone.
    outf.close if outf && options.out_path
  ensure
    access.close
  end
end