bio-maf 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. data/.gitignore +1 -0
  2. data/Gemfile +2 -1
  3. data/README.md +98 -29
  4. data/Rakefile +6 -2
  5. data/bin/maf_tile +59 -35
  6. data/bio-maf.gemspec +4 -3
  7. data/features/block-joining.feature +32 -0
  8. data/features/dir-access.feature +46 -0
  9. data/features/maf-indexing.feature +23 -0
  10. data/features/maf-to-fasta.feature +9 -0
  11. data/features/slice.feature +54 -0
  12. data/features/step_definitions/dir-access_steps.rb +15 -0
  13. data/features/step_definitions/file_steps.rb +7 -0
  14. data/features/step_definitions/gap_removal_steps.rb +4 -0
  15. data/features/step_definitions/index_steps.rb +3 -3
  16. data/features/step_definitions/output_steps.rb +9 -1
  17. data/features/step_definitions/parse_steps.rb +13 -2
  18. data/features/step_definitions/query_steps.rb +7 -6
  19. data/features/step_definitions/slice_steps.rb +15 -0
  20. data/features/step_definitions/{gap-filling_steps.rb → tiling_steps.rb} +0 -0
  21. data/features/support/aruba.rb +1 -0
  22. data/features/support/env.rb +3 -1
  23. data/features/{gap-filling.feature → tiling.feature} +85 -0
  24. data/lib/bio/maf/index.rb +223 -11
  25. data/lib/bio/maf/maf.rb +209 -0
  26. data/lib/bio/maf/parser.rb +190 -111
  27. data/lib/bio/maf/tiler.rb +33 -6
  28. data/man/maf_index.1 +1 -1
  29. data/man/maf_tile.1 +7 -7
  30. data/man/maf_tile.1.ronn +21 -13
  31. data/man/maf_to_fasta.1 +1 -1
  32. data/spec/bio/maf/index_spec.rb +99 -0
  33. data/spec/bio/maf/maf_spec.rb +184 -0
  34. data/spec/bio/maf/parser_spec.rb +75 -115
  35. data/spec/bio/maf/tiler_spec.rb +44 -0
  36. data/test/data/chr22_ieq2.maf +11 -0
  37. data/test/data/gap-1.kct +0 -0
  38. data/test/data/gap-1.maf +9 -0
  39. data/test/data/gap-filled1.fa +6 -0
  40. data/test/data/gap-sp1.fa.gz +0 -0
  41. data/test/data/mm8_chr7_tiny_slice1.maf +9 -0
  42. data/test/data/mm8_chr7_tiny_slice2.maf +10 -0
  43. data/test/data/mm8_chr7_tiny_slice3.maf +10 -0
  44. data/test/data/mm8_chrM_tiny.kct +0 -0
  45. data/test/data/mm8_chrM_tiny.maf +1000 -0
  46. metadata +59 -7
@@ -29,3 +29,26 @@ Feature: Indexed access to MAF files
29
29
  Then 2 blocks are obtained
30
30
  And sequence mm8.chr7 of block 0 has start 80082592
31
31
  And sequence mm8.chr7 of block 1 has start 80082713
32
+
33
+ @no_jruby
34
+ Scenario: Build MAF index with CLI tool
35
+ Given test files:
36
+ | mm8_chr7_tiny.maf |
37
+ When I run `maf_index mm8_chr7_tiny.maf mm8_chr7_tiny.kct`
38
+ Then it should pass with:
39
+ """
40
+ """
41
+ And a file named "mm8_chr7_tiny.kct" should exist
42
+
43
+ @no_jruby
44
+ Scenario: Dump MAF index with CLI tool
45
+ Given test files:
46
+ | mm8_chr7_tiny.maf |
47
+ | mm8_chr7_tiny.kct |
48
+ When I run `maf_index -d mm8_chr7_tiny.kct`
49
+ Then it should pass with regex:
50
+ """
51
+ 0 \[bin 1195\] 80082334:80082368
52
+ """
53
+
54
+
@@ -48,3 +48,12 @@ Feature: Convert MAF file to FASTA
48
48
 
49
49
  """
50
50
 
51
+ @no_jruby
52
+ Scenario: Convert MAF to FASTA with CLI tool
53
+ Given test files:
54
+ | mm8_chr7_tiny.maf |
55
+ When I run `maf_to_fasta mm8_chr7_tiny.maf mm8_chr7_tiny.fa`
56
+ Then it should pass with:
57
+ """
58
+ """
59
+ And the file "mm8_chr7_tiny.fa" should contain ">rn4.chr1:136011785-136011819"
@@ -0,0 +1,54 @@
1
+ Feature: MAF slicing
2
+ In order to obtain just the alignment data covering a given region
3
+ I want to be able to take slices of alignment blocks over
4
+ A given interval
5
+
6
+ Scenario: Interval covering two blocks
7
+ Given a MAF source file "mm8_chr7_tiny.maf"
8
+ And a Kyoto Cabinet index file "mm8_chr7_tiny.kct"
9
+ When I open it with a MAF reader
10
+ And I enable the :remove_gaps parser option
11
+ And open a new MAF writer
12
+ And write the header from the original MAF file
13
+ And filter for only the species
14
+ | mm8 |
15
+ | rn4 |
16
+ And search for blocks between positions 80082350 and 80082380 of mm8.chr7
17
+ And slice the resulting blocks according to the given interval
18
+ And write all the matched blocks
19
+ Then the output should match, except whitespace, "mm8_chr7_tiny_slice1.maf"
20
+
21
+ Scenario: Interval covering two blocks, using directory access
22
+ Given indexed MAF files in "test/data"
23
+ When I enable the :remove_gaps parser option
24
+ And open a new MAF writer
25
+ And write a default header
26
+ And filter for only the species
27
+ | mm8 |
28
+ | rn4 |
29
+ And I extract a slice over the genomic interval
30
+ | chrom | start | end |
31
+ | mm8.chr7 | 80082350 | 80082380 |
32
+ And write all the matched blocks
33
+ Then the output should match, except whitespace, "mm8_chr7_tiny_slice1.maf"
34
+
35
+ Scenario: Interval in block subset
36
+ Given indexed MAF files in "test/data"
37
+ When I open a new MAF writer
38
+ And write a default header
39
+ And I extract a slice over the genomic interval
40
+ | chrom | start | end |
41
+ | mm8.chr7 | 80082718 | 80082728 |
42
+ And write all the matched blocks
43
+ Then the output should match, except whitespace, "mm8_chr7_tiny_slice2.maf"
44
+
45
+ Scenario: Interval to end of block
46
+ Given indexed MAF files in "test/data"
47
+ When I open a new MAF writer
48
+ And write a default header
49
+ And I extract a slice over the genomic interval
50
+ | chrom | start | end |
51
+ | mm8.chr7 | 80082757 | 80082767 |
52
+ And write all the matched blocks
53
+ Then the output should match, except whitespace, "mm8_chr7_tiny_slice3.maf"
54
+
@@ -0,0 +1,15 @@
1
+ Given /^indexed MAF files in "(.*?)"$/ do |dir|
2
+ @opts ||= {}
3
+ @access = Bio::MAF::Access.maf_dir(dir, @opts)
4
+ end
5
+
6
+ When /^I query for the genomic intervals$/ do |table|
7
+ # table is a Cucumber::Ast::Table
8
+ intervals = table.hashes.collect do |row|
9
+ Bio::GenomicInterval.zero_based(row['chrom'],
10
+ row['start'].to_i,
11
+ row['end'].to_i)
12
+ end
13
+ @access.block_filter = @block_filter
14
+ @blocks = @access.find(intervals).to_a
15
+ end
@@ -0,0 +1,7 @@
1
+ Given /^test files:$/ do |table|
2
+ Pathname.new("tmp/aruba").mkpath
3
+ table.raw.collect { |row| $test_data + row[0] }.each do |path|
4
+ $stderr.puts "staging #{path}"
5
+ system("cp #{path} tmp/aruba/")
6
+ end
7
+ end
@@ -17,3 +17,7 @@ end
17
17
  Then /^the text size of the block is (\d+)$/ do |e_text_size|
18
18
  @block.text_size.should == e_text_size.to_i
19
19
  end
20
+
21
+ Then /^the text size of block (\d+) is (\d+)$/ do |n, e_text_size|
22
+ @blocks[n.to_i].text_size.should == e_text_size.to_i
23
+ end
@@ -11,10 +11,10 @@ Then /^the index has at least (\d+) entries$/ do |size_spec|
11
11
  end
12
12
 
13
13
  When /^search for blocks between positions (\d+) and (\d+) of (\S+)$/ do |i_start, i_end, chr|
14
- int = Bio::GenomicInterval.zero_based(chr, i_start.to_i, i_end.to_i)
15
- @blocks = @idx.find([int], @parser, @block_filter).to_a
14
+ @interval = Bio::GenomicInterval.zero_based(chr, i_start.to_i, i_end.to_i)
15
+ @blocks = @idx.find([@interval], @parser, @block_filter).to_a
16
16
  end
17
17
 
18
18
  Then /^(\d+) blocks? (?:is|are) obtained$/ do |num|
19
- @blocks.size.should == num.to_i
19
+ @blocks.count.should == num.to_i
20
20
  end
@@ -1,4 +1,4 @@
1
- When /^open a new MAF writer$/ do
1
+ When /^(?:I )?open a new MAF writer$/ do
2
2
  @dst = Tempfile.new(["cuke", ".maf"])
3
3
  @writer = Bio::MAF::Writer.new(@dst)
4
4
  end
@@ -7,10 +7,18 @@ When /^write the header from the original MAF file$/ do
7
7
  @writer.write_header(@parser.header)
8
8
  end
9
9
 
10
+ When /^write a default header$/ do
11
+ @writer.write_header(Bio::MAF::Header.default)
12
+ end
13
+
10
14
  When /^write all the parsed blocks$/ do
11
15
  @writer.write_blocks(@parser.parse_blocks)
12
16
  end
13
17
 
18
+ When /^write all the matched blocks$/ do
19
+ @writer.write_blocks(@blocks)
20
+ end
21
+
14
22
  RSpec::Matchers.define :match_except_ws do |expected|
15
23
  match do |actual|
16
24
  system("diff --ignore-space-change --brief #{expected} #{actual} >/dev/null 2>&1")
@@ -4,8 +4,15 @@ When /^I open it with a MAF reader$/ do
4
4
  end
5
5
 
6
6
  When /^I enable the :(\S+) parser option$/ do |opt_s|
7
- @opts ||= {}
8
- @opts[opt_s.to_sym] = true
7
+ if @parser
8
+ opts = @parser.opts
9
+ elsif @access
10
+ opts = @access.parse_options
11
+ else
12
+ @opts ||= {}
13
+ opts = @opts
14
+ end
15
+ opts[opt_s.to_sym] = true
9
16
  end
10
17
 
11
18
  Then /^the MAF version should be "(.*?)"$/ do |v_spec|
@@ -29,6 +36,10 @@ Then /^the alignment block has (\d+) sequences$/ do |n_seq|
29
36
  @block.sequences.size.should == n_seq.to_i
30
37
  end
31
38
 
39
+ Then /^block (\d+) has (\d+) sequences$/ do |block_n, n_seq|
40
+ @blocks[block_n.to_i].sequences.size.should == n_seq.to_i
41
+ end
42
+
32
43
  Then /^sequence (\d+) has (\w.*?) "(.*?)"$/ do |i, method, str|
33
44
  method_sym = method.gsub(/ /, '_').to_sym
34
45
  @block.raw_seq(i.to_i).send(method_sym).should == str
@@ -1,20 +1,21 @@
1
- When /^filter for only the species$/ do |table|
1
+ When /^(?:I )?filter for only the species$/ do |table|
2
2
  # table is a Cucumber::Ast::Table
3
3
  sp = table.raw.collect { |row| row[0] }
4
- @parser.sequence_filter = { :only_species => sp }
4
+ thing = @access || @parser
5
+ thing.sequence_filter = { :only_species => sp }
5
6
  end
6
7
 
7
- When /^filter for blocks with the species$/ do |table|
8
+ When /^(?:I )?filter for blocks with the species$/ do |table|
8
9
  # table is a Cucumber::Ast::Table
9
10
  sp = table.raw.collect { |row| row[0] }
10
11
  @block_filter = { :with_all_species => sp }
11
12
  end
12
13
 
13
- When /^filter for blocks with at least (\d+) sequences$/ do |n|
14
+ When /^(?:I )?filter for blocks with at least (\d+) sequences$/ do |n|
14
15
  @block_filter = { :at_least_n_sequences => n.to_i }
15
16
  end
16
17
 
17
- When /^filter for blocks with text size at (least|most) (\d+)$/ do |op, len|
18
+ When /^(?:I )?filter for blocks with text size at (least|most) (\d+)$/ do |op, len|
18
19
  constraint = case op
19
20
  when 'least' then :min_size
20
21
  when 'most' then :max_size
@@ -23,7 +24,7 @@ When /^filter for blocks with text size at (least|most) (\d+)$/ do |op, len|
23
24
  @block_filter = { constraint => len.to_i}
24
25
  end
25
26
 
26
- When /^filter for blocks with text size between (\d+) and (\d+)$/ do |min, max|
27
+ When /^(?:I )?filter for blocks with text size between (\d+) and (\d+)$/ do |min, max|
27
28
  @block_filter = {
28
29
  :min_size => min.to_i,
29
30
  :max_size => max.to_i
@@ -0,0 +1,15 @@
1
+ When /^slice the resulting blocks according to the given interval$/ do
2
+ # @blocks and @interval
3
+ @blocks = @blocks.collect { |b| b.slice(@interval) }
4
+ end
5
+
6
+ When /^I extract a slice over the genomic interval$/ do |table|
7
+ # table is a Cucumber::Ast::Table
8
+ intervals = table.hashes.collect do |row|
9
+ Bio::GenomicInterval.zero_based(row['chrom'],
10
+ row['start'].to_i,
11
+ row['end'].to_i)
12
+ end
13
+ intervals.size.should == 1
14
+ @blocks = @access.slice(intervals[0])
15
+ end
@@ -0,0 +1 @@
1
+ require 'aruba/cucumber'
@@ -9,7 +9,9 @@ end
9
9
  require 'pathname'
10
10
  require 'tempfile'
11
11
 
12
- $LOAD_PATH << File.expand_path('../../../lib', __FILE__)
12
+ lib_dir = File.expand_path('../../../lib', __FILE__)
13
+ $LOAD_PATH << lib_dir
14
+ ENV['RUBYLIB'] = lib_dir
13
15
 
14
16
  require 'bio-maf'
15
17
 
@@ -154,5 +154,90 @@ Feature: Join alignment blocks with reference data
154
154
  **********AGGTTTAGGG******************************
155
155
  """
156
156
 
157
+ @no_jruby
158
+ Scenario: Tile with CLI tool and reference seq
159
+ Given test files:
160
+ | gap-sp1.fa.gz |
161
+ | gap-1.maf |
162
+ | gap-1.kct |
163
+ When I run `maf_tile --reference gap-sp1.fa.gz --interval 0:50 -s sp1:mouse -s sp2:nautilus -s sp3:jaguar gap-1.maf gap-1.kct`
164
+ Then it should pass with:
165
+ """
166
+ >mouse
167
+ CCAGGATGCTGGGCTGAGGGC--AGTTGTGTCAGGGCGGTCCGGTGCAGGCA
168
+ >nautilus
169
+ **********GGGCTGACGGC--AG*******AGGGCGGTGC**********
170
+ >jaguar
171
+ **********AGGTTTAGGGCAGAG***************************
172
+ """
173
+
174
+ @no_jruby
175
+ Scenario: Tile with CLI tool and no reference seq
176
+ Given test files:
177
+ | gap-1.maf |
178
+ | gap-1.kct |
179
+ When I run `maf_tile --interval 0:50 -s sp1:mouse -s sp2:nautilus -s sp3:jaguar gap-1.maf gap-1.kct`
180
+ Then it should pass with:
181
+ """
182
+ >mouse
183
+ NNNNNNNNNNGGGCTGAGGGC--AGNNNNNNNAGGGCGGTCCNNNNNNNNNN
184
+ >nautilus
185
+ **********GGGCTGACGGC--AG*******AGGGCGGTGC**********
186
+ >jaguar
187
+ **********AGGTTTAGGGCAGAG***************************
188
+ """
189
+
190
+ @no_jruby
191
+ Scenario: Tile with CLI tool and BED intervals
192
+ Given test files:
193
+ | gap-1.maf |
194
+ | gap-1.kct |
195
+ | gap-sp1.fa.gz |
196
+ And a file named "example.bed" with:
197
+ """
198
+ sp1.chr1 12 36
199
+ """
200
+ When I run `maf_tile -s sp1:mouse -s sp2:nautilus -s sp3:jaguar --output-base selected --bed example.bed --reference gap-sp1.fa.gz gap-1.maf gap-1.kct`
201
+ Then the file "selected_12-36.fa" should contain exactly:
202
+ """
203
+ >mouse
204
+ GCTGAGGGC--AGTTGTGTCAGGGCG
205
+ >nautilus
206
+ GCTGACGGC--AG*******AGGGCG
207
+ >jaguar
208
+ GTTTAGGGCAGAG*************
209
+
210
+ """
157
211
 
212
+ @no_jruby
213
+ Scenario: Tile with CLI tool and implicit index
214
+ Given test files:
215
+ | mm8_chr7_tiny.maf |
216
+ | mm8_chr7_tiny.kct |
217
+ When I run `maf_tile -s mm8 -s rn4 -s hg18 --interval 80082334:80082344 mm8_chr7_tiny.maf`
218
+ Then it should pass with:
219
+ """
220
+ >mm8
221
+ GGGCTGAGGG
222
+ >rn4
223
+ GGGCTGAGGG
224
+ >hg18
225
+ --------GG
226
+ """
227
+
228
+ @no_jruby
229
+ Scenario: Tile with CLI tool and directory
230
+ Given test files:
231
+ | mm8_chr7_tiny.maf |
232
+ | mm8_chr7_tiny.kct |
233
+ When I run `maf_tile -s mm8 -s rn4 -s hg18 --interval mm8.chr7:80082334:80082344 .`
234
+ Then it should pass with:
235
+ """
236
+ >mm8
237
+ GGGCTGAGGG
238
+ >rn4
239
+ GGGCTGAGGG
240
+ >hg18
241
+ --------GG
242
+ """
158
243
 
data/lib/bio/maf/index.rb CHANGED
@@ -61,6 +61,204 @@ module Bio
61
61
  end
62
62
  end
63
63
 
64
+ # Top-level class for working with a set of indexed MAF
65
+ # files. Provides a higher-level alternative to working with
66
+ # {Parser} and {KyotoIndex} objects directly.
67
+ #
68
+ # Instantiate with {Access.maf_dir} and {Access.file} methods.
69
+ class Access
70
+
71
+ # Parser options.
72
+ # @return [Hash]
73
+ # @see Parser
74
+ attr_accessor :parse_options
75
+ # Sequence filter to apply.
76
+ # @return [Hash]
77
+ # @see Parser#sequence_filter
78
+ attr_accessor :sequence_filter
79
+ # Block filter to apply.
80
+ # @return [Hash]
81
+ # @see KyotoIndex#find
82
+ attr_accessor :block_filter
83
+ attr_reader :indices
84
+
85
+ # Provides access to a directory of indexed MAF files. Any files
86
+ # with .maf suffixes and accompanying .kct indexes in the given
87
+ # directory will be accessible.
88
+ # @param [String] dir directory to scan
89
+ # @param [Hash] options parser options
90
+ # @return [Access]
91
+ def self.maf_dir(dir, options={})
92
+ o = options.dup
93
+ o[:dir] = dir
94
+ self.new(o)
95
+ end
96
+
97
+ # Provides access to a single MAF file. If this file is not
98
+ # indexed, it will be fully parsed to create a temporary
99
+ # in-memory index. For large MAF files or ones which will be
100
+ # used multiple times, this is inefficient, and an index file
101
+ # should be created with maf_index(1).
102
+ #
103
+ # @param [String] maf path to MAF file
104
+ # @param [String] index Kyoto Cabinet index file
105
+ # @param [Hash] options parser options
106
+ # @return [Access]
107
+ def self.file(maf, index=nil, options={})
108
+ o = options.dup
109
+ o[:maf] = maf
110
+ o[:index] = index if index
111
+ self.new(o)
112
+ end
113
+
114
+ # Close all open resources, in particular Kyoto Cabinet database
115
+ # handles.
116
+ def close
117
+ @indices.values.each { |ki| ki.close }
118
+ end
119
+
120
+ # Find all alignment blocks in the genomic regions in the list
121
+ # of Bio::GenomicInterval objects, and parse them with the given
122
+ # parser.
123
+ #
124
+ # @param [Enumerable<Bio::GenomicInterval>] intervals genomic
125
+ # intervals to parse.
126
+ # @yield [block] each {Block} matched, in turn
127
+ # @return [Enumerable<Block>] each matching {Block}, if no block given
128
+ # @api public
129
+ # @see KyotoIndex#find
130
+ def find(intervals, &blk)
131
+ if block_given?
132
+ by_chrom = intervals.group_by { |i| i.chrom }
133
+ by_chrom.keys.each do |chrom|
134
+ unless @indices.has_key? chrom
135
+ raise "No index available for chromosome #{chrom}!"
136
+ end
137
+ end
138
+ by_chrom.each do |chrom, c_intervals|
139
+ index = @indices[chrom]
140
+ with_parser(chrom) do |parser|
141
+ index.find(c_intervals, parser, block_filter, &blk)
142
+ end
143
+ end
144
+ else
145
+ enum_for(:find, intervals)
146
+ end
147
+ end
148
+
149
+ # Find and parse all alignment blocks in the genomic region
150
+ # given by a Bio::GenomicInterval, and combine them to
151
+ # synthesize a single alignment covering that interval
152
+ # exactly.
153
+ #
154
+ # @param [Bio::GenomicInterval] interval interval to search
155
+ # @yield [tiler] a {Tiler} ready to operate on the given interval
156
+ # @api public
157
+ def tile(interval)
158
+ index = chrom_index(interval.chrom)
159
+ with_parser(interval.chrom) do |parser|
160
+ tiler = Tiler.new
161
+ tiler.index = index
162
+ tiler.parser = parser
163
+ tiler.interval = interval
164
+ yield tiler
165
+ end
166
+ end
167
+
168
+ # Find and parse all alignment blocks in the genomic region
169
+ # given by a Bio::GenomicInterval, and truncate them to just the
170
+ # region intersecting that interval.
171
+ #
172
+ # @param [Bio::GenomicInterval] interval interval to search
173
+ # @yield [block] each {Block} matched, in turn
174
+ # @return [Enumerable<Block>] each matching {Block}, if no block given
175
+ # @api public
176
+ # @see KyotoIndex#slice
177
+ def slice(interval, &blk)
178
+ index = chrom_index(interval.chrom)
179
+ with_parser(interval.chrom) do |parser|
180
+ index.slice(interval, parser, &blk)
181
+ end
182
+ end
183
+
184
+ #### Internals
185
+
186
+ # @api private
187
+ def initialize(options)
188
+ @parse_options = options
189
+ @indices = {}
190
+ @maf_by_chrom = {}
191
+ if options[:dir]
192
+ @dir = options[:dir]
193
+ @maf_files = Dir.glob("#{@dir}/*.maf")
194
+ elsif options[:maf]
195
+ @maf_files = [options[:maf]]
196
+ if options[:index]
197
+ register_index(KyotoIndex.open(options[:index]),
198
+ options[:maf])
199
+ end
200
+ else
201
+ raise "Must specify :dir or :maf!"
202
+ end
203
+ scan_indices!
204
+ if options[:maf] && @indices.empty?
205
+ # MAF file explicitly given but no index
206
+ # build a temporary one
207
+ # (could build a real one, too...)
208
+ maf = options[:maf]
209
+ parser = Parser.new(maf, @parse_options)
210
+ # $stderr.puts "WARNING: building temporary index on #{maf}."
211
+ index = KyotoIndex.build(parser, '%')
212
+ register_index(index, maf)
213
+ end
214
+ end
215
+
216
+ # @api private
217
+ def find_index_file(maf)
218
+ base = File.basename(maf, '.maf')
219
+ index_f = "#{@dir}/#{base}.kct"
220
+ File.exists?(index_f) ? index_f : nil
221
+ end
222
+
223
+ # @api private
224
+ def register_index(index, maf)
225
+ @indices[index.ref_seq] = index
226
+ @maf_by_chrom[index.ref_seq] = maf
227
+ end
228
+
229
+ # @api private
230
+ def scan_indices!
231
+ @maf_files.each do |maf|
232
+ index_f = find_index_file(maf)
233
+ if index_f
234
+ index = KyotoIndex.open(index_f)
235
+ register_index(index, maf)
236
+ end
237
+ end
238
+ end
239
+
240
+ # @api private
241
+ def chrom_index(chrom)
242
+ unless @indices.has_key? chrom
243
+ raise "No index available for chromosome #{chrom}!"
244
+ end
245
+ @indices[chrom]
246
+ end
247
+
248
+ # @api private
249
+ def with_parser(chrom)
250
+ # $stderr.puts "Creating parser with options #{@parse_options.inspect}"
251
+ parser = Parser.new(@maf_by_chrom[chrom], @parse_options)
252
+ parser.sequence_filter = self.sequence_filter
253
+ begin
254
+ yield parser
255
+ ensure
256
+ parser.close
257
+ end
258
+ end
259
+
260
+ end
261
+
64
262
  class KyotoIndex
65
263
  include KVHelpers
66
264
 
@@ -189,15 +387,22 @@ module Bio
189
387
  # @param [Parser] parser MAF parser for file to fetch blocks
190
388
  # from.
191
389
  # @param [Hash] filter Block filter expression.
192
- # @return [Array<Block>]
390
+ # @yield [block] each {Block} matched, in turn
391
+ # @return [Enumerable<Block>] each matching {Block}, if no block given
193
392
  # @api public
194
- def find(intervals, parser, filter={})
195
- start = Time.now
393
+ def find(intervals, parser, filter={}, &blk)
394
+ # start = Time.now
196
395
  fl = fetch_list(intervals, filter)
197
- $stderr.printf("Built fetch list of %d items in %.3fs.\n",
198
- fl.size,
199
- Time.now - start)
200
- parser.fetch_blocks(fl)
396
+ # $stderr.printf("Built fetch list of %d items in %.3fs.\n",
397
+ # fl.size,
398
+ # Time.now - start)
399
+ if ! fl.empty?
400
+ parser.fetch_blocks(fl, &blk)
401
+ else
402
+ if ! block_given?
403
+ []
404
+ end
405
+ end
201
406
  end
202
407
 
203
408
  # Close the underlying Kyoto Cabinet database handle.
@@ -205,6 +410,16 @@ module Bio
205
410
  db.close
206
411
  end
207
412
 
413
+ def slice(interval, parser, filter={})
414
+ if block_given?
415
+ find([interval], parser, filter) do |block|
416
+ yield block.slice(interval)
417
+ end
418
+ else
419
+ enum_for(:slice, interval, parser, filter)
420
+ end
421
+ end
422
+
208
423
  #### KyotoIndex Internals
209
424
  # @api private
210
425
 
@@ -288,7 +503,6 @@ module Bio
288
503
  # Build a fetch list of alignment blocks to read, given an array
289
504
  # of Bio::GenomicInterval objects
290
505
  def fetch_list(intervals, filter_spec={})
291
- start = Time.now
292
506
  filter_spec ||= {}
293
507
  filters = Filters.build(filter_spec, self)
294
508
  chrom = intervals.first.chrom
@@ -309,9 +523,7 @@ module Bio
309
523
  bin_intervals.values.each do |intervals|
310
524
  intervals.sort_by! {|i| i.begin}
311
525
  end
312
- ready = Time.now
313
- $stderr.puts "bin intervals computed after #{ready - start} seconds."
314
- matches = if RUBY_PLATFORM == 'java'
526
+ matches = if RUBY_PLATFORM == 'java' && bin_intervals.size > 4
315
527
  scan_bins_parallel(chrom_id, bin_intervals, filters)
316
528
  else
317
529
  scan_bins(chrom_id, bin_intervals, filters)