bio-maf 0.2.0-java → 0.3.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. data/.gitignore +1 -0
  2. data/Gemfile +3 -1
  3. data/README.md +98 -29
  4. data/Rakefile +6 -2
  5. data/bin/maf_tile +59 -35
  6. data/bio-maf.gemspec +4 -3
  7. data/features/block-joining.feature +32 -0
  8. data/features/dir-access.feature +46 -0
  9. data/features/maf-indexing.feature +23 -0
  10. data/features/maf-to-fasta.feature +9 -0
  11. data/features/slice.feature +54 -0
  12. data/features/step_definitions/dir-access_steps.rb +15 -0
  13. data/features/step_definitions/file_steps.rb +7 -0
  14. data/features/step_definitions/gap_removal_steps.rb +4 -0
  15. data/features/step_definitions/index_steps.rb +3 -3
  16. data/features/step_definitions/output_steps.rb +9 -1
  17. data/features/step_definitions/parse_steps.rb +13 -2
  18. data/features/step_definitions/query_steps.rb +7 -6
  19. data/features/step_definitions/slice_steps.rb +15 -0
  20. data/features/step_definitions/{gap-filling_steps.rb → tiling_steps.rb} +0 -0
  21. data/features/support/aruba.rb +1 -0
  22. data/features/support/env.rb +3 -1
  23. data/features/{gap-filling.feature → tiling.feature} +85 -0
  24. data/lib/bio/maf/index.rb +223 -11
  25. data/lib/bio/maf/maf.rb +209 -0
  26. data/lib/bio/maf/parser.rb +190 -111
  27. data/lib/bio/maf/tiler.rb +33 -6
  28. data/man/maf_index.1 +1 -1
  29. data/man/maf_tile.1 +7 -7
  30. data/man/maf_tile.1.ronn +21 -13
  31. data/man/maf_to_fasta.1 +1 -1
  32. data/spec/bio/maf/index_spec.rb +99 -0
  33. data/spec/bio/maf/maf_spec.rb +184 -0
  34. data/spec/bio/maf/parser_spec.rb +75 -115
  35. data/spec/bio/maf/tiler_spec.rb +44 -0
  36. data/test/data/chr22_ieq2.maf +11 -0
  37. data/test/data/gap-1.kct +0 -0
  38. data/test/data/gap-1.maf +9 -0
  39. data/test/data/gap-filled1.fa +6 -0
  40. data/test/data/gap-sp1.fa.gz +0 -0
  41. data/test/data/mm8_chr7_tiny_slice1.maf +9 -0
  42. data/test/data/mm8_chr7_tiny_slice2.maf +10 -0
  43. data/test/data/mm8_chr7_tiny_slice3.maf +10 -0
  44. data/test/data/mm8_chrM_tiny.kct +0 -0
  45. data/test/data/mm8_chrM_tiny.maf +1000 -0
  46. metadata +65 -16
@@ -29,3 +29,26 @@ Feature: Indexed access to MAF files
29
29
  Then 2 blocks are obtained
30
30
  And sequence mm8.chr7 of block 0 has start 80082592
31
31
  And sequence mm8.chr7 of block 1 has start 80082713
32
+
33
+ @no_jruby
34
+ Scenario: Build MAF index with CLI tool
35
+ Given test files:
36
+ | mm8_chr7_tiny.maf |
37
+ When I run `maf_index mm8_chr7_tiny.maf mm8_chr7_tiny.kct`
38
+ Then it should pass with:
39
+ """
40
+ """
41
+ And a file named "mm8_chr7_tiny.kct" should exist
42
+
43
+ @no_jruby
44
+ Scenario: Dump MAF index with CLI tool
45
+ Given test files:
46
+ | mm8_chr7_tiny.maf |
47
+ | mm8_chr7_tiny.kct |
48
+ When I run `maf_index -d mm8_chr7_tiny.kct`
49
+ Then it should pass with regex:
50
+ """
51
+ 0 \[bin 1195\] 80082334:80082368
52
+ """
53
+
54
+
@@ -48,3 +48,12 @@ Feature: Convert MAF file to FASTA
48
48
 
49
49
  """
50
50
 
51
+ @no_jruby
52
+ Scenario: Convert MAF to FASTA with CLI tool
53
+ Given test files:
54
+ | mm8_chr7_tiny.maf |
55
+ When I run `maf_to_fasta mm8_chr7_tiny.maf mm8_chr7_tiny.fa`
56
+ Then it should pass with:
57
+ """
58
+ """
59
+ And the file "mm8_chr7_tiny.fa" should contain ">rn4.chr1:136011785-136011819"
@@ -0,0 +1,54 @@
1
+ Feature: MAF slicing
2
+ In order to obtain just the alignment data covering a given region
3
+ I want to be able to take slices of alignment blocks over
4
+ a given interval
5
+
6
+ Scenario: Interval covering two blocks
7
+ Given a MAF source file "mm8_chr7_tiny.maf"
8
+ And a Kyoto Cabinet index file "mm8_chr7_tiny.kct"
9
+ When I open it with a MAF reader
10
+ And I enable the :remove_gaps parser option
11
+ And open a new MAF writer
12
+ And write the header from the original MAF file
13
+ And filter for only the species
14
+ | mm8 |
15
+ | rn4 |
16
+ And search for blocks between positions 80082350 and 80082380 of mm8.chr7
17
+ And slice the resulting blocks according to the given interval
18
+ And write all the matched blocks
19
+ Then the output should match, except whitespace, "mm8_chr7_tiny_slice1.maf"
20
+
21
+ Scenario: Interval covering two blocks, using directory access
22
+ Given indexed MAF files in "test/data"
23
+ When I enable the :remove_gaps parser option
24
+ And open a new MAF writer
25
+ And write a default header
26
+ And filter for only the species
27
+ | mm8 |
28
+ | rn4 |
29
+ And I extract a slice over the genomic interval
30
+ | chrom | start | end |
31
+ | mm8.chr7 | 80082350 | 80082380 |
32
+ And write all the matched blocks
33
+ Then the output should match, except whitespace, "mm8_chr7_tiny_slice1.maf"
34
+
35
+ Scenario: Interval in block subset
36
+ Given indexed MAF files in "test/data"
37
+ When I open a new MAF writer
38
+ And write a default header
39
+ And I extract a slice over the genomic interval
40
+ | chrom | start | end |
41
+ | mm8.chr7 | 80082718 | 80082728 |
42
+ And write all the matched blocks
43
+ Then the output should match, except whitespace, "mm8_chr7_tiny_slice2.maf"
44
+
45
+ Scenario: Interval to end of block
46
+ Given indexed MAF files in "test/data"
47
+ When I open a new MAF writer
48
+ And write a default header
49
+ And I extract a slice over the genomic interval
50
+ | chrom | start | end |
51
+ | mm8.chr7 | 80082757 | 80082767 |
52
+ And write all the matched blocks
53
+ Then the output should match, except whitespace, "mm8_chr7_tiny_slice3.maf"
54
+
@@ -0,0 +1,15 @@
1
+ Given /^indexed MAF files in "(.*?)"$/ do |dir|
2
+ @opts ||= {}
3
+ @access = Bio::MAF::Access.maf_dir(dir, @opts)
4
+ end
5
+
6
+ When /^I query for the genomic intervals$/ do |table|
7
+ # table is a Cucumber::Ast::Table
8
+ intervals = table.hashes.collect do |row|
9
+ Bio::GenomicInterval.zero_based(row['chrom'],
10
+ row['start'].to_i,
11
+ row['end'].to_i)
12
+ end
13
+ @access.block_filter = @block_filter
14
+ @blocks = @access.find(intervals).to_a
15
+ end
@@ -0,0 +1,7 @@
1
+ Given /^test files:$/ do |table|
2
+ Pathname.new("tmp/aruba").mkpath
3
+ table.raw.collect { |row| $test_data + row[0] }.each do |path|
4
+ $stderr.puts "staging #{path}"
5
+ system("cp #{path} tmp/aruba/")
6
+ end
7
+ end
@@ -17,3 +17,7 @@ end
17
17
  Then /^the text size of the block is (\d+)$/ do |e_text_size|
18
18
  @block.text_size.should == e_text_size.to_i
19
19
  end
20
+
21
+ Then /^the text size of block (\d+) is (\d+)$/ do |n, e_text_size|
22
+ @blocks[n.to_i].text_size.should == e_text_size.to_i
23
+ end
@@ -11,10 +11,10 @@ Then /^the index has at least (\d+) entries$/ do |size_spec|
11
11
  end
12
12
 
13
13
  When /^search for blocks between positions (\d+) and (\d+) of (\S+)$/ do |i_start, i_end, chr|
14
- int = Bio::GenomicInterval.zero_based(chr, i_start.to_i, i_end.to_i)
15
- @blocks = @idx.find([int], @parser, @block_filter).to_a
14
+ @interval = Bio::GenomicInterval.zero_based(chr, i_start.to_i, i_end.to_i)
15
+ @blocks = @idx.find([@interval], @parser, @block_filter).to_a
16
16
  end
17
17
 
18
18
  Then /^(\d+) blocks? (?:is|are) obtained$/ do |num|
19
- @blocks.size.should == num.to_i
19
+ @blocks.count.should == num.to_i
20
20
  end
@@ -1,4 +1,4 @@
1
- When /^open a new MAF writer$/ do
1
+ When /^(?:I )?open a new MAF writer$/ do
2
2
  @dst = Tempfile.new(["cuke", ".maf"])
3
3
  @writer = Bio::MAF::Writer.new(@dst)
4
4
  end
@@ -7,10 +7,18 @@ When /^write the header from the original MAF file$/ do
7
7
  @writer.write_header(@parser.header)
8
8
  end
9
9
 
10
+ When /^write a default header$/ do
11
+ @writer.write_header(Bio::MAF::Header.default)
12
+ end
13
+
10
14
  When /^write all the parsed blocks$/ do
11
15
  @writer.write_blocks(@parser.parse_blocks)
12
16
  end
13
17
 
18
+ When /^write all the matched blocks$/ do
19
+ @writer.write_blocks(@blocks)
20
+ end
21
+
14
22
  RSpec::Matchers.define :match_except_ws do |expected|
15
23
  match do |actual|
16
24
  system("diff --ignore-space-change --brief #{expected} #{actual} >/dev/null 2>&1")
@@ -4,8 +4,15 @@ When /^I open it with a MAF reader$/ do
4
4
  end
5
5
 
6
6
  When /^I enable the :(\S+) parser option$/ do |opt_s|
7
- @opts ||= {}
8
- @opts[opt_s.to_sym] = true
7
+ if @parser
8
+ opts = @parser.opts
9
+ elsif @access
10
+ opts = @access.parse_options
11
+ else
12
+ @opts ||= {}
13
+ opts = @opts
14
+ end
15
+ opts[opt_s.to_sym] = true
9
16
  end
10
17
 
11
18
  Then /^the MAF version should be "(.*?)"$/ do |v_spec|
@@ -29,6 +36,10 @@ Then /^the alignment block has (\d+) sequences$/ do |n_seq|
29
36
  @block.sequences.size.should == n_seq.to_i
30
37
  end
31
38
 
39
+ Then /^block (\d+) has (\d+) sequences$/ do |block_n, n_seq|
40
+ @blocks[block_n.to_i].sequences.size.should == n_seq.to_i
41
+ end
42
+
32
43
  Then /^sequence (\d+) has (\w.*?) "(.*?)"$/ do |i, method, str|
33
44
  method_sym = method.gsub(/ /, '_').to_sym
34
45
  @block.raw_seq(i.to_i).send(method_sym).should == str
@@ -1,20 +1,21 @@
1
- When /^filter for only the species$/ do |table|
1
+ When /^(?:I )?filter for only the species$/ do |table|
2
2
  # table is a Cucumber::Ast::Table
3
3
  sp = table.raw.collect { |row| row[0] }
4
- @parser.sequence_filter = { :only_species => sp }
4
+ thing = @access || @parser
5
+ thing.sequence_filter = { :only_species => sp }
5
6
  end
6
7
 
7
- When /^filter for blocks with the species$/ do |table|
8
+ When /^(?:I )?filter for blocks with the species$/ do |table|
8
9
  # table is a Cucumber::Ast::Table
9
10
  sp = table.raw.collect { |row| row[0] }
10
11
  @block_filter = { :with_all_species => sp }
11
12
  end
12
13
 
13
- When /^filter for blocks with at least (\d+) sequences$/ do |n|
14
+ When /^(?:I )?filter for blocks with at least (\d+) sequences$/ do |n|
14
15
  @block_filter = { :at_least_n_sequences => n.to_i }
15
16
  end
16
17
 
17
- When /^filter for blocks with text size at (least|most) (\d+)$/ do |op, len|
18
+ When /^(?:I )?filter for blocks with text size at (least|most) (\d+)$/ do |op, len|
18
19
  constraint = case op
19
20
  when 'least' then :min_size
20
21
  when 'most' then :max_size
@@ -23,7 +24,7 @@ When /^filter for blocks with text size at (least|most) (\d+)$/ do |op, len|
23
24
  @block_filter = { constraint => len.to_i}
24
25
  end
25
26
 
26
- When /^filter for blocks with text size between (\d+) and (\d+)$/ do |min, max|
27
+ When /^(?:I )?filter for blocks with text size between (\d+) and (\d+)$/ do |min, max|
27
28
  @block_filter = {
28
29
  :min_size => min.to_i,
29
30
  :max_size => max.to_i
@@ -0,0 +1,15 @@
1
+ When /^slice the resulting blocks according to the given interval$/ do
2
+ # Relies on @blocks and @interval, both set by the preceding block search step.
3
+ @blocks = @blocks.collect { |b| b.slice(@interval) }
4
+ end
5
+
6
+ When /^I extract a slice over the genomic interval$/ do |table|
7
+ # table is a Cucumber::Ast::Table
8
+ intervals = table.hashes.collect do |row|
9
+ Bio::GenomicInterval.zero_based(row['chrom'],
10
+ row['start'].to_i,
11
+ row['end'].to_i)
12
+ end
13
+ intervals.size.should == 1
14
+ @blocks = @access.slice(intervals[0])
15
+ end
@@ -0,0 +1 @@
1
+ require 'aruba/cucumber'
@@ -9,7 +9,9 @@ end
9
9
  require 'pathname'
10
10
  require 'tempfile'
11
11
 
12
- $LOAD_PATH << File.expand_path('../../../lib', __FILE__)
12
+ lib_dir = File.expand_path('../../../lib', __FILE__)
13
+ $LOAD_PATH << lib_dir
14
+ ENV['RUBYLIB'] = lib_dir
13
15
 
14
16
  require 'bio-maf'
15
17
 
@@ -154,5 +154,90 @@ Feature: Join alignment blocks with reference data
154
154
  **********AGGTTTAGGG******************************
155
155
  """
156
156
 
157
+ @no_jruby
158
+ Scenario: Tile with CLI tool and reference seq
159
+ Given test files:
160
+ | gap-sp1.fa.gz |
161
+ | gap-1.maf |
162
+ | gap-1.kct |
163
+ When I run `maf_tile --reference gap-sp1.fa.gz --interval 0:50 -s sp1:mouse -s sp2:nautilus -s sp3:jaguar gap-1.maf gap-1.kct`
164
+ Then it should pass with:
165
+ """
166
+ >mouse
167
+ CCAGGATGCTGGGCTGAGGGC--AGTTGTGTCAGGGCGGTCCGGTGCAGGCA
168
+ >nautilus
169
+ **********GGGCTGACGGC--AG*******AGGGCGGTGC**********
170
+ >jaguar
171
+ **********AGGTTTAGGGCAGAG***************************
172
+ """
173
+
174
+ @no_jruby
175
+ Scenario: Tile with CLI tool and no reference seq
176
+ Given test files:
177
+ | gap-1.maf |
178
+ | gap-1.kct |
179
+ When I run `maf_tile --interval 0:50 -s sp1:mouse -s sp2:nautilus -s sp3:jaguar gap-1.maf gap-1.kct`
180
+ Then it should pass with:
181
+ """
182
+ >mouse
183
+ NNNNNNNNNNGGGCTGAGGGC--AGNNNNNNNAGGGCGGTCCNNNNNNNNNN
184
+ >nautilus
185
+ **********GGGCTGACGGC--AG*******AGGGCGGTGC**********
186
+ >jaguar
187
+ **********AGGTTTAGGGCAGAG***************************
188
+ """
189
+
190
+ @no_jruby
191
+ Scenario: Tile with CLI tool and BED intervals
192
+ Given test files:
193
+ | gap-1.maf |
194
+ | gap-1.kct |
195
+ | gap-sp1.fa.gz |
196
+ And a file named "example.bed" with:
197
+ """
198
+ sp1.chr1 12 36
199
+ """
200
+ When I run `maf_tile -s sp1:mouse -s sp2:nautilus -s sp3:jaguar --output-base selected --bed example.bed --reference gap-sp1.fa.gz gap-1.maf gap-1.kct`
201
+ Then the file "selected_12-36.fa" should contain exactly:
202
+ """
203
+ >mouse
204
+ GCTGAGGGC--AGTTGTGTCAGGGCG
205
+ >nautilus
206
+ GCTGACGGC--AG*******AGGGCG
207
+ >jaguar
208
+ GTTTAGGGCAGAG*************
209
+
210
+ """
157
211
 
212
+ @no_jruby
213
+ Scenario: Tile with CLI tool and implicit index
214
+ Given test files:
215
+ | mm8_chr7_tiny.maf |
216
+ | mm8_chr7_tiny.kct |
217
+ When I run `maf_tile -s mm8 -s rn4 -s hg18 --interval 80082334:80082344 mm8_chr7_tiny.maf`
218
+ Then it should pass with:
219
+ """
220
+ >mm8
221
+ GGGCTGAGGG
222
+ >rn4
223
+ GGGCTGAGGG
224
+ >hg18
225
+ --------GG
226
+ """
227
+
228
+ @no_jruby
229
+ Scenario: Tile with CLI tool and directory
230
+ Given test files:
231
+ | mm8_chr7_tiny.maf |
232
+ | mm8_chr7_tiny.kct |
233
+ When I run `maf_tile -s mm8 -s rn4 -s hg18 --interval mm8.chr7:80082334:80082344 .`
234
+ Then it should pass with:
235
+ """
236
+ >mm8
237
+ GGGCTGAGGG
238
+ >rn4
239
+ GGGCTGAGGG
240
+ >hg18
241
+ --------GG
242
+ """
158
243
 
data/lib/bio/maf/index.rb CHANGED
@@ -61,6 +61,204 @@ module Bio
61
61
  end
62
62
  end
63
63
 
64
+ # Top-level class for working with a set of indexed MAF
65
+ # files. Provides a higher-level alternative to working with
66
+ # {Parser} and {KyotoIndex} objects directly.
67
+ #
68
+ # Instantiate with {Access.maf_dir} and {Access.file} methods.
69
+ class Access
70
+
71
+ # Parser options.
72
+ # @return [Hash]
73
+ # @see Parser
74
+ attr_accessor :parse_options
75
+ # Sequence filter to apply.
76
+ # @return [Hash]
77
+ # @see Parser#sequence_filter
78
+ attr_accessor :sequence_filter
79
+ # Block filter to apply.
80
+ # @return [Hash]
81
+ # @see KyotoIndex#find
82
+ attr_accessor :block_filter
83
+ attr_reader :indices
84
+
85
+ # Provides access to a directory of indexed MAF files. Any files
86
+ # with .maf suffixes and accompanying .kct indexes in the given
87
+ # directory will be accessible.
88
+ # @param [String] dir directory to scan
89
+ # @param [Hash] options parser options
90
+ # @return [Access]
91
+ def self.maf_dir(dir, options={})
92
+ o = options.dup
93
+ o[:dir] = dir
94
+ self.new(o)
95
+ end
96
+
97
+ # Provides access to a single MAF file. If this file is not
98
+ # indexed, it will be fully parsed to create a temporary
99
+ # in-memory index. For large MAF files or ones which will be
100
+ # used multiple times, this is inefficient, and an index file
101
+ # should be created with maf_index(1).
102
+ #
103
+ # @param [String] maf path to MAF file
104
+ # @param [String] index Kyoto Cabinet index file
105
+ # @param [Hash] options parser options
106
+ # @return [Access]
107
+ def self.file(maf, index=nil, options={})
108
+ o = options.dup
109
+ o[:maf] = maf
110
+ o[:index] = index if index
111
+ self.new(o)
112
+ end
113
+
114
+ # Close all open resources, in particular Kyoto Cabinet database
115
+ # handles.
116
+ def close
117
+ @indices.values.each { |ki| ki.close }
118
+ end
119
+
120
+ # Find all alignment blocks in the genomic regions in the list
121
+ # of Bio::GenomicInterval objects, and parse them with the given
122
+ # parser.
123
+ #
124
+ # @param [Enumerable<Bio::GenomicInterval>] intervals genomic
125
+ # intervals to parse.
126
+ # @yield [block] each {Block} matched, in turn
127
+ # @return [Enumerable<Block>] each matching {Block}, if no block given
128
+ # @api public
129
+ # @see KyotoIndex#find
130
+ def find(intervals, &blk)
131
+ if block_given?
132
+ by_chrom = intervals.group_by { |i| i.chrom }
133
+ by_chrom.keys.each do |chrom|
134
+ unless @indices.has_key? chrom
135
+ raise "No index available for chromosome #{chrom}!"
136
+ end
137
+ end
138
+ by_chrom.each do |chrom, c_intervals|
139
+ index = @indices[chrom]
140
+ with_parser(chrom) do |parser|
141
+ index.find(c_intervals, parser, block_filter, &blk)
142
+ end
143
+ end
144
+ else
145
+ enum_for(:find, intervals)
146
+ end
147
+ end
148
+
149
+ # Find and parse all alignment blocks in the genomic region
150
+ # given by a Bio::GenomicInterval, and combine them to
151
+ # synthesize a single alignment covering that interval
152
+ # exactly.
153
+ #
154
+ # @param [Bio::GenomicInterval] interval interval to search
155
+ # @yield [tiler] a {Tiler} ready to operate on the given interval
156
+ # @api public
157
+ def tile(interval)
158
+ index = chrom_index(interval.chrom)
159
+ with_parser(interval.chrom) do |parser|
160
+ tiler = Tiler.new
161
+ tiler.index = index
162
+ tiler.parser = parser
163
+ tiler.interval = interval
164
+ yield tiler
165
+ end
166
+ end
167
+
168
+ # Find and parse all alignment blocks in the genomic region
169
+ # given by a Bio::GenomicInterval, and truncate them to just the
170
+ # region intersecting that interval.
171
+ #
172
+ # @param [Bio::GenomicInterval] interval interval to search
173
+ # @yield [block] each {Block} matched, in turn
174
+ # @return [Enumerable<Block>] each matching {Block}, if no block given
175
+ # @api public
176
+ # @see KyotoIndex#slice
177
+ def slice(interval, &blk)
178
+ index = chrom_index(interval.chrom)
179
+ with_parser(interval.chrom) do |parser|
180
+ index.slice(interval, parser, &blk)
181
+ end
182
+ end
183
+
184
+ #### Internals
185
+
186
+ # @api private
187
+ def initialize(options)
188
+ @parse_options = options
189
+ @indices = {}
190
+ @maf_by_chrom = {}
191
+ if options[:dir]
192
+ @dir = options[:dir]
193
+ @maf_files = Dir.glob("#{@dir}/*.maf")
194
+ elsif options[:maf]
195
+ @maf_files = [options[:maf]]
196
+ if options[:index]
197
+ register_index(KyotoIndex.open(options[:index]),
198
+ options[:maf])
199
+ end
200
+ else
201
+ raise "Must specify :dir or :maf!"
202
+ end
203
+ scan_indices!
204
+ if options[:maf] && @indices.empty?
205
+ # MAF file explicitly given but no index
206
+ # build a temporary one
207
+ # (could build a real one, too...)
208
+ maf = options[:maf]
209
+ parser = Parser.new(maf, @parse_options)
210
+ # $stderr.puts "WARNING: building temporary index on #{maf}."
211
+ index = KyotoIndex.build(parser, '%')
212
+ register_index(index, maf)
213
+ end
214
+ end
215
+
216
+ # @api private
217
+ def find_index_file(maf)
218
+ base = File.basename(maf, '.maf')
219
+ index_f = "#{@dir}/#{base}.kct"
220
+ File.exists?(index_f) ? index_f : nil
221
+ end
222
+
223
+ # @api private
224
+ def register_index(index, maf)
225
+ @indices[index.ref_seq] = index
226
+ @maf_by_chrom[index.ref_seq] = maf
227
+ end
228
+
229
+ # @api private
230
+ def scan_indices!
231
+ @maf_files.each do |maf|
232
+ index_f = find_index_file(maf)
233
+ if index_f
234
+ index = KyotoIndex.open(index_f)
235
+ register_index(index, maf)
236
+ end
237
+ end
238
+ end
239
+
240
+ # @api private
241
+ def chrom_index(chrom)
242
+ unless @indices.has_key? chrom
243
+ raise "No index available for chromosome #{chrom}!"
244
+ end
245
+ @indices[chrom]
246
+ end
247
+
248
+ # @api private
249
+ def with_parser(chrom)
250
+ # $stderr.puts "Creating parser with options #{@parse_options.inspect}"
251
+ parser = Parser.new(@maf_by_chrom[chrom], @parse_options)
252
+ parser.sequence_filter = self.sequence_filter
253
+ begin
254
+ yield parser
255
+ ensure
256
+ parser.close
257
+ end
258
+ end
259
+
260
+ end
261
+
64
262
  class KyotoIndex
65
263
  include KVHelpers
66
264
 
@@ -189,15 +387,22 @@ module Bio
189
387
  # @param [Parser] parser MAF parser for file to fetch blocks
190
388
  # from.
191
389
  # @param [Hash] filter Block filter expression.
192
- # @return [Array<Block>]
390
+ # @yield [block] each {Block} matched, in turn
391
+ # @return [Enumerable<Block>] each matching {Block}, if no block given
193
392
  # @api public
194
- def find(intervals, parser, filter={})
195
- start = Time.now
393
+ def find(intervals, parser, filter={}, &blk)
394
+ # start = Time.now
196
395
  fl = fetch_list(intervals, filter)
197
- $stderr.printf("Built fetch list of %d items in %.3fs.\n",
198
- fl.size,
199
- Time.now - start)
200
- parser.fetch_blocks(fl)
396
+ # $stderr.printf("Built fetch list of %d items in %.3fs.\n",
397
+ # fl.size,
398
+ # Time.now - start)
399
+ if ! fl.empty?
400
+ parser.fetch_blocks(fl, &blk)
401
+ else
402
+ if ! block_given?
403
+ []
404
+ end
405
+ end
201
406
  end
202
407
 
203
408
  # Close the underlying Kyoto Cabinet database handle.
@@ -205,6 +410,16 @@ module Bio
205
410
  db.close
206
411
  end
207
412
 
413
+ def slice(interval, parser, filter={})
414
+ if block_given?
415
+ find([interval], parser, filter) do |block|
416
+ yield block.slice(interval)
417
+ end
418
+ else
419
+ enum_for(:slice, interval, parser, filter)
420
+ end
421
+ end
422
+
208
423
  #### KyotoIndex Internals
209
424
  # @api private
210
425
 
@@ -288,7 +503,6 @@ module Bio
288
503
  # Build a fetch list of alignment blocks to read, given an array
289
504
  # of Bio::GenomicInterval objects
290
505
  def fetch_list(intervals, filter_spec={})
291
- start = Time.now
292
506
  filter_spec ||= {}
293
507
  filters = Filters.build(filter_spec, self)
294
508
  chrom = intervals.first.chrom
@@ -309,9 +523,7 @@ module Bio
309
523
  bin_intervals.values.each do |intervals|
310
524
  intervals.sort_by! {|i| i.begin}
311
525
  end
312
- ready = Time.now
313
- $stderr.puts "bin intervals computed after #{ready - start} seconds."
314
- matches = if RUBY_PLATFORM == 'java'
526
+ matches = if RUBY_PLATFORM == 'java' && bin_intervals.size > 4
315
527
  scan_bins_parallel(chrom_id, bin_intervals, filters)
316
528
  else
317
529
  scan_bins(chrom_id, bin_intervals, filters)