bio-maf 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/Gemfile +2 -1
- data/README.md +98 -29
- data/Rakefile +6 -2
- data/bin/maf_tile +59 -35
- data/bio-maf.gemspec +4 -3
- data/features/block-joining.feature +32 -0
- data/features/dir-access.feature +46 -0
- data/features/maf-indexing.feature +23 -0
- data/features/maf-to-fasta.feature +9 -0
- data/features/slice.feature +54 -0
- data/features/step_definitions/dir-access_steps.rb +15 -0
- data/features/step_definitions/file_steps.rb +7 -0
- data/features/step_definitions/gap_removal_steps.rb +4 -0
- data/features/step_definitions/index_steps.rb +3 -3
- data/features/step_definitions/output_steps.rb +9 -1
- data/features/step_definitions/parse_steps.rb +13 -2
- data/features/step_definitions/query_steps.rb +7 -6
- data/features/step_definitions/slice_steps.rb +15 -0
- data/features/step_definitions/{gap-filling_steps.rb → tiling_steps.rb} +0 -0
- data/features/support/aruba.rb +1 -0
- data/features/support/env.rb +3 -1
- data/features/{gap-filling.feature → tiling.feature} +85 -0
- data/lib/bio/maf/index.rb +223 -11
- data/lib/bio/maf/maf.rb +209 -0
- data/lib/bio/maf/parser.rb +190 -111
- data/lib/bio/maf/tiler.rb +33 -6
- data/man/maf_index.1 +1 -1
- data/man/maf_tile.1 +7 -7
- data/man/maf_tile.1.ronn +21 -13
- data/man/maf_to_fasta.1 +1 -1
- data/spec/bio/maf/index_spec.rb +99 -0
- data/spec/bio/maf/maf_spec.rb +184 -0
- data/spec/bio/maf/parser_spec.rb +75 -115
- data/spec/bio/maf/tiler_spec.rb +44 -0
- data/test/data/chr22_ieq2.maf +11 -0
- data/test/data/gap-1.kct +0 -0
- data/test/data/gap-1.maf +9 -0
- data/test/data/gap-filled1.fa +6 -0
- data/test/data/gap-sp1.fa.gz +0 -0
- data/test/data/mm8_chr7_tiny_slice1.maf +9 -0
- data/test/data/mm8_chr7_tiny_slice2.maf +10 -0
- data/test/data/mm8_chr7_tiny_slice3.maf +10 -0
- data/test/data/mm8_chrM_tiny.kct +0 -0
- data/test/data/mm8_chrM_tiny.maf +1000 -0
- metadata +59 -7
@@ -29,3 +29,26 @@ Feature: Indexed access to MAF files
|
|
29
29
|
Then 2 blocks are obtained
|
30
30
|
And sequence mm8.chr7 of block 0 has start 80082592
|
31
31
|
And sequence mm8.chr7 of block 1 has start 80082713
|
32
|
+
|
33
|
+
@no_jruby
|
34
|
+
Scenario: Build MAF index with CLI tool
|
35
|
+
Given test files:
|
36
|
+
| mm8_chr7_tiny.maf |
|
37
|
+
When I run `maf_index mm8_chr7_tiny.maf mm8_chr7_tiny.kct`
|
38
|
+
Then it should pass with:
|
39
|
+
"""
|
40
|
+
"""
|
41
|
+
And a file named "mm8_chr7_tiny.kct" should exist
|
42
|
+
|
43
|
+
@no_jruby
|
44
|
+
Scenario: Dump MAF index with CLI tool
|
45
|
+
Given test files:
|
46
|
+
| mm8_chr7_tiny.maf |
|
47
|
+
| mm8_chr7_tiny.kct |
|
48
|
+
When I run `maf_index -d mm8_chr7_tiny.kct`
|
49
|
+
Then it should pass with regex:
|
50
|
+
"""
|
51
|
+
0 \[bin 1195\] 80082334:80082368
|
52
|
+
"""
|
53
|
+
|
54
|
+
|
@@ -48,3 +48,12 @@ Feature: Convert MAF file to FASTA
|
|
48
48
|
|
49
49
|
"""
|
50
50
|
|
51
|
+
@no_jruby
|
52
|
+
Scenario: Convert MAF to FASTA with CLI tool
|
53
|
+
Given test files:
|
54
|
+
| mm8_chr7_tiny.maf |
|
55
|
+
When I run `maf_to_fasta mm8_chr7_tiny.maf mm8_chr7_tiny.fa`
|
56
|
+
Then it should pass with:
|
57
|
+
"""
|
58
|
+
"""
|
59
|
+
And the file "mm8_chr7_tiny.fa" should contain ">rn4.chr1:136011785-136011819"
|
@@ -0,0 +1,54 @@
|
|
1
|
+
Feature: MAF slicing
|
2
|
+
In order to obtain just the alignment data covering a given region
|
3
|
+
I want to be able to take slices of alignment blocks over
|
4
|
+
A given interval
|
5
|
+
|
6
|
+
Scenario: Interval covering two blocks
|
7
|
+
Given a MAF source file "mm8_chr7_tiny.maf"
|
8
|
+
And a Kyoto Cabinet index file "mm8_chr7_tiny.kct"
|
9
|
+
When I open it with a MAF reader
|
10
|
+
And I enable the :remove_gaps parser option
|
11
|
+
And open a new MAF writer
|
12
|
+
And write the header from the original MAF file
|
13
|
+
And filter for only the species
|
14
|
+
| mm8 |
|
15
|
+
| rn4 |
|
16
|
+
And search for blocks between positions 80082350 and 80082380 of mm8.chr7
|
17
|
+
And slice the resulting blocks according to the given interval
|
18
|
+
And write all the matched blocks
|
19
|
+
Then the output should match, except whitespace, "mm8_chr7_tiny_slice1.maf"
|
20
|
+
|
21
|
+
Scenario: Interval covering two blocks, using directory access
|
22
|
+
Given indexed MAF files in "test/data"
|
23
|
+
When I enable the :remove_gaps parser option
|
24
|
+
And open a new MAF writer
|
25
|
+
And write a default header
|
26
|
+
And filter for only the species
|
27
|
+
| mm8 |
|
28
|
+
| rn4 |
|
29
|
+
And I extract a slice over the genomic interval
|
30
|
+
| chrom | start | end |
|
31
|
+
| mm8.chr7 | 80082350 | 80082380 |
|
32
|
+
And write all the matched blocks
|
33
|
+
Then the output should match, except whitespace, "mm8_chr7_tiny_slice1.maf"
|
34
|
+
|
35
|
+
Scenario: Interval in block subset
|
36
|
+
Given indexed MAF files in "test/data"
|
37
|
+
When I open a new MAF writer
|
38
|
+
And write a default header
|
39
|
+
And I extract a slice over the genomic interval
|
40
|
+
| chrom | start | end |
|
41
|
+
| mm8.chr7 | 80082718 | 80082728 |
|
42
|
+
And write all the matched blocks
|
43
|
+
Then the output should match, except whitespace, "mm8_chr7_tiny_slice2.maf"
|
44
|
+
|
45
|
+
Scenario: Interval to end of block
|
46
|
+
Given indexed MAF files in "test/data"
|
47
|
+
When I open a new MAF writer
|
48
|
+
And write a default header
|
49
|
+
And I extract a slice over the genomic interval
|
50
|
+
| chrom | start | end |
|
51
|
+
| mm8.chr7 | 80082757 | 80082767 |
|
52
|
+
And write all the matched blocks
|
53
|
+
Then the output should match, except whitespace, "mm8_chr7_tiny_slice3.maf"
|
54
|
+
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# Step definitions for querying a directory of indexed MAF files.

# Opens the MAF files found in the quoted directory through
# Bio::MAF::Access, handing over whatever options are held in @opts.
Given /^indexed MAF files in "(.*?)"$/ do |maf_directory|
  @opts ||= {}
  @access = Bio::MAF::Access.maf_dir(maf_directory, @opts)
end

# Builds one zero-based genomic interval per table row (columns
# chrom, start, end) and collects every block the access object
# finds for them into @blocks.
When /^I query for the genomic intervals$/ do |table|
  # table is a Cucumber::Ast::Table
  queries = table.hashes.map do |fields|
    chrom, first, last = fields.values_at('chrom', 'start', 'end')
    Bio::GenomicInterval.zero_based(chrom, first.to_i, last.to_i)
  end
  @access.block_filter = @block_filter
  @blocks = @access.find(queries).to_a
end
|
@@ -11,10 +11,10 @@ Then /^the index has at least (\d+) entries$/ do |size_spec|
|
|
11
11
|
end
|
12
12
|
|
13
13
|
When /^search for blocks between positions (\d+) and (\d+) of (\S+)$/ do |i_start, i_end, chr|
|
14
|
-
|
15
|
-
@blocks = @idx.find([
|
14
|
+
@interval = Bio::GenomicInterval.zero_based(chr, i_start.to_i, i_end.to_i)
|
15
|
+
@blocks = @idx.find([@interval], @parser, @block_filter).to_a
|
16
16
|
end
|
17
17
|
|
18
18
|
Then /^(\d+) blocks? (?:is|are) obtained$/ do |num|
|
19
|
-
@blocks.
|
19
|
+
@blocks.count.should == num.to_i
|
20
20
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
When /^open a new MAF writer$/ do
|
1
|
+
When /^(?:I )?open a new MAF writer$/ do
|
2
2
|
@dst = Tempfile.new(["cuke", ".maf"])
|
3
3
|
@writer = Bio::MAF::Writer.new(@dst)
|
4
4
|
end
|
@@ -7,10 +7,18 @@ When /^write the header from the original MAF file$/ do
|
|
7
7
|
@writer.write_header(@parser.header)
|
8
8
|
end
|
9
9
|
|
10
|
+
When /^write a default header$/ do
|
11
|
+
@writer.write_header(Bio::MAF::Header.default)
|
12
|
+
end
|
13
|
+
|
10
14
|
When /^write all the parsed blocks$/ do
|
11
15
|
@writer.write_blocks(@parser.parse_blocks)
|
12
16
|
end
|
13
17
|
|
18
|
+
When /^write all the matched blocks$/ do
|
19
|
+
@writer.write_blocks(@blocks)
|
20
|
+
end
|
21
|
+
|
14
22
|
RSpec::Matchers.define :match_except_ws do |expected|
|
15
23
|
match do |actual|
|
16
24
|
system("diff --ignore-space-change --brief #{expected} #{actual} >/dev/null 2>&1")
|
@@ -4,8 +4,15 @@ When /^I open it with a MAF reader$/ do
|
|
4
4
|
end
|
5
5
|
|
6
6
|
When /^I enable the :(\S+) parser option$/ do |opt_s|
|
7
|
-
@
|
8
|
-
|
7
|
+
if @parser
|
8
|
+
opts = @parser.opts
|
9
|
+
elsif @access
|
10
|
+
opts = @access.parse_options
|
11
|
+
else
|
12
|
+
@opts ||= {}
|
13
|
+
opts = @opts
|
14
|
+
end
|
15
|
+
opts[opt_s.to_sym] = true
|
9
16
|
end
|
10
17
|
|
11
18
|
Then /^the MAF version should be "(.*?)"$/ do |v_spec|
|
@@ -29,6 +36,10 @@ Then /^the alignment block has (\d+) sequences$/ do |n_seq|
|
|
29
36
|
@block.sequences.size.should == n_seq.to_i
|
30
37
|
end
|
31
38
|
|
39
|
+
Then /^block (\d+) has (\d+) sequences$/ do |block_n, n_seq|
|
40
|
+
@blocks[block_n.to_i].sequences.size.should == n_seq.to_i
|
41
|
+
end
|
42
|
+
|
32
43
|
Then /^sequence (\d+) has (\w.*?) "(.*?)"$/ do |i, method, str|
|
33
44
|
method_sym = method.gsub(/ /, '_').to_sym
|
34
45
|
@block.raw_seq(i.to_i).send(method_sym).should == str
|
@@ -1,20 +1,21 @@
|
|
1
|
-
When /^filter for only the species$/ do |table|
|
1
|
+
When /^(?:I )?filter for only the species$/ do |table|
|
2
2
|
# table is a Cucumber::Ast::Table
|
3
3
|
sp = table.raw.collect { |row| row[0] }
|
4
|
-
|
4
|
+
thing = @access || @parser
|
5
|
+
thing.sequence_filter = { :only_species => sp }
|
5
6
|
end
|
6
7
|
|
7
|
-
When /^filter for blocks with the species$/ do |table|
|
8
|
+
When /^(?:I )?filter for blocks with the species$/ do |table|
|
8
9
|
# table is a Cucumber::Ast::Table
|
9
10
|
sp = table.raw.collect { |row| row[0] }
|
10
11
|
@block_filter = { :with_all_species => sp }
|
11
12
|
end
|
12
13
|
|
13
|
-
When /^filter for blocks with at least (\d+) sequences$/ do |n|
|
14
|
+
When /^(?:I )?filter for blocks with at least (\d+) sequences$/ do |n|
|
14
15
|
@block_filter = { :at_least_n_sequences => n.to_i }
|
15
16
|
end
|
16
17
|
|
17
|
-
When /^filter for blocks with text size at (least|most) (\d+)$/ do |op, len|
|
18
|
+
When /^(?:I )?filter for blocks with text size at (least|most) (\d+)$/ do |op, len|
|
18
19
|
constraint = case op
|
19
20
|
when 'least' then :min_size
|
20
21
|
when 'most' then :max_size
|
@@ -23,7 +24,7 @@ When /^filter for blocks with text size at (least|most) (\d+)$/ do |op, len|
|
|
23
24
|
@block_filter = { constraint => len.to_i}
|
24
25
|
end
|
25
26
|
|
26
|
-
When /^filter for blocks with text size between (\d+) and (\d+)$/ do |min, max|
|
27
|
+
When /^(?:I )?filter for blocks with text size between (\d+) and (\d+)$/ do |min, max|
|
27
28
|
@block_filter = {
|
28
29
|
:min_size => min.to_i,
|
29
30
|
:max_size => max.to_i
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# Step definitions for slicing alignment blocks to an interval.

# Replaces each block in @blocks with its slice over @interval.
# Both instance variables must have been set by earlier steps.
When /^slice the resulting blocks according to the given interval$/ do
  @blocks = @blocks.map { |blk| blk.slice(@interval) }
end

# Reads a single genomic interval from the table (columns chrom,
# start, end) and stores the result of slicing over it in @blocks.
When /^I extract a slice over the genomic interval$/ do |table|
  # table is a Cucumber::Ast::Table
  queries = table.hashes.map do |fields|
    chrom, first, last = fields.values_at('chrom', 'start', 'end')
    Bio::GenomicInterval.zero_based(chrom, first.to_i, last.to_i)
  end
  # exactly one interval is expected by this step
  queries.size.should == 1
  @blocks = @access.slice(queries[0])
end
|
File without changes
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'aruba/cucumber'
|
data/features/support/env.rb
CHANGED
@@ -154,5 +154,90 @@ Feature: Join alignment blocks with reference data
|
|
154
154
|
**********AGGTTTAGGG******************************
|
155
155
|
"""
|
156
156
|
|
157
|
+
@no_jruby
|
158
|
+
Scenario: Tile with CLI tool and reference seq
|
159
|
+
Given test files:
|
160
|
+
| gap-sp1.fa.gz |
|
161
|
+
| gap-1.maf |
|
162
|
+
| gap-1.kct |
|
163
|
+
When I run `maf_tile --reference gap-sp1.fa.gz --interval 0:50 -s sp1:mouse -s sp2:nautilus -s sp3:jaguar gap-1.maf gap-1.kct`
|
164
|
+
Then it should pass with:
|
165
|
+
"""
|
166
|
+
>mouse
|
167
|
+
CCAGGATGCTGGGCTGAGGGC--AGTTGTGTCAGGGCGGTCCGGTGCAGGCA
|
168
|
+
>nautilus
|
169
|
+
**********GGGCTGACGGC--AG*******AGGGCGGTGC**********
|
170
|
+
>jaguar
|
171
|
+
**********AGGTTTAGGGCAGAG***************************
|
172
|
+
"""
|
173
|
+
|
174
|
+
@no_jruby
|
175
|
+
Scenario: Tile with CLI tool and no reference seq
|
176
|
+
Given test files:
|
177
|
+
| gap-1.maf |
|
178
|
+
| gap-1.kct |
|
179
|
+
When I run `maf_tile --interval 0:50 -s sp1:mouse -s sp2:nautilus -s sp3:jaguar gap-1.maf gap-1.kct`
|
180
|
+
Then it should pass with:
|
181
|
+
"""
|
182
|
+
>mouse
|
183
|
+
NNNNNNNNNNGGGCTGAGGGC--AGNNNNNNNAGGGCGGTCCNNNNNNNNNN
|
184
|
+
>nautilus
|
185
|
+
**********GGGCTGACGGC--AG*******AGGGCGGTGC**********
|
186
|
+
>jaguar
|
187
|
+
**********AGGTTTAGGGCAGAG***************************
|
188
|
+
"""
|
189
|
+
|
190
|
+
@no_jruby
|
191
|
+
Scenario: Tile with CLI tool and BED intervals
|
192
|
+
Given test files:
|
193
|
+
| gap-1.maf |
|
194
|
+
| gap-1.kct |
|
195
|
+
| gap-sp1.fa.gz |
|
196
|
+
And a file named "example.bed" with:
|
197
|
+
"""
|
198
|
+
sp1.chr1 12 36
|
199
|
+
"""
|
200
|
+
When I run `maf_tile -s sp1:mouse -s sp2:nautilus -s sp3:jaguar --output-base selected --bed example.bed --reference gap-sp1.fa.gz gap-1.maf gap-1.kct`
|
201
|
+
Then the file "selected_12-36.fa" should contain exactly:
|
202
|
+
"""
|
203
|
+
>mouse
|
204
|
+
GCTGAGGGC--AGTTGTGTCAGGGCG
|
205
|
+
>nautilus
|
206
|
+
GCTGACGGC--AG*******AGGGCG
|
207
|
+
>jaguar
|
208
|
+
GTTTAGGGCAGAG*************
|
209
|
+
|
210
|
+
"""
|
157
211
|
|
212
|
+
@no_jruby
|
213
|
+
Scenario: Tile with CLI tool and implicit index
|
214
|
+
Given test files:
|
215
|
+
| mm8_chr7_tiny.maf |
|
216
|
+
| mm8_chr7_tiny.kct |
|
217
|
+
When I run `maf_tile -s mm8 -s rn4 -s hg18 --interval 80082334:80082344 mm8_chr7_tiny.maf`
|
218
|
+
Then it should pass with:
|
219
|
+
"""
|
220
|
+
>mm8
|
221
|
+
GGGCTGAGGG
|
222
|
+
>rn4
|
223
|
+
GGGCTGAGGG
|
224
|
+
>hg18
|
225
|
+
--------GG
|
226
|
+
"""
|
227
|
+
|
228
|
+
@no_jruby
|
229
|
+
Scenario: Tile with CLI tool and directory
|
230
|
+
Given test files:
|
231
|
+
| mm8_chr7_tiny.maf |
|
232
|
+
| mm8_chr7_tiny.kct |
|
233
|
+
When I run `maf_tile -s mm8 -s rn4 -s hg18 --interval mm8.chr7:80082334:80082344 .`
|
234
|
+
Then it should pass with:
|
235
|
+
"""
|
236
|
+
>mm8
|
237
|
+
GGGCTGAGGG
|
238
|
+
>rn4
|
239
|
+
GGGCTGAGGG
|
240
|
+
>hg18
|
241
|
+
--------GG
|
242
|
+
"""
|
158
243
|
|
data/lib/bio/maf/index.rb
CHANGED
@@ -61,6 +61,204 @@ module Bio
|
|
61
61
|
end
|
62
62
|
end
|
63
63
|
|
64
|
+
# Top-level class for working with a set of indexed MAF
|
65
|
+
# files. Provides a higher-level alternative to working with
|
66
|
+
# {Parser} and {KyotoIndex} objects directly.
|
67
|
+
#
|
68
|
+
# Instantiate with {Access.maf_dir} and {Access.file} methods.
|
69
|
+
class Access

  # Parser options.
  # @return [Hash]
  # @see Parser
  attr_accessor :parse_options
  # Sequence filter to apply.
  # @return [Hash]
  # @see Parser#sequence_filter
  attr_accessor :sequence_filter
  # Block filter to apply.
  # @return [Hash]
  # @see KyotoIndex#find
  attr_accessor :block_filter
  # Registered indexes, keyed by each index's reference sequence.
  # @return [Hash]
  attr_reader :indices

  # Provides access to a directory of indexed MAF files. Any files
  # with .maf suffixes and accompanying .kct indexes in the given
  # directory will be accessible.
  # @param [String] dir directory to scan
  # @param [Hash] options parser options
  # @return [Access]
  def self.maf_dir(dir, options={})
    o = options.dup
    o[:dir] = dir
    self.new(o)
  end

  # Provides access to a single MAF file. If no index is given, a
  # .kct file with the same base name next to the MAF file is used
  # when present. If the file is not indexed at all, it will be
  # fully parsed to create a temporary in-memory index. For large
  # MAF files or ones which will be used multiple times, this is
  # inefficient, and an index file should be created with
  # maf_index(1).
  #
  # @param [String] maf path to MAF file
  # @param [String] index Kyoto Cabinet index file
  # @param [Hash] options parser options
  # @return [Access]
  def self.file(maf, index=nil, options={})
    o = options.dup
    o[:maf] = maf
    o[:index] = index if index
    self.new(o)
  end

  # Close all open resources, in particular Kyoto Cabinet database
  # handles.
  def close
    @indices.each_value { |ki| ki.close }
  end

  # Find all alignment blocks in the genomic regions in the list
  # of Bio::GenomicInterval objects, and parse them with a parser
  # created for each chromosome involved.
  #
  # @param [Enumerable<Bio::GenomicInterval>] intervals genomic
  #  intervals to parse.
  # @yield [block] each {Block} matched, in turn
  # @return [Enumerable<Block>] each matching {Block}, if no block given
  # @raise [RuntimeError] if any interval refers to a chromosome
  #  without a registered index
  # @api public
  # @see KyotoIndex#find
  def find(intervals, &blk)
    return enum_for(:find, intervals) unless block_given?
    by_chrom = intervals.group_by { |i| i.chrom }
    # Validate every chromosome up front so nothing is yielded
    # before a missing index is reported.
    by_chrom.each_key { |chrom| chrom_index(chrom) }
    by_chrom.each do |chrom, c_intervals|
      index = @indices[chrom]
      with_parser(chrom) do |parser|
        index.find(c_intervals, parser, block_filter, &blk)
      end
    end
  end

  # Find and parse all alignment blocks in the genomic region
  # given by a Bio::GenomicInterval, and combine them to
  # synthesize a single alignment covering that interval
  # exactly.
  #
  # @param [Bio::GenomicInterval] interval interval to search
  # @yield [tiler] a {Tiler} ready to operate on the given interval
  # @raise [RuntimeError] if no index covers the interval's chromosome
  # @api public
  def tile(interval)
    index = chrom_index(interval.chrom)
    with_parser(interval.chrom) do |parser|
      tiler = Tiler.new
      tiler.index = index
      tiler.parser = parser
      tiler.interval = interval
      yield tiler
    end
  end

  # Find and parse all alignment blocks in the genomic region
  # given by a Bio::GenomicInterval, and truncate them to just the
  # region intersecting that interval. The block filter, if any,
  # is applied just as in {#find}.
  #
  # @param [Bio::GenomicInterval] interval interval to search
  # @yield [block] each {Block} matched, in turn
  # @return [Enumerable<Block>] each matching {Block}, if no block given
  # @raise [RuntimeError] if no index covers the interval's chromosome
  # @api public
  # @see KyotoIndex#slice
  def slice(interval, &blk)
    index = chrom_index(interval.chrom)
    with_parser(interval.chrom) do |parser|
      if block_given?
        index.slice(interval, parser, block_filter, &blk)
      else
        # with_parser closes the parser as soon as this block
        # returns, so the lazy enumerator from KyotoIndex#slice
        # must be consumed here rather than handed to the caller.
        index.slice(interval, parser, block_filter).to_a
      end
    end
  end

  #### Internals

  # @param [Hash] options parser options plus :dir, or :maf with
  #  an optional :index
  # @raise [RuntimeError] if neither :dir nor :maf is given
  # @api private
  def initialize(options)
    @parse_options = options
    @indices = {}
    @maf_by_chrom = {}
    if options[:dir]
      @dir = options[:dir]
      @maf_files = Dir.glob("#{@dir}/*.maf")
    elsif options[:maf]
      @maf_files = [options[:maf]]
      if options[:index]
        register_index(KyotoIndex.open(options[:index]),
                       options[:maf])
      end
    else
      raise "Must specify :dir or :maf!"
    end
    scan_indices!
    if options[:maf] && @indices.empty?
      # MAF file explicitly given but no index
      # build a temporary one
      # (could build a real one, too...)
      maf = options[:maf]
      parser = Parser.new(maf, @parse_options)
      index = KyotoIndex.build(parser, '%')
      register_index(index, maf)
    end
  end

  # Locate the .kct index stored alongside a MAF file.
  #
  # @param [String] maf path to MAF file
  # @return [String, nil] path of the index file, or nil if absent
  # @api private
  def find_index_file(maf)
    base = File.basename(maf, '.maf')
    # Look next to the MAF file itself; @dir is only set in
    # directory mode and would be nil for a single file.
    index_f = File.join(File.dirname(maf), "#{base}.kct")
    File.exist?(index_f) ? index_f : nil
  end

  # Record an index and its MAF file under the index's reference
  # sequence.
  # @api private
  def register_index(index, maf)
    @indices[index.ref_seq] = index
    @maf_by_chrom[index.ref_seq] = maf
  end

  # Open and register the index file found for each known MAF file.
  # @api private
  def scan_indices!
    @maf_files.each do |maf|
      # An explicitly supplied index has already been registered;
      # don't open a second handle for the same MAF file.
      next if @maf_by_chrom.value?(maf)
      index_f = find_index_file(maf)
      if index_f
        index = KyotoIndex.open(index_f)
        register_index(index, maf)
      end
    end
  end

  # @param [String] chrom reference sequence name
  # @return [KyotoIndex] the index registered for that sequence
  # @raise [RuntimeError] if no index is registered for it
  # @api private
  def chrom_index(chrom)
    unless @indices.has_key? chrom
      raise "No index available for chromosome #{chrom}!"
    end
    @indices[chrom]
  end

  # Yield a parser for the MAF file registered under the given
  # reference sequence, with the sequence filter applied; the
  # parser is always closed afterwards.
  # @api private
  def with_parser(chrom)
    parser = Parser.new(@maf_by_chrom[chrom], @parse_options)
    parser.sequence_filter = self.sequence_filter
    begin
      yield parser
    ensure
      parser.close
    end
  end

end
|
261
|
+
|
64
262
|
class KyotoIndex
|
65
263
|
include KVHelpers
|
66
264
|
|
@@ -189,15 +387,22 @@ module Bio
|
|
189
387
|
# @param [Parser] parser MAF parser for file to fetch blocks
|
190
388
|
# from.
|
191
389
|
# @param [Hash] filter Block filter expression.
|
192
|
-
# @
|
390
|
+
# @yield [block] each {Block} matched, in turn
|
391
|
+
# @return [Enumerable<Block>] each matching {Block}, if no block given
|
193
392
|
# @api public
|
194
|
-
def find(intervals, parser, filter={})
|
195
|
-
start = Time.now
|
393
|
+
def find(intervals, parser, filter={}, &blk)
  # Work out which blocks need to be read before touching the parser.
  fetch_items = fetch_list(intervals, filter)
  return parser.fetch_blocks(fetch_items, &blk) unless fetch_items.empty?
  # Nothing matched: hand back an empty list when the caller
  # expects a collection rather than yielded blocks.
  [] unless block_given?
end
|
202
407
|
|
203
408
|
# Close the underlying Kyoto Cabinet database handle.
|
@@ -205,6 +410,16 @@ module Bio
|
|
205
410
|
db.close
|
206
411
|
end
|
207
412
|
|
413
|
+
# Find the alignment blocks matching the given interval and yield
# each one after slicing it to that interval.
#
# @param [Bio::GenomicInterval] interval interval to search and
#  slice by
# @param [Parser] parser MAF parser for file to fetch blocks from
# @param [Hash] filter Block filter expression.
# @yield [block] the result of slicing each matched block
# @return [Enumerator] over the sliced blocks, if no block given
def slice(interval, parser, filter={})
  return enum_for(:slice, interval, parser, filter) unless block_given?
  find([interval], parser, filter) { |matched| yield matched.slice(interval) }
end
|
422
|
+
|
208
423
|
#### KyotoIndex Internals
|
209
424
|
# @api private
|
210
425
|
|
@@ -288,7 +503,6 @@ module Bio
|
|
288
503
|
# Build a fetch list of alignment blocks to read, given an array
|
289
504
|
# of Bio::GenomicInterval objects
|
290
505
|
def fetch_list(intervals, filter_spec={})
|
291
|
-
start = Time.now
|
292
506
|
filter_spec ||= {}
|
293
507
|
filters = Filters.build(filter_spec, self)
|
294
508
|
chrom = intervals.first.chrom
|
@@ -309,9 +523,7 @@ module Bio
|
|
309
523
|
bin_intervals.values.each do |intervals|
|
310
524
|
intervals.sort_by! {|i| i.begin}
|
311
525
|
end
|
312
|
-
|
313
|
-
$stderr.puts "bin intervals computed after #{ready - start} seconds."
|
314
|
-
matches = if RUBY_PLATFORM == 'java'
|
526
|
+
matches = if RUBY_PLATFORM == 'java' && bin_intervals.size > 4
|
315
527
|
scan_bins_parallel(chrom_id, bin_intervals, filters)
|
316
528
|
else
|
317
529
|
scan_bins(chrom_id, bin_intervals, filters)
|