bio-maf 0.2.0-java → 0.3.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/Gemfile +3 -1
- data/README.md +98 -29
- data/Rakefile +6 -2
- data/bin/maf_tile +59 -35
- data/bio-maf.gemspec +4 -3
- data/features/block-joining.feature +32 -0
- data/features/dir-access.feature +46 -0
- data/features/maf-indexing.feature +23 -0
- data/features/maf-to-fasta.feature +9 -0
- data/features/slice.feature +54 -0
- data/features/step_definitions/dir-access_steps.rb +15 -0
- data/features/step_definitions/file_steps.rb +7 -0
- data/features/step_definitions/gap_removal_steps.rb +4 -0
- data/features/step_definitions/index_steps.rb +3 -3
- data/features/step_definitions/output_steps.rb +9 -1
- data/features/step_definitions/parse_steps.rb +13 -2
- data/features/step_definitions/query_steps.rb +7 -6
- data/features/step_definitions/slice_steps.rb +15 -0
- data/features/step_definitions/{gap-filling_steps.rb → tiling_steps.rb} +0 -0
- data/features/support/aruba.rb +1 -0
- data/features/support/env.rb +3 -1
- data/features/{gap-filling.feature → tiling.feature} +85 -0
- data/lib/bio/maf/index.rb +223 -11
- data/lib/bio/maf/maf.rb +209 -0
- data/lib/bio/maf/parser.rb +190 -111
- data/lib/bio/maf/tiler.rb +33 -6
- data/man/maf_index.1 +1 -1
- data/man/maf_tile.1 +7 -7
- data/man/maf_tile.1.ronn +21 -13
- data/man/maf_to_fasta.1 +1 -1
- data/spec/bio/maf/index_spec.rb +99 -0
- data/spec/bio/maf/maf_spec.rb +184 -0
- data/spec/bio/maf/parser_spec.rb +75 -115
- data/spec/bio/maf/tiler_spec.rb +44 -0
- data/test/data/chr22_ieq2.maf +11 -0
- data/test/data/gap-1.kct +0 -0
- data/test/data/gap-1.maf +9 -0
- data/test/data/gap-filled1.fa +6 -0
- data/test/data/gap-sp1.fa.gz +0 -0
- data/test/data/mm8_chr7_tiny_slice1.maf +9 -0
- data/test/data/mm8_chr7_tiny_slice2.maf +10 -0
- data/test/data/mm8_chr7_tiny_slice3.maf +10 -0
- data/test/data/mm8_chrM_tiny.kct +0 -0
- data/test/data/mm8_chrM_tiny.maf +1000 -0
- metadata +65 -16
@@ -29,3 +29,26 @@ Feature: Indexed access to MAF files
|
|
29
29
|
Then 2 blocks are obtained
|
30
30
|
And sequence mm8.chr7 of block 0 has start 80082592
|
31
31
|
And sequence mm8.chr7 of block 1 has start 80082713
|
32
|
+
|
33
|
+
@no_jruby
|
34
|
+
Scenario: Build MAF index with CLI tool
|
35
|
+
Given test files:
|
36
|
+
| mm8_chr7_tiny.maf |
|
37
|
+
When I run `maf_index mm8_chr7_tiny.maf mm8_chr7_tiny.kct`
|
38
|
+
Then it should pass with:
|
39
|
+
"""
|
40
|
+
"""
|
41
|
+
And a file named "mm8_chr7_tiny.kct" should exist
|
42
|
+
|
43
|
+
@no_jruby
|
44
|
+
Scenario: Dump MAF index with CLI tool
|
45
|
+
Given test files:
|
46
|
+
| mm8_chr7_tiny.maf |
|
47
|
+
| mm8_chr7_tiny.kct |
|
48
|
+
When I run `maf_index -d mm8_chr7_tiny.kct`
|
49
|
+
Then it should pass with regex:
|
50
|
+
"""
|
51
|
+
0 \[bin 1195\] 80082334:80082368
|
52
|
+
"""
|
53
|
+
|
54
|
+
|
@@ -48,3 +48,12 @@ Feature: Convert MAF file to FASTA
|
|
48
48
|
|
49
49
|
"""
|
50
50
|
|
51
|
+
@no_jruby
|
52
|
+
Scenario: Convert MAF to FASTA with CLI tool
|
53
|
+
Given test files:
|
54
|
+
| mm8_chr7_tiny.maf |
|
55
|
+
When I run `maf_to_fasta mm8_chr7_tiny.maf mm8_chr7_tiny.fa`
|
56
|
+
Then it should pass with:
|
57
|
+
"""
|
58
|
+
"""
|
59
|
+
And the file "mm8_chr7_tiny.fa" should contain ">rn4.chr1:136011785-136011819"
|
@@ -0,0 +1,54 @@
|
|
1
|
+
Feature: MAF slicing
|
2
|
+
In order to obtain just the alignment data covering a given region
|
3
|
+
I want to be able to take slices of alignment blocks over
|
4
|
+
A given interval
|
5
|
+
|
6
|
+
Scenario: Interval covering two blocks
|
7
|
+
Given a MAF source file "mm8_chr7_tiny.maf"
|
8
|
+
And a Kyoto Cabinet index file "mm8_chr7_tiny.kct"
|
9
|
+
When I open it with a MAF reader
|
10
|
+
And I enable the :remove_gaps parser option
|
11
|
+
And open a new MAF writer
|
12
|
+
And write the header from the original MAF file
|
13
|
+
And filter for only the species
|
14
|
+
| mm8 |
|
15
|
+
| rn4 |
|
16
|
+
And search for blocks between positions 80082350 and 80082380 of mm8.chr7
|
17
|
+
And slice the resulting blocks according to the given interval
|
18
|
+
And write all the matched blocks
|
19
|
+
Then the output should match, except whitespace, "mm8_chr7_tiny_slice1.maf"
|
20
|
+
|
21
|
+
Scenario: Interval covering two blocks, using directory access
|
22
|
+
Given indexed MAF files in "test/data"
|
23
|
+
When I enable the :remove_gaps parser option
|
24
|
+
And open a new MAF writer
|
25
|
+
And write a default header
|
26
|
+
And filter for only the species
|
27
|
+
| mm8 |
|
28
|
+
| rn4 |
|
29
|
+
And I extract a slice over the genomic interval
|
30
|
+
| chrom | start | end |
|
31
|
+
| mm8.chr7 | 80082350 | 80082380 |
|
32
|
+
And write all the matched blocks
|
33
|
+
Then the output should match, except whitespace, "mm8_chr7_tiny_slice1.maf"
|
34
|
+
|
35
|
+
Scenario: Interval in block subset
|
36
|
+
Given indexed MAF files in "test/data"
|
37
|
+
When I open a new MAF writer
|
38
|
+
And write a default header
|
39
|
+
And I extract a slice over the genomic interval
|
40
|
+
| chrom | start | end |
|
41
|
+
| mm8.chr7 | 80082718 | 80082728 |
|
42
|
+
And write all the matched blocks
|
43
|
+
Then the output should match, except whitespace, "mm8_chr7_tiny_slice2.maf"
|
44
|
+
|
45
|
+
Scenario: Interval to end of block
|
46
|
+
Given indexed MAF files in "test/data"
|
47
|
+
When I open a new MAF writer
|
48
|
+
And write a default header
|
49
|
+
And I extract a slice over the genomic interval
|
50
|
+
| chrom | start | end |
|
51
|
+
| mm8.chr7 | 80082757 | 80082767 |
|
52
|
+
And write all the matched blocks
|
53
|
+
Then the output should match, except whitespace, "mm8_chr7_tiny_slice3.maf"
|
54
|
+
|
@@ -0,0 +1,15 @@
|
|
1
|
+
Given /^indexed MAF files in "(.*?)"$/ do |dir|
|
2
|
+
@opts ||= {}
|
3
|
+
@access = Bio::MAF::Access.maf_dir(dir, @opts)
|
4
|
+
end
|
5
|
+
|
6
|
+
When /^I query for the genomic intervals$/ do |table|
|
7
|
+
# table is a Cucumber::Ast::Table
|
8
|
+
intervals = table.hashes.collect do |row|
|
9
|
+
Bio::GenomicInterval.zero_based(row['chrom'],
|
10
|
+
row['start'].to_i,
|
11
|
+
row['end'].to_i)
|
12
|
+
end
|
13
|
+
@access.block_filter = @block_filter
|
14
|
+
@blocks = @access.find(intervals).to_a
|
15
|
+
end
|
@@ -11,10 +11,10 @@ Then /^the index has at least (\d+) entries$/ do |size_spec|
|
|
11
11
|
end
|
12
12
|
|
13
13
|
When /^search for blocks between positions (\d+) and (\d+) of (\S+)$/ do |i_start, i_end, chr|
|
14
|
-
|
15
|
-
@blocks = @idx.find([
|
14
|
+
@interval = Bio::GenomicInterval.zero_based(chr, i_start.to_i, i_end.to_i)
|
15
|
+
@blocks = @idx.find([@interval], @parser, @block_filter).to_a
|
16
16
|
end
|
17
17
|
|
18
18
|
Then /^(\d+) blocks? (?:is|are) obtained$/ do |num|
|
19
|
-
@blocks.
|
19
|
+
@blocks.count.should == num.to_i
|
20
20
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
When /^open a new MAF writer$/ do
|
1
|
+
When /^(?:I )?open a new MAF writer$/ do
|
2
2
|
@dst = Tempfile.new(["cuke", ".maf"])
|
3
3
|
@writer = Bio::MAF::Writer.new(@dst)
|
4
4
|
end
|
@@ -7,10 +7,18 @@ When /^write the header from the original MAF file$/ do
|
|
7
7
|
@writer.write_header(@parser.header)
|
8
8
|
end
|
9
9
|
|
10
|
+
When /^write a default header$/ do
|
11
|
+
@writer.write_header(Bio::MAF::Header.default)
|
12
|
+
end
|
13
|
+
|
10
14
|
When /^write all the parsed blocks$/ do
|
11
15
|
@writer.write_blocks(@parser.parse_blocks)
|
12
16
|
end
|
13
17
|
|
18
|
+
When /^write all the matched blocks$/ do
|
19
|
+
@writer.write_blocks(@blocks)
|
20
|
+
end
|
21
|
+
|
14
22
|
RSpec::Matchers.define :match_except_ws do |expected|
|
15
23
|
match do |actual|
|
16
24
|
system("diff --ignore-space-change --brief #{expected} #{actual} >/dev/null 2>&1")
|
@@ -4,8 +4,15 @@ When /^I open it with a MAF reader$/ do
|
|
4
4
|
end
|
5
5
|
|
6
6
|
When /^I enable the :(\S+) parser option$/ do |opt_s|
|
7
|
-
@
|
8
|
-
|
7
|
+
if @parser
|
8
|
+
opts = @parser.opts
|
9
|
+
elsif @access
|
10
|
+
opts = @access.parse_options
|
11
|
+
else
|
12
|
+
@opts ||= {}
|
13
|
+
opts = @opts
|
14
|
+
end
|
15
|
+
opts[opt_s.to_sym] = true
|
9
16
|
end
|
10
17
|
|
11
18
|
Then /^the MAF version should be "(.*?)"$/ do |v_spec|
|
@@ -29,6 +36,10 @@ Then /^the alignment block has (\d+) sequences$/ do |n_seq|
|
|
29
36
|
@block.sequences.size.should == n_seq.to_i
|
30
37
|
end
|
31
38
|
|
39
|
+
Then /^block (\d+) has (\d+) sequences$/ do |block_n, n_seq|
|
40
|
+
@blocks[block_n.to_i].sequences.size.should == n_seq.to_i
|
41
|
+
end
|
42
|
+
|
32
43
|
Then /^sequence (\d+) has (\w.*?) "(.*?)"$/ do |i, method, str|
|
33
44
|
method_sym = method.gsub(/ /, '_').to_sym
|
34
45
|
@block.raw_seq(i.to_i).send(method_sym).should == str
|
@@ -1,20 +1,21 @@
|
|
1
|
-
When /^filter for only the species$/ do |table|
|
1
|
+
When /^(?:I )?filter for only the species$/ do |table|
|
2
2
|
# table is a Cucumber::Ast::Table
|
3
3
|
sp = table.raw.collect { |row| row[0] }
|
4
|
-
|
4
|
+
thing = @access || @parser
|
5
|
+
thing.sequence_filter = { :only_species => sp }
|
5
6
|
end
|
6
7
|
|
7
|
-
When /^filter for blocks with the species$/ do |table|
|
8
|
+
When /^(?:I )?filter for blocks with the species$/ do |table|
|
8
9
|
# table is a Cucumber::Ast::Table
|
9
10
|
sp = table.raw.collect { |row| row[0] }
|
10
11
|
@block_filter = { :with_all_species => sp }
|
11
12
|
end
|
12
13
|
|
13
|
-
When /^filter for blocks with at least (\d+) sequences$/ do |n|
|
14
|
+
When /^(?:I )?filter for blocks with at least (\d+) sequences$/ do |n|
|
14
15
|
@block_filter = { :at_least_n_sequences => n.to_i }
|
15
16
|
end
|
16
17
|
|
17
|
-
When /^filter for blocks with text size at (least|most) (\d+)$/ do |op, len|
|
18
|
+
When /^(?:I )?filter for blocks with text size at (least|most) (\d+)$/ do |op, len|
|
18
19
|
constraint = case op
|
19
20
|
when 'least' then :min_size
|
20
21
|
when 'most' then :max_size
|
@@ -23,7 +24,7 @@ When /^filter for blocks with text size at (least|most) (\d+)$/ do |op, len|
|
|
23
24
|
@block_filter = { constraint => len.to_i}
|
24
25
|
end
|
25
26
|
|
26
|
-
When /^filter for blocks with text size between (\d+) and (\d+)$/ do |min, max|
|
27
|
+
When /^(?:I )?filter for blocks with text size between (\d+) and (\d+)$/ do |min, max|
|
27
28
|
@block_filter = {
|
28
29
|
:min_size => min.to_i,
|
29
30
|
:max_size => max.to_i
|
@@ -0,0 +1,15 @@
|
|
1
|
+
When /^slice the resulting blocks according to the given interval$/ do
|
2
|
+
# @blocks and @interval
|
3
|
+
@blocks = @blocks.collect { |b| b.slice(@interval) }
|
4
|
+
end
|
5
|
+
|
6
|
+
When /^I extract a slice over the genomic interval$/ do |table|
|
7
|
+
# table is a Cucumber::Ast::Table
|
8
|
+
intervals = table.hashes.collect do |row|
|
9
|
+
Bio::GenomicInterval.zero_based(row['chrom'],
|
10
|
+
row['start'].to_i,
|
11
|
+
row['end'].to_i)
|
12
|
+
end
|
13
|
+
intervals.size.should == 1
|
14
|
+
@blocks = @access.slice(intervals[0])
|
15
|
+
end
|
File without changes
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'aruba/cucumber'
|
data/features/support/env.rb
CHANGED
@@ -154,5 +154,90 @@ Feature: Join alignment blocks with reference data
|
|
154
154
|
**********AGGTTTAGGG******************************
|
155
155
|
"""
|
156
156
|
|
157
|
+
@no_jruby
|
158
|
+
Scenario: Tile with CLI tool and reference seq
|
159
|
+
Given test files:
|
160
|
+
| gap-sp1.fa.gz |
|
161
|
+
| gap-1.maf |
|
162
|
+
| gap-1.kct |
|
163
|
+
When I run `maf_tile --reference gap-sp1.fa.gz --interval 0:50 -s sp1:mouse -s sp2:nautilus -s sp3:jaguar gap-1.maf gap-1.kct`
|
164
|
+
Then it should pass with:
|
165
|
+
"""
|
166
|
+
>mouse
|
167
|
+
CCAGGATGCTGGGCTGAGGGC--AGTTGTGTCAGGGCGGTCCGGTGCAGGCA
|
168
|
+
>nautilus
|
169
|
+
**********GGGCTGACGGC--AG*******AGGGCGGTGC**********
|
170
|
+
>jaguar
|
171
|
+
**********AGGTTTAGGGCAGAG***************************
|
172
|
+
"""
|
173
|
+
|
174
|
+
@no_jruby
|
175
|
+
Scenario: Tile with CLI tool and no reference seq
|
176
|
+
Given test files:
|
177
|
+
| gap-1.maf |
|
178
|
+
| gap-1.kct |
|
179
|
+
When I run `maf_tile --interval 0:50 -s sp1:mouse -s sp2:nautilus -s sp3:jaguar gap-1.maf gap-1.kct`
|
180
|
+
Then it should pass with:
|
181
|
+
"""
|
182
|
+
>mouse
|
183
|
+
NNNNNNNNNNGGGCTGAGGGC--AGNNNNNNNAGGGCGGTCCNNNNNNNNNN
|
184
|
+
>nautilus
|
185
|
+
**********GGGCTGACGGC--AG*******AGGGCGGTGC**********
|
186
|
+
>jaguar
|
187
|
+
**********AGGTTTAGGGCAGAG***************************
|
188
|
+
"""
|
189
|
+
|
190
|
+
@no_jruby
|
191
|
+
Scenario: Tile with CLI tool and BED intervals
|
192
|
+
Given test files:
|
193
|
+
| gap-1.maf |
|
194
|
+
| gap-1.kct |
|
195
|
+
| gap-sp1.fa.gz |
|
196
|
+
And a file named "example.bed" with:
|
197
|
+
"""
|
198
|
+
sp1.chr1 12 36
|
199
|
+
"""
|
200
|
+
When I run `maf_tile -s sp1:mouse -s sp2:nautilus -s sp3:jaguar --output-base selected --bed example.bed --reference gap-sp1.fa.gz gap-1.maf gap-1.kct`
|
201
|
+
Then the file "selected_12-36.fa" should contain exactly:
|
202
|
+
"""
|
203
|
+
>mouse
|
204
|
+
GCTGAGGGC--AGTTGTGTCAGGGCG
|
205
|
+
>nautilus
|
206
|
+
GCTGACGGC--AG*******AGGGCG
|
207
|
+
>jaguar
|
208
|
+
GTTTAGGGCAGAG*************
|
209
|
+
|
210
|
+
"""
|
157
211
|
|
212
|
+
@no_jruby
|
213
|
+
Scenario: Tile with CLI tool and implicit index
|
214
|
+
Given test files:
|
215
|
+
| mm8_chr7_tiny.maf |
|
216
|
+
| mm8_chr7_tiny.kct |
|
217
|
+
When I run `maf_tile -s mm8 -s rn4 -s hg18 --interval 80082334:80082344 mm8_chr7_tiny.maf`
|
218
|
+
Then it should pass with:
|
219
|
+
"""
|
220
|
+
>mm8
|
221
|
+
GGGCTGAGGG
|
222
|
+
>rn4
|
223
|
+
GGGCTGAGGG
|
224
|
+
>hg18
|
225
|
+
--------GG
|
226
|
+
"""
|
227
|
+
|
228
|
+
@no_jruby
|
229
|
+
Scenario: Tile with CLI tool and directory
|
230
|
+
Given test files:
|
231
|
+
| mm8_chr7_tiny.maf |
|
232
|
+
| mm8_chr7_tiny.kct |
|
233
|
+
When I run `maf_tile -s mm8 -s rn4 -s hg18 --interval mm8.chr7:80082334:80082344 .`
|
234
|
+
Then it should pass with:
|
235
|
+
"""
|
236
|
+
>mm8
|
237
|
+
GGGCTGAGGG
|
238
|
+
>rn4
|
239
|
+
GGGCTGAGGG
|
240
|
+
>hg18
|
241
|
+
--------GG
|
242
|
+
"""
|
158
243
|
|
data/lib/bio/maf/index.rb
CHANGED
@@ -61,6 +61,204 @@ module Bio
|
|
61
61
|
end
|
62
62
|
end
|
63
63
|
|
64
|
+
# Top-level class for working with a set of indexed MAF
|
65
|
+
# files. Provides a higher-level alternative to working with
|
66
|
+
# {Parser} and {KyotoIndex} objects directly.
|
67
|
+
#
|
68
|
+
# Instantiate with {Access.maf_dir} and {Access.file} methods.
|
69
|
+
class Access
|
70
|
+
|
71
|
+
# Parser options.
|
72
|
+
# @return [Hash]
|
73
|
+
# @see Parser
|
74
|
+
attr_accessor :parse_options
|
75
|
+
# Sequence filter to apply.
|
76
|
+
# @return [Hash]
|
77
|
+
# @see Parser#sequence_filter
|
78
|
+
attr_accessor :sequence_filter
|
79
|
+
# Block filter to apply.
|
80
|
+
# @return [Hash]
|
81
|
+
# @see KyotoIndex#find
|
82
|
+
attr_accessor :block_filter
|
83
|
+
attr_reader :indices
|
84
|
+
|
85
|
+
# Provides access to a directory of indexed MAF files. Any files
|
86
|
+
# with .maf suffixes and accompanying .kct indexes in the given
|
87
|
+
# directory will be accessible.
|
88
|
+
# @param [String] dir directory to scan
|
89
|
+
# @param [Hash] options parser options
|
90
|
+
# @return [Access]
|
91
|
+
def self.maf_dir(dir, options={})
|
92
|
+
o = options.dup
|
93
|
+
o[:dir] = dir
|
94
|
+
self.new(o)
|
95
|
+
end
|
96
|
+
|
97
|
+
# Provides access to a single MAF file. If this file is not
|
98
|
+
# indexed, it will be fully parsed to create a temporary
|
99
|
+
# in-memory index. For large MAF files or ones which will be
|
100
|
+
# used multiple times, this is inefficient, and an index file
|
101
|
+
# should be created with maf_index(1).
|
102
|
+
#
|
103
|
+
# @param [String] maf path to MAF file
|
104
|
+
# @param [String] index Kyoto Cabinet index file
|
105
|
+
# @param [Hash] options parser options
|
106
|
+
# @return [Access]
|
107
|
+
def self.file(maf, index=nil, options={})
|
108
|
+
o = options.dup
|
109
|
+
o[:maf] = maf
|
110
|
+
o[:index] = index if index
|
111
|
+
self.new(o)
|
112
|
+
end
|
113
|
+
|
114
|
+
# Close all open resources, in particular Kyoto Cabinet database
|
115
|
+
# handles.
|
116
|
+
def close
|
117
|
+
@indices.values.each { |ki| ki.close }
|
118
|
+
end
|
119
|
+
|
120
|
+
# Find all alignment blocks in the genomic regions in the list
|
121
|
+
# of Bio::GenomicInterval objects, and parse them with the given
|
122
|
+
# parser.
|
123
|
+
#
|
124
|
+
# @param [Enumerable<Bio::GenomicInterval>] intervals genomic
|
125
|
+
# intervals to parse.
|
126
|
+
# @yield [block] each {Block} matched, in turn
|
127
|
+
# @return [Enumerable<Block>] each matching {Block}, if no block given
|
128
|
+
# @api public
|
129
|
+
# @see KyotoIndex#find
|
130
|
+
def find(intervals, &blk)
|
131
|
+
if block_given?
|
132
|
+
by_chrom = intervals.group_by { |i| i.chrom }
|
133
|
+
by_chrom.keys.each do |chrom|
|
134
|
+
unless @indices.has_key? chrom
|
135
|
+
raise "No index available for chromosome #{chrom}!"
|
136
|
+
end
|
137
|
+
end
|
138
|
+
by_chrom.each do |chrom, c_intervals|
|
139
|
+
index = @indices[chrom]
|
140
|
+
with_parser(chrom) do |parser|
|
141
|
+
index.find(c_intervals, parser, block_filter, &blk)
|
142
|
+
end
|
143
|
+
end
|
144
|
+
else
|
145
|
+
enum_for(:find, intervals)
|
146
|
+
end
|
147
|
+
end
|
148
|
+
|
149
|
+
# Find and parse all alignment blocks in the genomic region
|
150
|
+
# given by a Bio::GenomicInterval, and combine them to
|
151
|
+
# synthesize a single alignment covering that interval
|
152
|
+
# exactly.
|
153
|
+
#
|
154
|
+
# @param [Bio::GenomicInterval] interval interval to search
|
155
|
+
# @yield [tiler] a {Tiler} ready to operate on the given interval
|
156
|
+
# @api public
|
157
|
+
def tile(interval)
|
158
|
+
index = chrom_index(interval.chrom)
|
159
|
+
with_parser(interval.chrom) do |parser|
|
160
|
+
tiler = Tiler.new
|
161
|
+
tiler.index = index
|
162
|
+
tiler.parser = parser
|
163
|
+
tiler.interval = interval
|
164
|
+
yield tiler
|
165
|
+
end
|
166
|
+
end
|
167
|
+
|
168
|
+
# Find and parse all alignment blocks in the genomic region
|
169
|
+
# given by a Bio::GenomicInterval, and truncate them to just the
|
170
|
+
# region intersecting that interval.
|
171
|
+
#
|
172
|
+
# @param [Bio::GenomicInterval] interval interval to search
|
173
|
+
# @yield [block] each {Block} matched, in turn
|
174
|
+
# @return [Enumerable<Block>] each matching {Block}, if no block given
|
175
|
+
# @api public
|
176
|
+
# @see KyotoIndex#slice
|
177
|
+
def slice(interval, &blk)
|
178
|
+
index = chrom_index(interval.chrom)
|
179
|
+
with_parser(interval.chrom) do |parser|
|
180
|
+
index.slice(interval, parser, &blk)
|
181
|
+
end
|
182
|
+
end
|
183
|
+
|
184
|
+
#### Internals
|
185
|
+
|
186
|
+
# @api private
|
187
|
+
def initialize(options)
|
188
|
+
@parse_options = options
|
189
|
+
@indices = {}
|
190
|
+
@maf_by_chrom = {}
|
191
|
+
if options[:dir]
|
192
|
+
@dir = options[:dir]
|
193
|
+
@maf_files = Dir.glob("#{@dir}/*.maf")
|
194
|
+
elsif options[:maf]
|
195
|
+
@maf_files = [options[:maf]]
|
196
|
+
if options[:index]
|
197
|
+
register_index(KyotoIndex.open(options[:index]),
|
198
|
+
options[:maf])
|
199
|
+
end
|
200
|
+
else
|
201
|
+
raise "Must specify :dir or :maf!"
|
202
|
+
end
|
203
|
+
scan_indices!
|
204
|
+
if options[:maf] && @indices.empty?
|
205
|
+
# MAF file explicitly given but no index
|
206
|
+
# build a temporary one
|
207
|
+
# (could build a real one, too...)
|
208
|
+
maf = options[:maf]
|
209
|
+
parser = Parser.new(maf, @parse_options)
|
210
|
+
# $stderr.puts "WARNING: building temporary index on #{maf}."
|
211
|
+
index = KyotoIndex.build(parser, '%')
|
212
|
+
register_index(index, maf)
|
213
|
+
end
|
214
|
+
end
|
215
|
+
|
216
|
+
# @api private
|
217
|
+
def find_index_file(maf)
|
218
|
+
base = File.basename(maf, '.maf')
|
219
|
+
index_f = "#{@dir}/#{base}.kct"
|
220
|
+
File.exists?(index_f) ? index_f : nil
|
221
|
+
end
|
222
|
+
|
223
|
+
# @api private
|
224
|
+
def register_index(index, maf)
|
225
|
+
@indices[index.ref_seq] = index
|
226
|
+
@maf_by_chrom[index.ref_seq] = maf
|
227
|
+
end
|
228
|
+
|
229
|
+
# @api private
|
230
|
+
def scan_indices!
|
231
|
+
@maf_files.each do |maf|
|
232
|
+
index_f = find_index_file(maf)
|
233
|
+
if index_f
|
234
|
+
index = KyotoIndex.open(index_f)
|
235
|
+
register_index(index, maf)
|
236
|
+
end
|
237
|
+
end
|
238
|
+
end
|
239
|
+
|
240
|
+
# @api private
|
241
|
+
def chrom_index(chrom)
|
242
|
+
unless @indices.has_key? chrom
|
243
|
+
raise "No index available for chromosome #{chrom}!"
|
244
|
+
end
|
245
|
+
@indices[chrom]
|
246
|
+
end
|
247
|
+
|
248
|
+
# @api private
|
249
|
+
def with_parser(chrom)
|
250
|
+
# $stderr.puts "Creating parser with options #{@parse_options.inspect}"
|
251
|
+
parser = Parser.new(@maf_by_chrom[chrom], @parse_options)
|
252
|
+
parser.sequence_filter = self.sequence_filter
|
253
|
+
begin
|
254
|
+
yield parser
|
255
|
+
ensure
|
256
|
+
parser.close
|
257
|
+
end
|
258
|
+
end
|
259
|
+
|
260
|
+
end
|
261
|
+
|
64
262
|
class KyotoIndex
|
65
263
|
include KVHelpers
|
66
264
|
|
@@ -189,15 +387,22 @@ module Bio
|
|
189
387
|
# @param [Parser] parser MAF parser for file to fetch blocks
|
190
388
|
# from.
|
191
389
|
# @param [Hash] filter Block filter expression.
|
192
|
-
# @
|
390
|
+
# @yield [block] each {Block} matched, in turn
|
391
|
+
# @return [Enumerable<Block>] each matching {Block}, if no block given
|
193
392
|
# @api public
|
194
|
-
def find(intervals, parser, filter={})
|
195
|
-
start = Time.now
|
393
|
+
def find(intervals, parser, filter={}, &blk)
|
394
|
+
# start = Time.now
|
196
395
|
fl = fetch_list(intervals, filter)
|
197
|
-
$stderr.printf("Built fetch list of %d items in %.3fs.\n",
|
198
|
-
|
199
|
-
|
200
|
-
|
396
|
+
# $stderr.printf("Built fetch list of %d items in %.3fs.\n",
|
397
|
+
# fl.size,
|
398
|
+
# Time.now - start)
|
399
|
+
if ! fl.empty?
|
400
|
+
parser.fetch_blocks(fl, &blk)
|
401
|
+
else
|
402
|
+
if ! block_given?
|
403
|
+
[]
|
404
|
+
end
|
405
|
+
end
|
201
406
|
end
|
202
407
|
|
203
408
|
# Close the underlying Kyoto Cabinet database handle.
|
@@ -205,6 +410,16 @@ module Bio
|
|
205
410
|
db.close
|
206
411
|
end
|
207
412
|
|
413
|
+
def slice(interval, parser, filter={})
|
414
|
+
if block_given?
|
415
|
+
find([interval], parser, filter) do |block|
|
416
|
+
yield block.slice(interval)
|
417
|
+
end
|
418
|
+
else
|
419
|
+
enum_for(:slice, interval, parser, filter)
|
420
|
+
end
|
421
|
+
end
|
422
|
+
|
208
423
|
#### KyotoIndex Internals
|
209
424
|
# @api private
|
210
425
|
|
@@ -288,7 +503,6 @@ module Bio
|
|
288
503
|
# Build a fetch list of alignment blocks to read, given an array
|
289
504
|
# of Bio::GenomicInterval objects
|
290
505
|
def fetch_list(intervals, filter_spec={})
|
291
|
-
start = Time.now
|
292
506
|
filter_spec ||= {}
|
293
507
|
filters = Filters.build(filter_spec, self)
|
294
508
|
chrom = intervals.first.chrom
|
@@ -309,9 +523,7 @@ module Bio
|
|
309
523
|
bin_intervals.values.each do |intervals|
|
310
524
|
intervals.sort_by! {|i| i.begin}
|
311
525
|
end
|
312
|
-
|
313
|
-
$stderr.puts "bin intervals computed after #{ready - start} seconds."
|
314
|
-
matches = if RUBY_PLATFORM == 'java'
|
526
|
+
matches = if RUBY_PLATFORM == 'java' && bin_intervals.size > 4
|
315
527
|
scan_bins_parallel(chrom_id, bin_intervals, filters)
|
316
528
|
else
|
317
529
|
scan_bins(chrom_id, bin_intervals, filters)
|