bio-maf 1.0.0-java → 1.0.1-java
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/maf_bgzip +140 -12
- data/bin/maf_extract +50 -40
- data/bin/maf_index +11 -2
- data/bin/maf_tile +143 -46
- data/bio-maf.gemspec +3 -3
- data/features/bgzf.feature +45 -0
- data/features/maf-indexing.feature +6 -0
- data/features/maf-parsing.feature +17 -0
- data/features/maf-querying.feature +11 -0
- data/features/slice.feature +11 -0
- data/features/step_definitions/parse_steps.rb +1 -0
- data/features/tiling.feature +23 -5
- data/lib/bio-maf.rb +5 -1
- data/lib/bio/maf.rb +1 -0
- data/lib/bio/maf/index.rb +158 -68
- data/lib/bio/maf/jobs.rb +168 -0
- data/lib/bio/maf/maf.rb +24 -1
- data/lib/bio/maf/parser.rb +90 -35
- data/lib/bio/maf/struct.rb +4 -0
- data/lib/bio/maf/tiler.rb +30 -3
- data/lib/bio/ucsc/ucsc_bin.rb +14 -1
- data/man/maf_bgzip.1 +27 -0
- data/man/maf_bgzip.1.ronn +32 -0
- data/spec/bio/maf/index_spec.rb +3 -1
- data/spec/bio/maf/parser_spec.rb +6 -2
- data/spec/bio/ucsc/ucsc_bin_spec.rb +18 -0
- data/test/data/empty.maf +2 -0
- data/test/data/ext-bin.maf +22 -0
- data/test/data/gap-1.kct +0 -0
- data/test/data/mm8_chr7_tiny.kct +0 -0
- data/test/data/mm8_chrM_tiny.kct +0 -0
- metadata +380 -184
data/bio-maf.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = "bio-maf"
|
5
|
-
s.version = "1.0.
|
5
|
+
s.version = "1.0.1"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Clayton Wheeler"]
|
9
|
-
s.date = "2012-08-
|
9
|
+
s.date = "2012-08-08"
|
10
10
|
s.description = "Multiple Alignment Format parser for BioRuby."
|
11
11
|
s.email = "cswh@umich.edu"
|
12
12
|
s.extra_rdoc_files = [
|
@@ -32,7 +32,7 @@ Gem::Specification.new do |s|
|
|
32
32
|
end
|
33
33
|
|
34
34
|
s.add_runtime_dependency('bio-alignment', ["~> 0.0.7"])
|
35
|
-
s.add_runtime_dependency('bio-bgzf', ["~> 0.2.
|
35
|
+
s.add_runtime_dependency('bio-bgzf', ["~> 0.2.1"])
|
36
36
|
s.add_runtime_dependency('bio-genomic-interval', ["~> 0.1.2"])
|
37
37
|
s.add_runtime_dependency('bio-logger', ["~> 1.0.1"])
|
38
38
|
if RUBY_PLATFORM == 'java'
|
data/features/bgzf.feature
CHANGED
@@ -60,3 +60,48 @@ Feature: BGZF compression
|
|
60
60
|
And a file named "mm8_chr7_tiny.maf.bgz" should exist
|
61
61
|
And a file named "mm8.chrM.maf.bgz" should exist
|
62
62
|
|
63
|
+
@no_jruby
|
64
|
+
Scenario: Don't overwrite MAF files
|
65
|
+
Given test files:
|
66
|
+
| mm8.chrM.maf |
|
67
|
+
| mm8.chrM.maf.bgz |
|
68
|
+
When I run `maf_bgzip mm8.chrM.maf`
|
69
|
+
Then it should fail with:
|
70
|
+
"""
|
71
|
+
exists
|
72
|
+
"""
|
73
|
+
|
74
|
+
@no_jruby
|
75
|
+
Scenario: Don't overwrite indexes
|
76
|
+
Given test files:
|
77
|
+
| mm8_chr7_tiny.maf |
|
78
|
+
When I run `maf_bgzip --index mm8_chr7_tiny.maf`
|
79
|
+
And I run `rm mm8_chr7_tiny.maf.bgz`
|
80
|
+
And I run `maf_bgzip --index mm8_chr7_tiny.maf`
|
81
|
+
Then it should fail with:
|
82
|
+
"""
|
83
|
+
exists
|
84
|
+
"""
|
85
|
+
|
86
|
+
@no_jruby
|
87
|
+
Scenario: Overwrite MAF files with --force
|
88
|
+
Given test files:
|
89
|
+
| mm8.chrM.maf |
|
90
|
+
| mm8.chrM.maf.bgz |
|
91
|
+
When I run `maf_bgzip --force mm8.chrM.maf`
|
92
|
+
Then it should pass with:
|
93
|
+
"""
|
94
|
+
"""
|
95
|
+
|
96
|
+
@no_jruby
|
97
|
+
Scenario: Overwrite indexes with --force
|
98
|
+
Given test files:
|
99
|
+
| mm8_chr7_tiny.maf |
|
100
|
+
When I run `maf_bgzip --index mm8_chr7_tiny.maf`
|
101
|
+
And I run `rm mm8_chr7_tiny.maf.bgz`
|
102
|
+
And I run `maf_bgzip --force --index mm8_chr7_tiny.maf`
|
103
|
+
Then it should pass with:
|
104
|
+
"""
|
105
|
+
"""
|
106
|
+
|
107
|
+
|
@@ -39,6 +39,12 @@ Feature: Indexed access to MAF files
|
|
39
39
|
And sequence mm8.chr7 of block 0 has start 80082368
|
40
40
|
And sequence mm8.chr7 of block 1 has start 80082471
|
41
41
|
|
42
|
+
Scenario: Index MAF file with extended bin positions
|
43
|
+
Given a MAF source file "ext-bin.maf"
|
44
|
+
When I open it with a MAF reader
|
45
|
+
And build an index on all sequences
|
46
|
+
Then the index has at least 18 entries
|
47
|
+
|
42
48
|
@no_jruby
|
43
49
|
Scenario: Build MAF index with CLI tool
|
44
50
|
Given test files:
|
@@ -42,3 +42,20 @@ Feature: Parse MAF files
|
|
42
42
|
And sequence 0 has text "ACA-TTACT"
|
43
43
|
And sequence 1 has strand :-
|
44
44
|
|
45
|
+
Scenario: Read alignment block, folded to upper case
|
46
|
+
Given MAF data:
|
47
|
+
"""
|
48
|
+
##maf version=1 scoring=humor.v4
|
49
|
+
# humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet300/chr1.maf
|
50
|
+
# /cluster/data/hg15/bed/blastz.rn3/axtNet300/chr1.maf
|
51
|
+
|
52
|
+
a score=0.128
|
53
|
+
s human_hoxa 100 8 + 100257 aca-ttact
|
54
|
+
s horse_hoxa 120 9 - 98892 acaattgct
|
55
|
+
s fugu_hoxa 88 7 + 90788 aca--tgct
|
56
|
+
"""
|
57
|
+
When I enable the :upcase parser option
|
58
|
+
And I open it with a MAF reader
|
59
|
+
Then an alignment block can be obtained
|
60
|
+
And the alignment block has 3 sequences
|
61
|
+
And sequence 0 has text "ACA-TTACT"
|
@@ -82,3 +82,14 @@ Feature: Filter results from MAF files
|
|
82
82
|
And I run `maf_extract -m mm8.chrM.maf.bgz --interval mm8.chrM:6938-13030 -o m2.maf`
|
83
83
|
And I run `diff m1.maf m2.maf`
|
84
84
|
Then the exit status should be 0
|
85
|
+
|
86
|
+
@no_jruby
|
87
|
+
Scenario: One-based indexing with maf_extract
|
88
|
+
Given test files:
|
89
|
+
| mm8_chr7_tiny.maf |
|
90
|
+
| mm8_chr7_tiny.kct |
|
91
|
+
When I run `sh -c 'maf_extract -d . --one-based --interval mm8.chr7:80082592-80082713 | grep "^a" | wc -l'`
|
92
|
+
Then it should pass with:
|
93
|
+
"""
|
94
|
+
2
|
95
|
+
"""
|
data/features/slice.feature
CHANGED
@@ -18,6 +18,17 @@ Feature: MAF slicing
|
|
18
18
|
And write all the matched blocks
|
19
19
|
Then the output should match, except whitespace, "mm8_chr7_tiny_slice1.maf"
|
20
20
|
|
21
|
+
Scenario: Interval covering two blocks, using directory access, counting
|
22
|
+
Given indexed MAF files in "test/data"
|
23
|
+
When I enable the :remove_gaps parser option
|
24
|
+
And filter for only the species
|
25
|
+
| mm8 |
|
26
|
+
| rn4 |
|
27
|
+
And I extract a slice over the genomic interval
|
28
|
+
| chrom | start | end |
|
29
|
+
| mm8.chr7 | 80082350 | 80082380 |
|
30
|
+
Then 2 blocks are obtained
|
31
|
+
|
21
32
|
Scenario: Interval covering two blocks, using directory access
|
22
33
|
Given indexed MAF files in "test/data"
|
23
34
|
When I enable the :remove_gaps parser option
|
data/features/tiling.feature
CHANGED
@@ -160,7 +160,7 @@ Feature: Join alignment blocks with reference data
|
|
160
160
|
| gap-sp1.fa.gz |
|
161
161
|
| gap-1.maf |
|
162
162
|
| gap-1.kct |
|
163
|
-
When I run `maf_tile --reference gap-sp1.fa.gz --interval 0
|
163
|
+
When I run `maf_tile --reference gap-sp1.fa.gz --interval 0-50 -s sp1:mouse -s sp2:nautilus -s sp3:jaguar gap-1.maf gap-1.kct`
|
164
164
|
Then it should pass with:
|
165
165
|
"""
|
166
166
|
>mouse
|
@@ -176,7 +176,7 @@ Feature: Join alignment blocks with reference data
|
|
176
176
|
Given test files:
|
177
177
|
| gap-1.maf |
|
178
178
|
| gap-1.kct |
|
179
|
-
When I run `maf_tile --interval 0
|
179
|
+
When I run `maf_tile --interval 0-50 -s sp1:mouse -s sp2:nautilus -s sp3:jaguar gap-1.maf gap-1.kct`
|
180
180
|
Then it should pass with:
|
181
181
|
"""
|
182
182
|
>mouse
|
@@ -198,7 +198,10 @@ Feature: Join alignment blocks with reference data
|
|
198
198
|
sp1.chr1 12 36
|
199
199
|
"""
|
200
200
|
When I run `maf_tile -s sp1:mouse -s sp2:nautilus -s sp3:jaguar --output-base selected --bed example.bed --reference gap-sp1.fa.gz gap-1.maf gap-1.kct`
|
201
|
-
Then
|
201
|
+
Then it should pass with:
|
202
|
+
"""
|
203
|
+
"""
|
204
|
+
And the file "selected_12-36.fa" should contain exactly:
|
202
205
|
"""
|
203
206
|
>mouse
|
204
207
|
GCTGAGGGC--AGTTGTGTCAGGGCG
|
@@ -214,7 +217,7 @@ Feature: Join alignment blocks with reference data
|
|
214
217
|
Given test files:
|
215
218
|
| mm8_chr7_tiny.maf |
|
216
219
|
| mm8_chr7_tiny.kct |
|
217
|
-
When I run `maf_tile -s mm8 -s rn4 -s hg18 --interval 80082334
|
220
|
+
When I run `maf_tile -s mm8 -s rn4 -s hg18 --interval 80082334-80082344 mm8_chr7_tiny.maf`
|
218
221
|
Then it should pass with:
|
219
222
|
"""
|
220
223
|
>mm8
|
@@ -230,7 +233,7 @@ Feature: Join alignment blocks with reference data
|
|
230
233
|
Given test files:
|
231
234
|
| mm8_chr7_tiny.maf |
|
232
235
|
| mm8_chr7_tiny.kct |
|
233
|
-
When I run `maf_tile -s mm8 -s rn4 -s hg18 --interval mm8.chr7:80082334
|
236
|
+
When I run `maf_tile -s mm8 -s rn4 -s hg18 --interval mm8.chr7:80082334-80082344 .`
|
234
237
|
Then it should pass with:
|
235
238
|
"""
|
236
239
|
>mm8
|
@@ -241,3 +244,18 @@ Feature: Join alignment blocks with reference data
|
|
241
244
|
--------GG
|
242
245
|
"""
|
243
246
|
|
247
|
+
@no_jruby
|
248
|
+
Scenario: Tile with CLI tool and directory, 1-based
|
249
|
+
Given test files:
|
250
|
+
| mm8_chr7_tiny.maf |
|
251
|
+
| mm8_chr7_tiny.kct |
|
252
|
+
When I run `maf_tile -s mm8 -s rn4 -s hg18 --one-based --interval mm8.chr7:80082335-80082344 .`
|
253
|
+
Then it should pass with:
|
254
|
+
"""
|
255
|
+
>mm8
|
256
|
+
GGGCTGAGGG
|
257
|
+
>rn4
|
258
|
+
GGGCTGAGGG
|
259
|
+
>hg18
|
260
|
+
--------GG
|
261
|
+
"""
|
data/lib/bio-maf.rb
CHANGED
@@ -11,7 +11,11 @@
|
|
11
11
|
require 'bio-logger'
|
12
12
|
log = Bio::Log::LoggerPlus.new('bio-maf')
|
13
13
|
log.outputters = Bio::Log::Outputter.stderr
|
14
|
-
log.level =
|
14
|
+
log.level = if ENV['BIO_MAF_DEBUG']
|
15
|
+
Bio::Log::DEBUG
|
16
|
+
else
|
17
|
+
Bio::Log::WARN
|
18
|
+
end
|
15
19
|
|
16
20
|
require 'bio/ucsc'
|
17
21
|
require 'bio/maf'
|
data/lib/bio/maf.rb
CHANGED
data/lib/bio/maf/index.rb
CHANGED
@@ -125,7 +125,7 @@ module Bio
|
|
125
125
|
# @param [Enumerable<Bio::GenomicInterval>] intervals genomic
|
126
126
|
# intervals to parse.
|
127
127
|
# @yield [block] each {Block} matched, in turn
|
128
|
-
# @return [
|
128
|
+
# @return [Array<Block>] each matching {Block}, if no block given
|
129
129
|
# @api public
|
130
130
|
# @see KyotoIndex#find
|
131
131
|
def find(intervals, &blk)
|
@@ -137,13 +137,16 @@ module Bio
|
|
137
137
|
end
|
138
138
|
end
|
139
139
|
by_chrom.each do |chrom, c_intervals|
|
140
|
-
|
141
|
-
|
142
|
-
|
140
|
+
with_index(chrom) do |index|
|
141
|
+
with_parser(chrom) do |parser|
|
142
|
+
index.find(c_intervals, parser, block_filter, &blk)
|
143
|
+
end
|
143
144
|
end
|
144
145
|
end
|
145
146
|
else
|
146
|
-
|
147
|
+
acc = []
|
148
|
+
self.find(intervals) { |block| acc << block }
|
149
|
+
acc
|
147
150
|
end
|
148
151
|
end
|
149
152
|
|
@@ -156,13 +159,14 @@ module Bio
|
|
156
159
|
# @yield [tiler] a {Tiler} ready to operate on the given interval
|
157
160
|
# @api public
|
158
161
|
def tile(interval)
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
162
|
+
with_index(interval.chrom) do |index|
|
163
|
+
with_parser(interval.chrom) do |parser|
|
164
|
+
tiler = Tiler.new
|
165
|
+
tiler.index = index
|
166
|
+
tiler.parser = parser
|
167
|
+
tiler.interval = interval
|
168
|
+
yield tiler
|
169
|
+
end
|
166
170
|
end
|
167
171
|
end
|
168
172
|
|
@@ -172,13 +176,15 @@ module Bio
|
|
172
176
|
#
|
173
177
|
# @param [Bio::GenomicInterval] interval interval to search
|
174
178
|
# @yield [block] each {Block} matched, in turn
|
175
|
-
# @return [
|
179
|
+
# @return [Array<Block>] each matching {Block}, if no block given
|
176
180
|
# @api public
|
177
181
|
# @see KyotoIndex#slice
|
178
182
|
def slice(interval, &blk)
|
179
|
-
|
180
|
-
|
181
|
-
|
183
|
+
with_index(interval.chrom) do |index|
|
184
|
+
with_parser(interval.chrom) do |parser|
|
185
|
+
s = index.slice(interval, parser, block_filter, &blk)
|
186
|
+
block_given? ? s : s.to_a
|
187
|
+
end
|
182
188
|
end
|
183
189
|
end
|
184
190
|
|
@@ -193,12 +199,17 @@ module Bio
|
|
193
199
|
scan_dir(options[:dir])
|
194
200
|
elsif options[:maf]
|
195
201
|
if options[:index]
|
196
|
-
|
202
|
+
LOG.debug { "Opening index file #{options[:index]}" }
|
203
|
+
index = KyotoIndex.open(options[:index])
|
204
|
+
register_index(index,
|
197
205
|
options[:maf])
|
206
|
+
index.close
|
198
207
|
else
|
199
|
-
|
200
|
-
if
|
201
|
-
|
208
|
+
idx_f = find_index_file(options[:maf])
|
209
|
+
if idx_f
|
210
|
+
index = KyotoIndex.open(idx_f)
|
211
|
+
register_index(index, options[:maf])
|
212
|
+
index.close
|
202
213
|
end
|
203
214
|
end
|
204
215
|
else
|
@@ -229,7 +240,11 @@ module Bio
|
|
229
240
|
unless index.maf_file == File.basename(maf)
|
230
241
|
raise "Index #{index.path} was created for #{index.maf_file}, not #{File.basename(maf)}!"
|
231
242
|
end
|
232
|
-
|
243
|
+
if index.path.to_s.start_with? '%'
|
244
|
+
@indices[index.ref_seq] = index
|
245
|
+
else
|
246
|
+
@indices[index.ref_seq] = index.path.to_s
|
247
|
+
end
|
233
248
|
@maf_by_chrom[index.ref_seq] = maf
|
234
249
|
end
|
235
250
|
|
@@ -241,6 +256,7 @@ module Bio
|
|
241
256
|
if File.exist? maf
|
242
257
|
register_index(index, maf)
|
243
258
|
end
|
259
|
+
index.close
|
244
260
|
end
|
245
261
|
end
|
246
262
|
|
@@ -249,7 +265,23 @@ module Bio
|
|
249
265
|
unless @indices.has_key? chrom
|
250
266
|
raise "No index available for chromosome #{chrom}!"
|
251
267
|
end
|
252
|
-
@indices[chrom]
|
268
|
+
index = @indices[chrom]
|
269
|
+
if index.is_a? KyotoIndex
|
270
|
+
# temporary
|
271
|
+
index
|
272
|
+
else
|
273
|
+
KyotoIndex.open(index)
|
274
|
+
end
|
275
|
+
end
|
276
|
+
|
277
|
+
def with_index(chrom)
|
278
|
+
index = chrom_index(chrom)
|
279
|
+
LOG.debug { "Selected index #{index} for sequence #{chrom}." }
|
280
|
+
begin
|
281
|
+
yield index
|
282
|
+
ensure
|
283
|
+
index.close unless index.path.to_s.start_with? '%'
|
284
|
+
end
|
253
285
|
end
|
254
286
|
|
255
287
|
# @api private
|
@@ -403,7 +435,7 @@ module Bio
|
|
403
435
|
def find(intervals, parser, filter={}, &blk)
|
404
436
|
start = Time.now
|
405
437
|
fl = fetch_list(intervals, filter)
|
406
|
-
LOG.debug { sprintf("Built fetch list of %d items in %.3fs
|
438
|
+
LOG.debug { sprintf("Built fetch list of %d items in %.3fs.",
|
407
439
|
fl.size,
|
408
440
|
Time.now - start) }
|
409
441
|
if ! fl.empty?
|
@@ -426,6 +458,7 @@ module Bio
|
|
426
458
|
yield block.slice(interval)
|
427
459
|
end
|
428
460
|
else
|
461
|
+
LOG.debug { "accumulating results of #slice" }
|
429
462
|
enum_for(:slice, interval, parser, filter)
|
430
463
|
end
|
431
464
|
end
|
@@ -436,6 +469,7 @@ module Bio
|
|
436
469
|
def initialize(path, db_arg=nil)
|
437
470
|
@species = {}
|
438
471
|
@species_max_id = -1
|
472
|
+
@index_sequences = {}
|
439
473
|
@max_sid = -1
|
440
474
|
if db_arg || ((path.size > 1) and File.exist?(path))
|
441
475
|
mode = KyotoCabinet::DB::OREADER
|
@@ -444,15 +478,25 @@ module Bio
|
|
444
478
|
end
|
445
479
|
@db = db_arg || KyotoCabinet::DB.new
|
446
480
|
@path = path
|
447
|
-
|
481
|
+
path_str = "#{path.to_s}#opts=ls#dfunit=100000"
|
482
|
+
unless db_arg || db.open(path_str, mode)
|
448
483
|
raise "Could not open DB file!"
|
449
484
|
end
|
450
485
|
if mode == KyotoCabinet::DB::OREADER
|
486
|
+
version = db[FORMAT_VERSION_KEY].to_i
|
487
|
+
if version != FORMAT_VERSION
|
488
|
+
raise "Index #{path} is version #{version}, expecting version #{FORMAT_VERSION}!"
|
489
|
+
end
|
451
490
|
@maf_file = db[FILE_KEY]
|
452
491
|
self.ref_seq = db[REF_SEQ_KEY]
|
453
492
|
load_index_sequences
|
454
493
|
load_species
|
455
494
|
end
|
495
|
+
@mutex = Mutex.new
|
496
|
+
end
|
497
|
+
|
498
|
+
def to_s
|
499
|
+
"#<KyotoIndex path=#{path}>"
|
456
500
|
end
|
457
501
|
|
458
502
|
# Reopen the same DB handle read-only. Only useful for unit tests.
|
@@ -576,6 +620,11 @@ module Bio
|
|
576
620
|
end
|
577
621
|
|
578
622
|
def scan_bins_parallel(chrom_id, bin_intervals, filters)
|
623
|
+
LOG.debug {
|
624
|
+
sprintf("Beginning scan of %d bin intervals %s filters.",
|
625
|
+
bin_intervals.size,
|
626
|
+
filters.empty? ? "without" : "with")
|
627
|
+
}
|
579
628
|
start = Time.now
|
580
629
|
n_threads = ENV['profile'] ? 1 : java.lang.Runtime.runtime.availableProcessors
|
581
630
|
jobs = java.util.concurrent.ConcurrentLinkedQueue.new(bin_intervals.to_a)
|
@@ -603,7 +652,7 @@ module Bio
|
|
603
652
|
n_completed += 1
|
604
653
|
end
|
605
654
|
threads.each { |t| t.join }
|
606
|
-
LOG.debug { sprintf("Matched %d index records with %d threads in %.3f seconds
|
655
|
+
LOG.debug { sprintf("Matched %d index records with %d threads in %.3f seconds.",
|
607
656
|
to_fetch.size, n_threads, Time.now - start) }
|
608
657
|
to_fetch
|
609
658
|
end
|
@@ -676,30 +725,55 @@ module Bio
|
|
676
725
|
|| gi.include?(i_start)
|
677
726
|
end
|
678
727
|
|
679
|
-
|
680
|
-
|
728
|
+
CHUNK_THRESHOLD_BYTES = 50 * 1024 * 1024
|
729
|
+
CHUNK_THRESHOLD_BLOCKS = 1000
|
730
|
+
|
731
|
+
def prep(file_spec, compression, ref_only)
|
732
|
+
db[FORMAT_VERSION_KEY] = FORMAT_VERSION
|
733
|
+
db[FILE_KEY] = File.basename(file_spec)
|
681
734
|
@maf_file = db[FILE_KEY]
|
682
|
-
if
|
683
|
-
db[COMPRESSION_KEY] =
|
735
|
+
if compression
|
736
|
+
db[COMPRESSION_KEY] = compression.to_s
|
684
737
|
end
|
685
|
-
first_block = parser.parse_block
|
686
|
-
self.ref_seq = first_block.sequences.first.source
|
687
738
|
@ref_only = ref_only
|
688
|
-
|
689
|
-
|
690
|
-
|
691
|
-
|
739
|
+
@seen_first = false
|
740
|
+
end
|
741
|
+
|
742
|
+
def build(parser, ref_only=true)
|
743
|
+
prep(parser.file_spec,
|
744
|
+
parser.compression,
|
745
|
+
ref_only)
|
746
|
+
|
692
747
|
n = 0
|
693
|
-
|
694
|
-
|
695
|
-
|
748
|
+
acc = []
|
749
|
+
acc_bytes = 0
|
750
|
+
parser.each_block do |block|
|
751
|
+
acc << block
|
752
|
+
acc_bytes += block.size
|
753
|
+
if acc_bytes > CHUNK_THRESHOLD_BYTES \
|
754
|
+
|| acc.size > CHUNK_THRESHOLD_BLOCKS
|
755
|
+
index_blocks(acc)
|
756
|
+
acc = []
|
757
|
+
acc_bytes = 0
|
758
|
+
end
|
759
|
+
n += 1
|
696
760
|
end
|
761
|
+
index_blocks(acc)
|
697
762
|
LOG.debug { "Created index for #{n} blocks and #{@index_sequences.size} sequences." }
|
698
763
|
db.synchronize(true)
|
699
764
|
end
|
700
765
|
|
701
766
|
def index_blocks(blocks)
|
702
|
-
h =
|
767
|
+
h = @mutex.synchronize do
|
768
|
+
if ! @seen_first
|
769
|
+
# set the reference sequence from the first block
|
770
|
+
first_block = blocks.first
|
771
|
+
self.ref_seq = first_block.sequences.first.source
|
772
|
+
db[REF_SEQ_KEY] = ref_seq
|
773
|
+
@seen_first = true
|
774
|
+
end
|
775
|
+
blocks.map { |b| entries_for(b) }.reduce(:merge!)
|
776
|
+
end
|
703
777
|
db.set_bulk(h, false)
|
704
778
|
end
|
705
779
|
|
@@ -719,8 +793,11 @@ module Bio
|
|
719
793
|
if ! sid
|
720
794
|
@max_sid += 1
|
721
795
|
sid = @max_sid
|
722
|
-
|
723
|
-
|
796
|
+
# "" << foo is hideous but apparently what it takes to get a
|
797
|
+
# non-shared copy of a string on JRuby...
|
798
|
+
name_copy = "" << name
|
799
|
+
db.set("sequence:#{name_copy}", sid.to_s)
|
800
|
+
index_sequences[name_copy] = sid
|
724
801
|
end
|
725
802
|
return sid
|
726
803
|
end
|
@@ -739,22 +816,24 @@ module Bio
|
|
739
816
|
# example: otoGar1.scaffold_104707.1-93001
|
740
817
|
parts = seq.split('.', 2)
|
741
818
|
if parts.size == 2
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
else
|
746
|
-
species_id = @species_max_id + 1
|
747
|
-
if species_id >= MAX_SPECIES
|
748
|
-
raise "cannot index MAF file with more than #{MAX_SPECIES} species"
|
749
|
-
end
|
750
|
-
species[species_name] = species_id
|
751
|
-
db["species:#{species_name}"] = species_id
|
752
|
-
@species_max_id = species_id
|
753
|
-
return species_id
|
754
|
-
end
|
819
|
+
# "" << foo is hideous but apparently what it takes to get a
|
820
|
+
# non-shared copy of a string on JRuby...
|
821
|
+
species_name = "" << parts[0]
|
755
822
|
else
|
756
823
|
# not in species.sequence format, apparently
|
757
|
-
|
824
|
+
species_name = "" << seq
|
825
|
+
end
|
826
|
+
if species.has_key? species_name
|
827
|
+
return species[species_name]
|
828
|
+
else
|
829
|
+
species_id = @species_max_id + 1
|
830
|
+
if species_id >= MAX_SPECIES
|
831
|
+
raise "cannot index MAF file with more than #{MAX_SPECIES} species"
|
832
|
+
end
|
833
|
+
species[species_name] = species_id
|
834
|
+
db["species:#{species_name}"] = species_id
|
835
|
+
@species_max_id = species_id
|
836
|
+
return species_id
|
758
837
|
end
|
759
838
|
end
|
760
839
|
|
@@ -769,20 +848,27 @@ module Bio
|
|
769
848
|
end
|
770
849
|
|
771
850
|
def entries_for(block)
|
772
|
-
|
773
|
-
|
774
|
-
|
775
|
-
|
776
|
-
|
777
|
-
|
778
|
-
|
779
|
-
|
780
|
-
|
781
|
-
|
782
|
-
|
783
|
-
|
851
|
+
begin
|
852
|
+
unless block.ref_seq.source == @ref_seq
|
853
|
+
raise "Inconsistent reference sequence: expected #{@ref_seq}, got #{block.ref_seq.source}"
|
854
|
+
end
|
855
|
+
h = {}
|
856
|
+
val = build_block_value(block)
|
857
|
+
to_index = ref_only ? [block.sequences.first] : block.sequences
|
858
|
+
to_index.each do |seq|
|
859
|
+
seq_id = seq_id_for(seq.source)
|
860
|
+
# size 0 occurs in e.g. upstream1000.maf.gz
|
861
|
+
next if seq.size == 0
|
862
|
+
seq_end = seq.start + seq.size
|
863
|
+
bin = Bio::Ucsc::UcscBin.bin_from_range(seq.start, seq_end)
|
864
|
+
key = [255, seq_id, bin, seq.start, seq_end].pack(KEY_FMT)
|
865
|
+
h[key] = val
|
866
|
+
end
|
867
|
+
return h
|
868
|
+
rescue Exception => e
|
869
|
+
LOG.error "Failed to index block at offset #{block.offset}:\n#{block}"
|
870
|
+
raise e
|
784
871
|
end
|
785
|
-
return h
|
786
872
|
end
|
787
873
|
end # class KyotoIndex
|
788
874
|
|
@@ -861,6 +947,10 @@ module Bio
|
|
861
947
|
@l = l
|
862
948
|
end
|
863
949
|
|
950
|
+
def empty?
|
951
|
+
@l.empty?
|
952
|
+
end
|
953
|
+
|
864
954
|
def match(entry)
|
865
955
|
return ! @l.find { |f| ! f.call(entry) }
|
866
956
|
end
|