bio-maf 1.0.0-java → 1.0.1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/maf_bgzip +140 -12
- data/bin/maf_extract +50 -40
- data/bin/maf_index +11 -2
- data/bin/maf_tile +143 -46
- data/bio-maf.gemspec +3 -3
- data/features/bgzf.feature +45 -0
- data/features/maf-indexing.feature +6 -0
- data/features/maf-parsing.feature +17 -0
- data/features/maf-querying.feature +11 -0
- data/features/slice.feature +11 -0
- data/features/step_definitions/parse_steps.rb +1 -0
- data/features/tiling.feature +23 -5
- data/lib/bio-maf.rb +5 -1
- data/lib/bio/maf.rb +1 -0
- data/lib/bio/maf/index.rb +158 -68
- data/lib/bio/maf/jobs.rb +168 -0
- data/lib/bio/maf/maf.rb +24 -1
- data/lib/bio/maf/parser.rb +90 -35
- data/lib/bio/maf/struct.rb +4 -0
- data/lib/bio/maf/tiler.rb +30 -3
- data/lib/bio/ucsc/ucsc_bin.rb +14 -1
- data/man/maf_bgzip.1 +27 -0
- data/man/maf_bgzip.1.ronn +32 -0
- data/spec/bio/maf/index_spec.rb +3 -1
- data/spec/bio/maf/parser_spec.rb +6 -2
- data/spec/bio/ucsc/ucsc_bin_spec.rb +18 -0
- data/test/data/empty.maf +2 -0
- data/test/data/ext-bin.maf +22 -0
- data/test/data/gap-1.kct +0 -0
- data/test/data/mm8_chr7_tiny.kct +0 -0
- data/test/data/mm8_chrM_tiny.kct +0 -0
- metadata +380 -184
data/bio-maf.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = "bio-maf"
|
5
|
-
s.version = "1.0.0"
|
5
|
+
s.version = "1.0.1"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Clayton Wheeler"]
|
9
|
-
s.date = "2012-08-
|
9
|
+
s.date = "2012-08-08"
|
10
10
|
s.description = "Multiple Alignment Format parser for BioRuby."
|
11
11
|
s.email = "cswh@umich.edu"
|
12
12
|
s.extra_rdoc_files = [
|
@@ -32,7 +32,7 @@ Gem::Specification.new do |s|
|
|
32
32
|
end
|
33
33
|
|
34
34
|
s.add_runtime_dependency('bio-alignment', ["~> 0.0.7"])
|
35
|
-
s.add_runtime_dependency('bio-bgzf', ["~> 0.2.
|
35
|
+
s.add_runtime_dependency('bio-bgzf', ["~> 0.2.1"])
|
36
36
|
s.add_runtime_dependency('bio-genomic-interval', ["~> 0.1.2"])
|
37
37
|
s.add_runtime_dependency('bio-logger', ["~> 1.0.1"])
|
38
38
|
if RUBY_PLATFORM == 'java'
|
data/features/bgzf.feature
CHANGED
@@ -60,3 +60,48 @@ Feature: BGZF compression
|
|
60
60
|
And a file named "mm8_chr7_tiny.maf.bgz" should exist
|
61
61
|
And a file named "mm8.chrM.maf.bgz" should exist
|
62
62
|
|
63
|
+
@no_jruby
|
64
|
+
Scenario: Don't overwrite MAF files
|
65
|
+
Given test files:
|
66
|
+
| mm8.chrM.maf |
|
67
|
+
| mm8.chrM.maf.bgz |
|
68
|
+
When I run `maf_bgzip mm8.chrM.maf`
|
69
|
+
Then it should fail with:
|
70
|
+
"""
|
71
|
+
exists
|
72
|
+
"""
|
73
|
+
|
74
|
+
@no_jruby
|
75
|
+
Scenario: Don't overwrite indexes
|
76
|
+
Given test files:
|
77
|
+
| mm8_chr7_tiny.maf |
|
78
|
+
When I run `maf_bgzip --index mm8_chr7_tiny.maf`
|
79
|
+
And I run `rm mm8_chr7_tiny.maf.bgz`
|
80
|
+
And I run `maf_bgzip --index mm8_chr7_tiny.maf`
|
81
|
+
Then it should fail with:
|
82
|
+
"""
|
83
|
+
exists
|
84
|
+
"""
|
85
|
+
|
86
|
+
@no_jruby
|
87
|
+
Scenario: Overwrite MAF files with --force
|
88
|
+
Given test files:
|
89
|
+
| mm8.chrM.maf |
|
90
|
+
| mm8.chrM.maf.bgz |
|
91
|
+
When I run `maf_bgzip --force mm8.chrM.maf`
|
92
|
+
Then it should pass with:
|
93
|
+
"""
|
94
|
+
"""
|
95
|
+
|
96
|
+
@no_jruby
|
97
|
+
Scenario: Overwrite indexes with --force
|
98
|
+
Given test files:
|
99
|
+
| mm8_chr7_tiny.maf |
|
100
|
+
When I run `maf_bgzip --index mm8_chr7_tiny.maf`
|
101
|
+
And I run `rm mm8_chr7_tiny.maf.bgz`
|
102
|
+
And I run `maf_bgzip --force --index mm8_chr7_tiny.maf`
|
103
|
+
Then it should pass with:
|
104
|
+
"""
|
105
|
+
"""
|
106
|
+
|
107
|
+
|
@@ -39,6 +39,12 @@ Feature: Indexed access to MAF files
|
|
39
39
|
And sequence mm8.chr7 of block 0 has start 80082368
|
40
40
|
And sequence mm8.chr7 of block 1 has start 80082471
|
41
41
|
|
42
|
+
Scenario: Index MAF file with extended bin positions
|
43
|
+
Given a MAF source file "ext-bin.maf"
|
44
|
+
When I open it with a MAF reader
|
45
|
+
And build an index on all sequences
|
46
|
+
Then the index has at least 18 entries
|
47
|
+
|
42
48
|
@no_jruby
|
43
49
|
Scenario: Build MAF index with CLI tool
|
44
50
|
Given test files:
|
@@ -42,3 +42,20 @@ Feature: Parse MAF files
|
|
42
42
|
And sequence 0 has text "ACA-TTACT"
|
43
43
|
And sequence 1 has strand :-
|
44
44
|
|
45
|
+
Scenario: Read alignment block, folded to upper case
|
46
|
+
Given MAF data:
|
47
|
+
"""
|
48
|
+
##maf version=1 scoring=humor.v4
|
49
|
+
# humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet300/chr1.maf
|
50
|
+
# /cluster/data/hg15/bed/blastz.rn3/axtNet300/chr1.maf
|
51
|
+
|
52
|
+
a score=0.128
|
53
|
+
s human_hoxa 100 8 + 100257 aca-ttact
|
54
|
+
s horse_hoxa 120 9 - 98892 acaattgct
|
55
|
+
s fugu_hoxa 88 7 + 90788 aca--tgct
|
56
|
+
"""
|
57
|
+
When I enable the :upcase parser option
|
58
|
+
And I open it with a MAF reader
|
59
|
+
Then an alignment block can be obtained
|
60
|
+
And the alignment block has 3 sequences
|
61
|
+
And sequence 0 has text "ACA-TTACT"
|
@@ -82,3 +82,14 @@ Feature: Filter results from MAF files
|
|
82
82
|
And I run `maf_extract -m mm8.chrM.maf.bgz --interval mm8.chrM:6938-13030 -o m2.maf`
|
83
83
|
And I run `diff m1.maf m2.maf`
|
84
84
|
Then the exit status should be 0
|
85
|
+
|
86
|
+
@no_jruby
|
87
|
+
Scenario: One-based indexing with maf_extract
|
88
|
+
Given test files:
|
89
|
+
| mm8_chr7_tiny.maf |
|
90
|
+
| mm8_chr7_tiny.kct |
|
91
|
+
When I run `sh -c 'maf_extract -d . --one-based --interval mm8.chr7:80082592-80082713 | grep "^a" | wc -l'`
|
92
|
+
Then it should pass with:
|
93
|
+
"""
|
94
|
+
2
|
95
|
+
"""
|
data/features/slice.feature
CHANGED
@@ -18,6 +18,17 @@ Feature: MAF slicing
|
|
18
18
|
And write all the matched blocks
|
19
19
|
Then the output should match, except whitespace, "mm8_chr7_tiny_slice1.maf"
|
20
20
|
|
21
|
+
Scenario: Interval covering two blocks, using directory access, counting
|
22
|
+
Given indexed MAF files in "test/data"
|
23
|
+
When I enable the :remove_gaps parser option
|
24
|
+
And filter for only the species
|
25
|
+
| mm8 |
|
26
|
+
| rn4 |
|
27
|
+
And I extract a slice over the genomic interval
|
28
|
+
| chrom | start | end |
|
29
|
+
| mm8.chr7 | 80082350 | 80082380 |
|
30
|
+
Then 2 blocks are obtained
|
31
|
+
|
21
32
|
Scenario: Interval covering two blocks, using directory access
|
22
33
|
Given indexed MAF files in "test/data"
|
23
34
|
When I enable the :remove_gaps parser option
|
data/features/tiling.feature
CHANGED
@@ -160,7 +160,7 @@ Feature: Join alignment blocks with reference data
|
|
160
160
|
| gap-sp1.fa.gz |
|
161
161
|
| gap-1.maf |
|
162
162
|
| gap-1.kct |
|
163
|
-
When I run `maf_tile --reference gap-sp1.fa.gz --interval 0
|
163
|
+
When I run `maf_tile --reference gap-sp1.fa.gz --interval 0-50 -s sp1:mouse -s sp2:nautilus -s sp3:jaguar gap-1.maf gap-1.kct`
|
164
164
|
Then it should pass with:
|
165
165
|
"""
|
166
166
|
>mouse
|
@@ -176,7 +176,7 @@ Feature: Join alignment blocks with reference data
|
|
176
176
|
Given test files:
|
177
177
|
| gap-1.maf |
|
178
178
|
| gap-1.kct |
|
179
|
-
When I run `maf_tile --interval 0
|
179
|
+
When I run `maf_tile --interval 0-50 -s sp1:mouse -s sp2:nautilus -s sp3:jaguar gap-1.maf gap-1.kct`
|
180
180
|
Then it should pass with:
|
181
181
|
"""
|
182
182
|
>mouse
|
@@ -198,7 +198,10 @@ Feature: Join alignment blocks with reference data
|
|
198
198
|
sp1.chr1 12 36
|
199
199
|
"""
|
200
200
|
When I run `maf_tile -s sp1:mouse -s sp2:nautilus -s sp3:jaguar --output-base selected --bed example.bed --reference gap-sp1.fa.gz gap-1.maf gap-1.kct`
|
201
|
-
Then the file "selected_12-36.fa" should contain exactly:
|
201
|
+
Then it should pass with:
|
202
|
+
"""
|
203
|
+
"""
|
204
|
+
And the file "selected_12-36.fa" should contain exactly:
|
202
205
|
"""
|
203
206
|
>mouse
|
204
207
|
GCTGAGGGC--AGTTGTGTCAGGGCG
|
@@ -214,7 +217,7 @@ Feature: Join alignment blocks with reference data
|
|
214
217
|
Given test files:
|
215
218
|
| mm8_chr7_tiny.maf |
|
216
219
|
| mm8_chr7_tiny.kct |
|
217
|
-
When I run `maf_tile -s mm8 -s rn4 -s hg18 --interval 80082334
|
220
|
+
When I run `maf_tile -s mm8 -s rn4 -s hg18 --interval 80082334-80082344 mm8_chr7_tiny.maf`
|
218
221
|
Then it should pass with:
|
219
222
|
"""
|
220
223
|
>mm8
|
@@ -230,7 +233,7 @@ Feature: Join alignment blocks with reference data
|
|
230
233
|
Given test files:
|
231
234
|
| mm8_chr7_tiny.maf |
|
232
235
|
| mm8_chr7_tiny.kct |
|
233
|
-
When I run `maf_tile -s mm8 -s rn4 -s hg18 --interval mm8.chr7:80082334
|
236
|
+
When I run `maf_tile -s mm8 -s rn4 -s hg18 --interval mm8.chr7:80082334-80082344 .`
|
234
237
|
Then it should pass with:
|
235
238
|
"""
|
236
239
|
>mm8
|
@@ -241,3 +244,18 @@ Feature: Join alignment blocks with reference data
|
|
241
244
|
--------GG
|
242
245
|
"""
|
243
246
|
|
247
|
+
@no_jruby
|
248
|
+
Scenario: Tile with CLI tool and directory, 1-based
|
249
|
+
Given test files:
|
250
|
+
| mm8_chr7_tiny.maf |
|
251
|
+
| mm8_chr7_tiny.kct |
|
252
|
+
When I run `maf_tile -s mm8 -s rn4 -s hg18 --one-based --interval mm8.chr7:80082335-80082344 .`
|
253
|
+
Then it should pass with:
|
254
|
+
"""
|
255
|
+
>mm8
|
256
|
+
GGGCTGAGGG
|
257
|
+
>rn4
|
258
|
+
GGGCTGAGGG
|
259
|
+
>hg18
|
260
|
+
--------GG
|
261
|
+
"""
|
data/lib/bio-maf.rb
CHANGED
@@ -11,7 +11,11 @@
|
|
11
11
|
require 'bio-logger'
|
12
12
|
log = Bio::Log::LoggerPlus.new('bio-maf')
|
13
13
|
log.outputters = Bio::Log::Outputter.stderr
|
14
|
-
log.level =
|
14
|
+
log.level = if ENV['BIO_MAF_DEBUG']
|
15
|
+
Bio::Log::DEBUG
|
16
|
+
else
|
17
|
+
Bio::Log::WARN
|
18
|
+
end
|
15
19
|
|
16
20
|
require 'bio/ucsc'
|
17
21
|
require 'bio/maf'
|
data/lib/bio/maf.rb
CHANGED
data/lib/bio/maf/index.rb
CHANGED
@@ -125,7 +125,7 @@ module Bio
|
|
125
125
|
# @param [Enumerable<Bio::GenomicInterval>] intervals genomic
|
126
126
|
# intervals to parse.
|
127
127
|
# @yield [block] each {Block} matched, in turn
|
128
|
-
# @return [
|
128
|
+
# @return [Array<Block>] each matching {Block}, if no block given
|
129
129
|
# @api public
|
130
130
|
# @see KyotoIndex#find
|
131
131
|
def find(intervals, &blk)
|
@@ -137,13 +137,16 @@ module Bio
|
|
137
137
|
end
|
138
138
|
end
|
139
139
|
by_chrom.each do |chrom, c_intervals|
|
140
|
-
|
141
|
-
|
142
|
-
|
140
|
+
with_index(chrom) do |index|
|
141
|
+
with_parser(chrom) do |parser|
|
142
|
+
index.find(c_intervals, parser, block_filter, &blk)
|
143
|
+
end
|
143
144
|
end
|
144
145
|
end
|
145
146
|
else
|
146
|
-
|
147
|
+
acc = []
|
148
|
+
self.find(intervals) { |block| acc << block }
|
149
|
+
acc
|
147
150
|
end
|
148
151
|
end
|
149
152
|
|
@@ -156,13 +159,14 @@ module Bio
|
|
156
159
|
# @yield [tiler] a {Tiler} ready to operate on the given interval
|
157
160
|
# @api public
|
158
161
|
def tile(interval)
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
162
|
+
with_index(interval.chrom) do |index|
|
163
|
+
with_parser(interval.chrom) do |parser|
|
164
|
+
tiler = Tiler.new
|
165
|
+
tiler.index = index
|
166
|
+
tiler.parser = parser
|
167
|
+
tiler.interval = interval
|
168
|
+
yield tiler
|
169
|
+
end
|
166
170
|
end
|
167
171
|
end
|
168
172
|
|
@@ -172,13 +176,15 @@ module Bio
|
|
172
176
|
#
|
173
177
|
# @param [Bio::GenomicInterval] interval interval to search
|
174
178
|
# @yield [block] each {Block} matched, in turn
|
175
|
-
# @return [
|
179
|
+
# @return [Array<Block>] each matching {Block}, if no block given
|
176
180
|
# @api public
|
177
181
|
# @see KyotoIndex#slice
|
178
182
|
def slice(interval, &blk)
|
179
|
-
|
180
|
-
|
181
|
-
|
183
|
+
with_index(interval.chrom) do |index|
|
184
|
+
with_parser(interval.chrom) do |parser|
|
185
|
+
s = index.slice(interval, parser, block_filter, &blk)
|
186
|
+
block_given? ? s : s.to_a
|
187
|
+
end
|
182
188
|
end
|
183
189
|
end
|
184
190
|
|
@@ -193,12 +199,17 @@ module Bio
|
|
193
199
|
scan_dir(options[:dir])
|
194
200
|
elsif options[:maf]
|
195
201
|
if options[:index]
|
196
|
-
|
202
|
+
LOG.debug { "Opening index file #{options[:index]}" }
|
203
|
+
index = KyotoIndex.open(options[:index])
|
204
|
+
register_index(index,
|
197
205
|
options[:maf])
|
206
|
+
index.close
|
198
207
|
else
|
199
|
-
|
200
|
-
if
|
201
|
-
|
208
|
+
idx_f = find_index_file(options[:maf])
|
209
|
+
if idx_f
|
210
|
+
index = KyotoIndex.open(idx_f)
|
211
|
+
register_index(index, options[:maf])
|
212
|
+
index.close
|
202
213
|
end
|
203
214
|
end
|
204
215
|
else
|
@@ -229,7 +240,11 @@ module Bio
|
|
229
240
|
unless index.maf_file == File.basename(maf)
|
230
241
|
raise "Index #{index.path} was created for #{index.maf_file}, not #{File.basename(maf)}!"
|
231
242
|
end
|
232
|
-
|
243
|
+
if index.path.to_s.start_with? '%'
|
244
|
+
@indices[index.ref_seq] = index
|
245
|
+
else
|
246
|
+
@indices[index.ref_seq] = index.path.to_s
|
247
|
+
end
|
233
248
|
@maf_by_chrom[index.ref_seq] = maf
|
234
249
|
end
|
235
250
|
|
@@ -241,6 +256,7 @@ module Bio
|
|
241
256
|
if File.exist? maf
|
242
257
|
register_index(index, maf)
|
243
258
|
end
|
259
|
+
index.close
|
244
260
|
end
|
245
261
|
end
|
246
262
|
|
@@ -249,7 +265,23 @@ module Bio
|
|
249
265
|
unless @indices.has_key? chrom
|
250
266
|
raise "No index available for chromosome #{chrom}!"
|
251
267
|
end
|
252
|
-
@indices[chrom]
|
268
|
+
index = @indices[chrom]
|
269
|
+
if index.is_a? KyotoIndex
|
270
|
+
# temporary
|
271
|
+
index
|
272
|
+
else
|
273
|
+
KyotoIndex.open(index)
|
274
|
+
end
|
275
|
+
end
|
276
|
+
|
277
|
+
def with_index(chrom)
|
278
|
+
index = chrom_index(chrom)
|
279
|
+
LOG.debug { "Selected index #{index} for sequence #{chrom}." }
|
280
|
+
begin
|
281
|
+
yield index
|
282
|
+
ensure
|
283
|
+
index.close unless index.path.to_s.start_with? '%'
|
284
|
+
end
|
253
285
|
end
|
254
286
|
|
255
287
|
# @api private
|
@@ -403,7 +435,7 @@ module Bio
|
|
403
435
|
def find(intervals, parser, filter={}, &blk)
|
404
436
|
start = Time.now
|
405
437
|
fl = fetch_list(intervals, filter)
|
406
|
-
LOG.debug { sprintf("Built fetch list of %d items in %.3fs",
|
438
|
+
LOG.debug { sprintf("Built fetch list of %d items in %.3fs.",
|
407
439
|
fl.size,
|
408
440
|
Time.now - start) }
|
409
441
|
if ! fl.empty?
|
@@ -426,6 +458,7 @@ module Bio
|
|
426
458
|
yield block.slice(interval)
|
427
459
|
end
|
428
460
|
else
|
461
|
+
LOG.debug { "accumulating results of #slice" }
|
429
462
|
enum_for(:slice, interval, parser, filter)
|
430
463
|
end
|
431
464
|
end
|
@@ -436,6 +469,7 @@ module Bio
|
|
436
469
|
def initialize(path, db_arg=nil)
|
437
470
|
@species = {}
|
438
471
|
@species_max_id = -1
|
472
|
+
@index_sequences = {}
|
439
473
|
@max_sid = -1
|
440
474
|
if db_arg || ((path.size > 1) and File.exist?(path))
|
441
475
|
mode = KyotoCabinet::DB::OREADER
|
@@ -444,15 +478,25 @@ module Bio
|
|
444
478
|
end
|
445
479
|
@db = db_arg || KyotoCabinet::DB.new
|
446
480
|
@path = path
|
447
|
-
|
481
|
+
path_str = "#{path.to_s}#opts=ls#dfunit=100000"
|
482
|
+
unless db_arg || db.open(path_str, mode)
|
448
483
|
raise "Could not open DB file!"
|
449
484
|
end
|
450
485
|
if mode == KyotoCabinet::DB::OREADER
|
486
|
+
version = db[FORMAT_VERSION_KEY].to_i
|
487
|
+
if version != FORMAT_VERSION
|
488
|
+
raise "Index #{path} is version #{version}, expecting version #{FORMAT_VERSION}!"
|
489
|
+
end
|
451
490
|
@maf_file = db[FILE_KEY]
|
452
491
|
self.ref_seq = db[REF_SEQ_KEY]
|
453
492
|
load_index_sequences
|
454
493
|
load_species
|
455
494
|
end
|
495
|
+
@mutex = Mutex.new
|
496
|
+
end
|
497
|
+
|
498
|
+
def to_s
|
499
|
+
"#<KyotoIndex path=#{path}>"
|
456
500
|
end
|
457
501
|
|
458
502
|
# Reopen the same DB handle read-only. Only useful for unit tests.
|
@@ -576,6 +620,11 @@ module Bio
|
|
576
620
|
end
|
577
621
|
|
578
622
|
def scan_bins_parallel(chrom_id, bin_intervals, filters)
|
623
|
+
LOG.debug {
|
624
|
+
sprintf("Beginning scan of %d bin intervals %s filters.",
|
625
|
+
bin_intervals.size,
|
626
|
+
filters.empty? ? "without" : "with")
|
627
|
+
}
|
579
628
|
start = Time.now
|
580
629
|
n_threads = ENV['profile'] ? 1 : java.lang.Runtime.runtime.availableProcessors
|
581
630
|
jobs = java.util.concurrent.ConcurrentLinkedQueue.new(bin_intervals.to_a)
|
@@ -603,7 +652,7 @@ module Bio
|
|
603
652
|
n_completed += 1
|
604
653
|
end
|
605
654
|
threads.each { |t| t.join }
|
606
|
-
LOG.debug { sprintf("Matched %d index records with %d threads in %.3f seconds",
|
655
|
+
LOG.debug { sprintf("Matched %d index records with %d threads in %.3f seconds.",
|
607
656
|
to_fetch.size, n_threads, Time.now - start) }
|
608
657
|
to_fetch
|
609
658
|
end
|
@@ -676,30 +725,55 @@ module Bio
|
|
676
725
|
|| gi.include?(i_start)
|
677
726
|
end
|
678
727
|
|
679
|
-
|
680
|
-
|
728
|
+
CHUNK_THRESHOLD_BYTES = 50 * 1024 * 1024
|
729
|
+
CHUNK_THRESHOLD_BLOCKS = 1000
|
730
|
+
|
731
|
+
def prep(file_spec, compression, ref_only)
|
732
|
+
db[FORMAT_VERSION_KEY] = FORMAT_VERSION
|
733
|
+
db[FILE_KEY] = File.basename(file_spec)
|
681
734
|
@maf_file = db[FILE_KEY]
|
682
|
-
if
|
683
|
-
db[COMPRESSION_KEY] =
|
735
|
+
if compression
|
736
|
+
db[COMPRESSION_KEY] = compression.to_s
|
684
737
|
end
|
685
|
-
first_block = parser.parse_block
|
686
|
-
self.ref_seq = first_block.sequences.first.source
|
687
738
|
@ref_only = ref_only
|
688
|
-
|
689
|
-
|
690
|
-
|
691
|
-
|
739
|
+
@seen_first = false
|
740
|
+
end
|
741
|
+
|
742
|
+
def build(parser, ref_only=true)
|
743
|
+
prep(parser.file_spec,
|
744
|
+
parser.compression,
|
745
|
+
ref_only)
|
746
|
+
|
692
747
|
n = 0
|
693
|
-
|
694
|
-
|
695
|
-
|
748
|
+
acc = []
|
749
|
+
acc_bytes = 0
|
750
|
+
parser.each_block do |block|
|
751
|
+
acc << block
|
752
|
+
acc_bytes += block.size
|
753
|
+
if acc_bytes > CHUNK_THRESHOLD_BYTES \
|
754
|
+
|| acc.size > CHUNK_THRESHOLD_BLOCKS
|
755
|
+
index_blocks(acc)
|
756
|
+
acc = []
|
757
|
+
acc_bytes = 0
|
758
|
+
end
|
759
|
+
n += 1
|
696
760
|
end
|
761
|
+
index_blocks(acc)
|
697
762
|
LOG.debug { "Created index for #{n} blocks and #{@index_sequences.size} sequences." }
|
698
763
|
db.synchronize(true)
|
699
764
|
end
|
700
765
|
|
701
766
|
def index_blocks(blocks)
|
702
|
-
h =
|
767
|
+
h = @mutex.synchronize do
|
768
|
+
if ! @seen_first
|
769
|
+
# set the reference sequence from the first block
|
770
|
+
first_block = blocks.first
|
771
|
+
self.ref_seq = first_block.sequences.first.source
|
772
|
+
db[REF_SEQ_KEY] = ref_seq
|
773
|
+
@seen_first = true
|
774
|
+
end
|
775
|
+
blocks.map { |b| entries_for(b) }.reduce(:merge!)
|
776
|
+
end
|
703
777
|
db.set_bulk(h, false)
|
704
778
|
end
|
705
779
|
|
@@ -719,8 +793,11 @@ module Bio
|
|
719
793
|
if ! sid
|
720
794
|
@max_sid += 1
|
721
795
|
sid = @max_sid
|
722
|
-
|
723
|
-
|
796
|
+
# "" << foo is hideous but apparently what it takes to get a
|
797
|
+
# non-shared copy of a string on JRuby...
|
798
|
+
name_copy = "" << name
|
799
|
+
db.set("sequence:#{name_copy}", sid.to_s)
|
800
|
+
index_sequences[name_copy] = sid
|
724
801
|
end
|
725
802
|
return sid
|
726
803
|
end
|
@@ -739,22 +816,24 @@ module Bio
|
|
739
816
|
# example: otoGar1.scaffold_104707.1-93001
|
740
817
|
parts = seq.split('.', 2)
|
741
818
|
if parts.size == 2
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
else
|
746
|
-
species_id = @species_max_id + 1
|
747
|
-
if species_id >= MAX_SPECIES
|
748
|
-
raise "cannot index MAF file with more than #{MAX_SPECIES} species"
|
749
|
-
end
|
750
|
-
species[species_name] = species_id
|
751
|
-
db["species:#{species_name}"] = species_id
|
752
|
-
@species_max_id = species_id
|
753
|
-
return species_id
|
754
|
-
end
|
819
|
+
# "" << foo is hideous but apparently what it takes to get a
|
820
|
+
# non-shared copy of a string on JRuby...
|
821
|
+
species_name = "" << parts[0]
|
755
822
|
else
|
756
823
|
# not in species.sequence format, apparently
|
757
|
-
|
824
|
+
species_name = "" << seq
|
825
|
+
end
|
826
|
+
if species.has_key? species_name
|
827
|
+
return species[species_name]
|
828
|
+
else
|
829
|
+
species_id = @species_max_id + 1
|
830
|
+
if species_id >= MAX_SPECIES
|
831
|
+
raise "cannot index MAF file with more than #{MAX_SPECIES} species"
|
832
|
+
end
|
833
|
+
species[species_name] = species_id
|
834
|
+
db["species:#{species_name}"] = species_id
|
835
|
+
@species_max_id = species_id
|
836
|
+
return species_id
|
758
837
|
end
|
759
838
|
end
|
760
839
|
|
@@ -769,20 +848,27 @@ module Bio
|
|
769
848
|
end
|
770
849
|
|
771
850
|
def entries_for(block)
|
772
|
-
|
773
|
-
|
774
|
-
|
775
|
-
|
776
|
-
|
777
|
-
|
778
|
-
|
779
|
-
|
780
|
-
|
781
|
-
|
782
|
-
|
783
|
-
|
851
|
+
begin
|
852
|
+
unless block.ref_seq.source == @ref_seq
|
853
|
+
raise "Inconsistent reference sequence: expected #{@ref_seq}, got #{block.ref_seq.source}"
|
854
|
+
end
|
855
|
+
h = {}
|
856
|
+
val = build_block_value(block)
|
857
|
+
to_index = ref_only ? [block.sequences.first] : block.sequences
|
858
|
+
to_index.each do |seq|
|
859
|
+
seq_id = seq_id_for(seq.source)
|
860
|
+
# size 0 occurs in e.g. upstream1000.maf.gz
|
861
|
+
next if seq.size == 0
|
862
|
+
seq_end = seq.start + seq.size
|
863
|
+
bin = Bio::Ucsc::UcscBin.bin_from_range(seq.start, seq_end)
|
864
|
+
key = [255, seq_id, bin, seq.start, seq_end].pack(KEY_FMT)
|
865
|
+
h[key] = val
|
866
|
+
end
|
867
|
+
return h
|
868
|
+
rescue Exception => e
|
869
|
+
LOG.error "Failed to index block at offset #{block.offset}:\n#{block}"
|
870
|
+
raise e
|
784
871
|
end
|
785
|
-
return h
|
786
872
|
end
|
787
873
|
end # class KyotoIndex
|
788
874
|
|
@@ -861,6 +947,10 @@ module Bio
|
|
861
947
|
@l = l
|
862
948
|
end
|
863
949
|
|
950
|
+
def empty?
|
951
|
+
@l.empty?
|
952
|
+
end
|
953
|
+
|
864
954
|
def match(entry)
|
865
955
|
return ! @l.find { |f| ! f.call(entry) }
|
866
956
|
end
|