bio-maf 1.0.0-java → 1.0.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,11 +2,11 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = "bio-maf"
5
- s.version = "1.0.0"
5
+ s.version = "1.0.1"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Clayton Wheeler"]
9
- s.date = "2012-08-02"
9
+ s.date = "2012-08-08"
10
10
  s.description = "Multiple Alignment Format parser for BioRuby."
11
11
  s.email = "cswh@umich.edu"
12
12
  s.extra_rdoc_files = [
@@ -32,7 +32,7 @@ Gem::Specification.new do |s|
32
32
  end
33
33
 
34
34
  s.add_runtime_dependency('bio-alignment', ["~> 0.0.7"])
35
- s.add_runtime_dependency('bio-bgzf', ["~> 0.2.0"])
35
+ s.add_runtime_dependency('bio-bgzf', ["~> 0.2.1"])
36
36
  s.add_runtime_dependency('bio-genomic-interval', ["~> 0.1.2"])
37
37
  s.add_runtime_dependency('bio-logger', ["~> 1.0.1"])
38
38
  if RUBY_PLATFORM == 'java'
@@ -60,3 +60,48 @@ Feature: BGZF compression
60
60
  And a file named "mm8_chr7_tiny.maf.bgz" should exist
61
61
  And a file named "mm8.chrM.maf.bgz" should exist
62
62
 
63
+ @no_jruby
64
+ Scenario: Don't overwrite MAF files
65
+ Given test files:
66
+ | mm8.chrM.maf |
67
+ | mm8.chrM.maf.bgz |
68
+ When I run `maf_bgzip mm8.chrM.maf`
69
+ Then it should fail with:
70
+ """
71
+ exists
72
+ """
73
+
74
+ @no_jruby
75
+ Scenario: Don't overwrite indexes
76
+ Given test files:
77
+ | mm8_chr7_tiny.maf |
78
+ When I run `maf_bgzip --index mm8_chr7_tiny.maf`
79
+ And I run `rm mm8_chr7_tiny.maf.bgz`
80
+ And I run `maf_bgzip --index mm8_chr7_tiny.maf`
81
+ Then it should fail with:
82
+ """
83
+ exists
84
+ """
85
+
86
+ @no_jruby
87
+ Scenario: Overwrite MAF files with --force
88
+ Given test files:
89
+ | mm8.chrM.maf |
90
+ | mm8.chrM.maf.bgz |
91
+ When I run `maf_bgzip --force mm8.chrM.maf`
92
+ Then it should pass with:
93
+ """
94
+ """
95
+
96
+ @no_jruby
97
+ Scenario: Overwrite indexes with --force
98
+ Given test files:
99
+ | mm8_chr7_tiny.maf |
100
+ When I run `maf_bgzip --index mm8_chr7_tiny.maf`
101
+ And I run `rm mm8_chr7_tiny.maf.bgz`
102
+ And I run `maf_bgzip --force --index mm8_chr7_tiny.maf`
103
+ Then it should pass with:
104
+ """
105
+ """
106
+
107
+
@@ -39,6 +39,12 @@ Feature: Indexed access to MAF files
39
39
  And sequence mm8.chr7 of block 0 has start 80082368
40
40
  And sequence mm8.chr7 of block 1 has start 80082471
41
41
 
42
+ Scenario: Index MAF file with extended bin positions
43
+ Given a MAF source file "ext-bin.maf"
44
+ When I open it with a MAF reader
45
+ And build an index on all sequences
46
+ Then the index has at least 18 entries
47
+
42
48
  @no_jruby
43
49
  Scenario: Build MAF index with CLI tool
44
50
  Given test files:
@@ -42,3 +42,20 @@ Feature: Parse MAF files
42
42
  And sequence 0 has text "ACA-TTACT"
43
43
  And sequence 1 has strand :-
44
44
 
45
+ Scenario: Read alignment block, folded to upper case
46
+ Given MAF data:
47
+ """
48
+ ##maf version=1 scoring=humor.v4
49
+ # humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet300/chr1.maf
50
+ # /cluster/data/hg15/bed/blastz.rn3/axtNet300/chr1.maf
51
+
52
+ a score=0.128
53
+ s human_hoxa 100 8 + 100257 aca-ttact
54
+ s horse_hoxa 120 9 - 98892 acaattgct
55
+ s fugu_hoxa 88 7 + 90788 aca--tgct
56
+ """
57
+ When I enable the :upcase parser option
58
+ And I open it with a MAF reader
59
+ Then an alignment block can be obtained
60
+ And the alignment block has 3 sequences
61
+ And sequence 0 has text "ACA-TTACT"
@@ -82,3 +82,14 @@ Feature: Filter results from MAF files
82
82
  And I run `maf_extract -m mm8.chrM.maf.bgz --interval mm8.chrM:6938-13030 -o m2.maf`
83
83
  And I run `diff m1.maf m2.maf`
84
84
  Then the exit status should be 0
85
+
86
+ @no_jruby
87
+ Scenario: One-based indexing with maf_extract
88
+ Given test files:
89
+ | mm8_chr7_tiny.maf |
90
+ | mm8_chr7_tiny.kct |
91
+ When I run `sh -c 'maf_extract -d . --one-based --interval mm8.chr7:80082592-80082713 | grep "^a" | wc -l'`
92
+ Then it should pass with:
93
+ """
94
+ 2
95
+ """
@@ -18,6 +18,17 @@ Feature: MAF slicing
18
18
  And write all the matched blocks
19
19
  Then the output should match, except whitespace, "mm8_chr7_tiny_slice1.maf"
20
20
 
21
+ Scenario: Interval covering two blocks, using directory access, counting
22
+ Given indexed MAF files in "test/data"
23
+ When I enable the :remove_gaps parser option
24
+ And filter for only the species
25
+ | mm8 |
26
+ | rn4 |
27
+ And I extract a slice over the genomic interval
28
+ | chrom | start | end |
29
+ | mm8.chr7 | 80082350 | 80082380 |
30
+ Then 2 blocks are obtained
31
+
21
32
  Scenario: Interval covering two blocks, using directory access
22
33
  Given indexed MAF files in "test/data"
23
34
  When I enable the :remove_gaps parser option
@@ -30,6 +30,7 @@ end
30
30
  Then /^an alignment block can be obtained$/ do
31
31
  @block = @parser.parse_block
32
32
  @block.should_not be_nil
33
+ @block.is_a?(Bio::MAF::Block).should be_true
33
34
  end
34
35
 
35
36
  Then /^the alignment block has (\d+) sequences$/ do |n_seq|
@@ -160,7 +160,7 @@ Feature: Join alignment blocks with reference data
160
160
  | gap-sp1.fa.gz |
161
161
  | gap-1.maf |
162
162
  | gap-1.kct |
163
- When I run `maf_tile --reference gap-sp1.fa.gz --interval 0:50 -s sp1:mouse -s sp2:nautilus -s sp3:jaguar gap-1.maf gap-1.kct`
163
+ When I run `maf_tile --reference gap-sp1.fa.gz --interval 0-50 -s sp1:mouse -s sp2:nautilus -s sp3:jaguar gap-1.maf gap-1.kct`
164
164
  Then it should pass with:
165
165
  """
166
166
  >mouse
@@ -176,7 +176,7 @@ Feature: Join alignment blocks with reference data
176
176
  Given test files:
177
177
  | gap-1.maf |
178
178
  | gap-1.kct |
179
- When I run `maf_tile --interval 0:50 -s sp1:mouse -s sp2:nautilus -s sp3:jaguar gap-1.maf gap-1.kct`
179
+ When I run `maf_tile --interval 0-50 -s sp1:mouse -s sp2:nautilus -s sp3:jaguar gap-1.maf gap-1.kct`
180
180
  Then it should pass with:
181
181
  """
182
182
  >mouse
@@ -198,7 +198,10 @@ Feature: Join alignment blocks with reference data
198
198
  sp1.chr1 12 36
199
199
  """
200
200
  When I run `maf_tile -s sp1:mouse -s sp2:nautilus -s sp3:jaguar --output-base selected --bed example.bed --reference gap-sp1.fa.gz gap-1.maf gap-1.kct`
201
- Then the file "selected_12-36.fa" should contain exactly:
201
+ Then it should pass with:
202
+ """
203
+ """
204
+ And the file "selected_12-36.fa" should contain exactly:
202
205
  """
203
206
  >mouse
204
207
  GCTGAGGGC--AGTTGTGTCAGGGCG
@@ -214,7 +217,7 @@ Feature: Join alignment blocks with reference data
214
217
  Given test files:
215
218
  | mm8_chr7_tiny.maf |
216
219
  | mm8_chr7_tiny.kct |
217
- When I run `maf_tile -s mm8 -s rn4 -s hg18 --interval 80082334:80082344 mm8_chr7_tiny.maf`
220
+ When I run `maf_tile -s mm8 -s rn4 -s hg18 --interval 80082334-80082344 mm8_chr7_tiny.maf`
218
221
  Then it should pass with:
219
222
  """
220
223
  >mm8
@@ -230,7 +233,7 @@ Feature: Join alignment blocks with reference data
230
233
  Given test files:
231
234
  | mm8_chr7_tiny.maf |
232
235
  | mm8_chr7_tiny.kct |
233
- When I run `maf_tile -s mm8 -s rn4 -s hg18 --interval mm8.chr7:80082334:80082344 .`
236
+ When I run `maf_tile -s mm8 -s rn4 -s hg18 --interval mm8.chr7:80082334-80082344 .`
234
237
  Then it should pass with:
235
238
  """
236
239
  >mm8
@@ -241,3 +244,18 @@ Feature: Join alignment blocks with reference data
241
244
  --------GG
242
245
  """
243
246
 
247
+ @no_jruby
248
+ Scenario: Tile with CLI tool and directory, 1-based
249
+ Given test files:
250
+ | mm8_chr7_tiny.maf |
251
+ | mm8_chr7_tiny.kct |
252
+ When I run `maf_tile -s mm8 -s rn4 -s hg18 --one-based --interval mm8.chr7:80082335-80082344 .`
253
+ Then it should pass with:
254
+ """
255
+ >mm8
256
+ GGGCTGAGGG
257
+ >rn4
258
+ GGGCTGAGGG
259
+ >hg18
260
+ --------GG
261
+ """
@@ -11,7 +11,11 @@
11
11
  require 'bio-logger'
12
12
  log = Bio::Log::LoggerPlus.new('bio-maf')
13
13
  log.outputters = Bio::Log::Outputter.stderr
14
- log.level = Bio::Log::WARN
14
+ log.level = if ENV['BIO_MAF_DEBUG']
15
+ Bio::Log::DEBUG
16
+ else
17
+ Bio::Log::WARN
18
+ end
15
19
 
16
20
  require 'bio/ucsc'
17
21
  require 'bio/maf'
@@ -4,3 +4,4 @@ require 'bio/maf/index'
4
4
  require 'bio/maf/parser'
5
5
  require 'bio/maf/writer'
6
6
  require 'bio/maf/tiler'
7
+ require 'bio/maf/jobs'
@@ -125,7 +125,7 @@ module Bio
125
125
  # @param [Enumerable<Bio::GenomicInterval>] intervals genomic
126
126
  # intervals to parse.
127
127
  # @yield [block] each {Block} matched, in turn
128
- # @return [Enumerable<Block>] each matching {Block}, if no block given
128
+ # @return [Array<Block>] each matching {Block}, if no block given
129
129
  # @api public
130
130
  # @see KyotoIndex#find
131
131
  def find(intervals, &blk)
@@ -137,13 +137,16 @@ module Bio
137
137
  end
138
138
  end
139
139
  by_chrom.each do |chrom, c_intervals|
140
- index = @indices[chrom]
141
- with_parser(chrom) do |parser|
142
- index.find(c_intervals, parser, block_filter, &blk)
140
+ with_index(chrom) do |index|
141
+ with_parser(chrom) do |parser|
142
+ index.find(c_intervals, parser, block_filter, &blk)
143
+ end
143
144
  end
144
145
  end
145
146
  else
146
- enum_for(:find, intervals)
147
+ acc = []
148
+ self.find(intervals) { |block| acc << block }
149
+ acc
147
150
  end
148
151
  end
149
152
 
@@ -156,13 +159,14 @@ module Bio
156
159
  # @yield [tiler] a {Tiler} ready to operate on the given interval
157
160
  # @api public
158
161
  def tile(interval)
159
- index = chrom_index(interval.chrom)
160
- with_parser(interval.chrom) do |parser|
161
- tiler = Tiler.new
162
- tiler.index = index
163
- tiler.parser = parser
164
- tiler.interval = interval
165
- yield tiler
162
+ with_index(interval.chrom) do |index|
163
+ with_parser(interval.chrom) do |parser|
164
+ tiler = Tiler.new
165
+ tiler.index = index
166
+ tiler.parser = parser
167
+ tiler.interval = interval
168
+ yield tiler
169
+ end
166
170
  end
167
171
  end
168
172
 
@@ -172,13 +176,15 @@ module Bio
172
176
  #
173
177
  # @param [Bio::GenomicInterval] interval interval to search
174
178
  # @yield [block] each {Block} matched, in turn
175
- # @return [Enumerable<Block>] each matching {Block}, if no block given
179
+ # @return [Array<Block>] each matching {Block}, if no block given
176
180
  # @api public
177
181
  # @see KyotoIndex#slice
178
182
  def slice(interval, &blk)
179
- index = chrom_index(interval.chrom)
180
- with_parser(interval.chrom) do |parser|
181
- index.slice(interval, parser, &blk)
183
+ with_index(interval.chrom) do |index|
184
+ with_parser(interval.chrom) do |parser|
185
+ s = index.slice(interval, parser, block_filter, &blk)
186
+ block_given? ? s : s.to_a
187
+ end
182
188
  end
183
189
  end
184
190
 
@@ -193,12 +199,17 @@ module Bio
193
199
  scan_dir(options[:dir])
194
200
  elsif options[:maf]
195
201
  if options[:index]
196
- register_index(KyotoIndex.open(options[:index]),
202
+ LOG.debug { "Opening index file #{options[:index]}" }
203
+ index = KyotoIndex.open(options[:index])
204
+ register_index(index,
197
205
  options[:maf])
206
+ index.close
198
207
  else
199
- idx = find_index_file(options[:maf])
200
- if idx
201
- register_index(KyotoIndex.open(idx), options[:maf])
208
+ idx_f = find_index_file(options[:maf])
209
+ if idx_f
210
+ index = KyotoIndex.open(idx_f)
211
+ register_index(index, options[:maf])
212
+ index.close
202
213
  end
203
214
  end
204
215
  else
@@ -229,7 +240,11 @@ module Bio
229
240
  unless index.maf_file == File.basename(maf)
230
241
  raise "Index #{index.path} was created for #{index.maf_file}, not #{File.basename(maf)}!"
231
242
  end
232
- @indices[index.ref_seq] = index
243
+ if index.path.to_s.start_with? '%'
244
+ @indices[index.ref_seq] = index
245
+ else
246
+ @indices[index.ref_seq] = index.path.to_s
247
+ end
233
248
  @maf_by_chrom[index.ref_seq] = maf
234
249
  end
235
250
 
@@ -241,6 +256,7 @@ module Bio
241
256
  if File.exist? maf
242
257
  register_index(index, maf)
243
258
  end
259
+ index.close
244
260
  end
245
261
  end
246
262
 
@@ -249,7 +265,23 @@ module Bio
249
265
  unless @indices.has_key? chrom
250
266
  raise "No index available for chromosome #{chrom}!"
251
267
  end
252
- @indices[chrom]
268
+ index = @indices[chrom]
269
+ if index.is_a? KyotoIndex
270
+ # temporary
271
+ index
272
+ else
273
+ KyotoIndex.open(index)
274
+ end
275
+ end
276
+
277
+ def with_index(chrom)
278
+ index = chrom_index(chrom)
279
+ LOG.debug { "Selected index #{index} for sequence #{chrom}." }
280
+ begin
281
+ yield index
282
+ ensure
283
+ index.close unless index.path.to_s.start_with? '%'
284
+ end
253
285
  end
254
286
 
255
287
  # @api private
@@ -403,7 +435,7 @@ module Bio
403
435
  def find(intervals, parser, filter={}, &blk)
404
436
  start = Time.now
405
437
  fl = fetch_list(intervals, filter)
406
- LOG.debug { sprintf("Built fetch list of %d items in %.3fs.\n",
438
+ LOG.debug { sprintf("Built fetch list of %d items in %.3fs.",
407
439
  fl.size,
408
440
  Time.now - start) }
409
441
  if ! fl.empty?
@@ -426,6 +458,7 @@ module Bio
426
458
  yield block.slice(interval)
427
459
  end
428
460
  else
461
+ LOG.debug { "accumulating results of #slice" }
429
462
  enum_for(:slice, interval, parser, filter)
430
463
  end
431
464
  end
@@ -436,6 +469,7 @@ module Bio
436
469
  def initialize(path, db_arg=nil)
437
470
  @species = {}
438
471
  @species_max_id = -1
472
+ @index_sequences = {}
439
473
  @max_sid = -1
440
474
  if db_arg || ((path.size > 1) and File.exist?(path))
441
475
  mode = KyotoCabinet::DB::OREADER
@@ -444,15 +478,25 @@ module Bio
444
478
  end
445
479
  @db = db_arg || KyotoCabinet::DB.new
446
480
  @path = path
447
- unless db_arg || db.open(path.to_s, mode)
481
+ path_str = "#{path.to_s}#opts=ls#dfunit=100000"
482
+ unless db_arg || db.open(path_str, mode)
448
483
  raise "Could not open DB file!"
449
484
  end
450
485
  if mode == KyotoCabinet::DB::OREADER
486
+ version = db[FORMAT_VERSION_KEY].to_i
487
+ if version != FORMAT_VERSION
488
+ raise "Index #{path} is version #{version}, expecting version #{FORMAT_VERSION}!"
489
+ end
451
490
  @maf_file = db[FILE_KEY]
452
491
  self.ref_seq = db[REF_SEQ_KEY]
453
492
  load_index_sequences
454
493
  load_species
455
494
  end
495
+ @mutex = Mutex.new
496
+ end
497
+
498
+ def to_s
499
+ "#<KyotoIndex path=#{path}>"
456
500
  end
457
501
 
458
502
  # Reopen the same DB handle read-only. Only useful for unit tests.
@@ -576,6 +620,11 @@ module Bio
576
620
  end
577
621
 
578
622
  def scan_bins_parallel(chrom_id, bin_intervals, filters)
623
+ LOG.debug {
624
+ sprintf("Beginning scan of %d bin intervals %s filters.",
625
+ bin_intervals.size,
626
+ filters.empty? ? "without" : "with")
627
+ }
579
628
  start = Time.now
580
629
  n_threads = ENV['profile'] ? 1 : java.lang.Runtime.runtime.availableProcessors
581
630
  jobs = java.util.concurrent.ConcurrentLinkedQueue.new(bin_intervals.to_a)
@@ -603,7 +652,7 @@ module Bio
603
652
  n_completed += 1
604
653
  end
605
654
  threads.each { |t| t.join }
606
- LOG.debug { sprintf("Matched %d index records with %d threads in %.3f seconds.\n",
655
+ LOG.debug { sprintf("Matched %d index records with %d threads in %.3f seconds.",
607
656
  to_fetch.size, n_threads, Time.now - start) }
608
657
  to_fetch
609
658
  end
@@ -676,30 +725,55 @@ module Bio
676
725
  || gi.include?(i_start)
677
726
  end
678
727
 
679
- def build(parser, ref_only=true)
680
- db[FILE_KEY] = File.basename(parser.file_spec)
728
+ CHUNK_THRESHOLD_BYTES = 50 * 1024 * 1024
729
+ CHUNK_THRESHOLD_BLOCKS = 1000
730
+
731
+ def prep(file_spec, compression, ref_only)
732
+ db[FORMAT_VERSION_KEY] = FORMAT_VERSION
733
+ db[FILE_KEY] = File.basename(file_spec)
681
734
  @maf_file = db[FILE_KEY]
682
- if parser.compression
683
- db[COMPRESSION_KEY] = parser.compression.to_s
735
+ if compression
736
+ db[COMPRESSION_KEY] = compression.to_s
684
737
  end
685
- first_block = parser.parse_block
686
- self.ref_seq = first_block.sequences.first.source
687
738
  @ref_only = ref_only
688
- db[REF_SEQ_KEY] = ref_seq
689
- db[FORMAT_VERSION_KEY] = FORMAT_VERSION
690
- @index_sequences = {}
691
- index_blocks([first_block])
739
+ @seen_first = false
740
+ end
741
+
742
+ def build(parser, ref_only=true)
743
+ prep(parser.file_spec,
744
+ parser.compression,
745
+ ref_only)
746
+
692
747
  n = 0
693
- parser.each_block.each_slice(1000).each do |blocks|
694
- index_blocks(blocks)
695
- n += blocks.size
748
+ acc = []
749
+ acc_bytes = 0
750
+ parser.each_block do |block|
751
+ acc << block
752
+ acc_bytes += block.size
753
+ if acc_bytes > CHUNK_THRESHOLD_BYTES \
754
+ || acc.size > CHUNK_THRESHOLD_BLOCKS
755
+ index_blocks(acc)
756
+ acc = []
757
+ acc_bytes = 0
758
+ end
759
+ n += 1
696
760
  end
761
+ index_blocks(acc)
697
762
  LOG.debug { "Created index for #{n} blocks and #{@index_sequences.size} sequences." }
698
763
  db.synchronize(true)
699
764
  end
700
765
 
701
766
  def index_blocks(blocks)
702
- h = blocks.map { |b| entries_for(b) }.reduce(:merge!)
767
+ h = @mutex.synchronize do
768
+ if ! @seen_first
769
+ # set the reference sequence from the first block
770
+ first_block = blocks.first
771
+ self.ref_seq = first_block.sequences.first.source
772
+ db[REF_SEQ_KEY] = ref_seq
773
+ @seen_first = true
774
+ end
775
+ blocks.map { |b| entries_for(b) }.reduce(:merge!)
776
+ end
703
777
  db.set_bulk(h, false)
704
778
  end
705
779
 
@@ -719,8 +793,11 @@ module Bio
719
793
  if ! sid
720
794
  @max_sid += 1
721
795
  sid = @max_sid
722
- db.set("sequence:#{name}", sid.to_s)
723
- index_sequences[name] = sid
796
+ # "" << foo is hideous but apparently what it takes to get a
797
+ # non-shared copy of a string on JRuby...
798
+ name_copy = "" << name
799
+ db.set("sequence:#{name_copy}", sid.to_s)
800
+ index_sequences[name_copy] = sid
724
801
  end
725
802
  return sid
726
803
  end
@@ -739,22 +816,24 @@ module Bio
739
816
  # example: otoGar1.scaffold_104707.1-93001
740
817
  parts = seq.split('.', 2)
741
818
  if parts.size == 2
742
- species_name = parts[0]
743
- if species.has_key? species_name
744
- return species[species_name]
745
- else
746
- species_id = @species_max_id + 1
747
- if species_id >= MAX_SPECIES
748
- raise "cannot index MAF file with more than #{MAX_SPECIES} species"
749
- end
750
- species[species_name] = species_id
751
- db["species:#{species_name}"] = species_id
752
- @species_max_id = species_id
753
- return species_id
754
- end
819
+ # "" << foo is hideous but apparently what it takes to get a
820
+ # non-shared copy of a string on JRuby...
821
+ species_name = "" << parts[0]
755
822
  else
756
823
  # not in species.sequence format, apparently
757
- return nil
824
+ species_name = "" << seq
825
+ end
826
+ if species.has_key? species_name
827
+ return species[species_name]
828
+ else
829
+ species_id = @species_max_id + 1
830
+ if species_id >= MAX_SPECIES
831
+ raise "cannot index MAF file with more than #{MAX_SPECIES} species"
832
+ end
833
+ species[species_name] = species_id
834
+ db["species:#{species_name}"] = species_id
835
+ @species_max_id = species_id
836
+ return species_id
758
837
  end
759
838
  end
760
839
 
@@ -769,20 +848,27 @@ module Bio
769
848
  end
770
849
 
771
850
  def entries_for(block)
772
- unless block.ref_seq.source == @ref_seq
773
- raise "Inconsistent reference sequence: expected #{@ref_seq}, got #{block.ref_seq.source}"
774
- end
775
- h = {}
776
- val = build_block_value(block)
777
- to_index = ref_only ? [block.sequences.first] : block.sequences
778
- to_index.each do |seq|
779
- seq_id = seq_id_for(seq.source)
780
- seq_end = seq.start + seq.size
781
- bin = Bio::Ucsc::UcscBin.bin_from_range(seq.start, seq_end)
782
- key = [255, seq_id, bin, seq.start, seq_end].pack(KEY_FMT)
783
- h[key] = val
851
+ begin
852
+ unless block.ref_seq.source == @ref_seq
853
+ raise "Inconsistent reference sequence: expected #{@ref_seq}, got #{block.ref_seq.source}"
854
+ end
855
+ h = {}
856
+ val = build_block_value(block)
857
+ to_index = ref_only ? [block.sequences.first] : block.sequences
858
+ to_index.each do |seq|
859
+ seq_id = seq_id_for(seq.source)
860
+ # size 0 occurs in e.g. upstream1000.maf.gz
861
+ next if seq.size == 0
862
+ seq_end = seq.start + seq.size
863
+ bin = Bio::Ucsc::UcscBin.bin_from_range(seq.start, seq_end)
864
+ key = [255, seq_id, bin, seq.start, seq_end].pack(KEY_FMT)
865
+ h[key] = val
866
+ end
867
+ return h
868
+ rescue Exception => e
869
+ LOG.error "Failed to index block at offset #{block.offset}:\n#{block}"
870
+ raise e
784
871
  end
785
- return h
786
872
  end
787
873
  end # class KyotoIndex
788
874
 
@@ -861,6 +947,10 @@ module Bio
861
947
  @l = l
862
948
  end
863
949
 
950
+ def empty?
951
+ @l.empty?
952
+ end
953
+
864
954
  def match(entry)
865
955
  return ! @l.find { |f| ! f.call(entry) }
866
956
  end