bio-maf 1.0.0-java → 1.0.1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/maf_bgzip +140 -12
- data/bin/maf_extract +50 -40
- data/bin/maf_index +11 -2
- data/bin/maf_tile +143 -46
- data/bio-maf.gemspec +3 -3
- data/features/bgzf.feature +45 -0
- data/features/maf-indexing.feature +6 -0
- data/features/maf-parsing.feature +17 -0
- data/features/maf-querying.feature +11 -0
- data/features/slice.feature +11 -0
- data/features/step_definitions/parse_steps.rb +1 -0
- data/features/tiling.feature +23 -5
- data/lib/bio-maf.rb +5 -1
- data/lib/bio/maf.rb +1 -0
- data/lib/bio/maf/index.rb +158 -68
- data/lib/bio/maf/jobs.rb +168 -0
- data/lib/bio/maf/maf.rb +24 -1
- data/lib/bio/maf/parser.rb +90 -35
- data/lib/bio/maf/struct.rb +4 -0
- data/lib/bio/maf/tiler.rb +30 -3
- data/lib/bio/ucsc/ucsc_bin.rb +14 -1
- data/man/maf_bgzip.1 +27 -0
- data/man/maf_bgzip.1.ronn +32 -0
- data/spec/bio/maf/index_spec.rb +3 -1
- data/spec/bio/maf/parser_spec.rb +6 -2
- data/spec/bio/ucsc/ucsc_bin_spec.rb +18 -0
- data/test/data/empty.maf +2 -0
- data/test/data/ext-bin.maf +22 -0
- data/test/data/gap-1.kct +0 -0
- data/test/data/mm8_chr7_tiny.kct +0 -0
- data/test/data/mm8_chrM_tiny.kct +0 -0
- metadata +380 -184
    
        data/bio-maf.gemspec
    CHANGED
    
    | @@ -2,11 +2,11 @@ | |
| 2 2 |  | 
| 3 3 | 
             
            Gem::Specification.new do |s|
         | 
| 4 4 | 
             
              s.name = "bio-maf"
         | 
| 5 | 
            -
              s.version = "1.0. | 
| 5 | 
            +
              s.version = "1.0.1"
         | 
| 6 6 |  | 
| 7 7 | 
             
              s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
         | 
| 8 8 | 
             
              s.authors = ["Clayton Wheeler"]
         | 
| 9 | 
            -
              s.date = "2012-08- | 
| 9 | 
            +
              s.date = "2012-08-08"
         | 
| 10 10 | 
             
              s.description = "Multiple Alignment Format parser for BioRuby."
         | 
| 11 11 | 
             
              s.email = "cswh@umich.edu"
         | 
| 12 12 | 
             
              s.extra_rdoc_files = [
         | 
| @@ -32,7 +32,7 @@ Gem::Specification.new do |s| | |
| 32 32 | 
             
              end
         | 
| 33 33 |  | 
| 34 34 | 
             
              s.add_runtime_dependency('bio-alignment', ["~> 0.0.7"])
         | 
| 35 | 
            -
              s.add_runtime_dependency('bio-bgzf', ["~> 0.2. | 
| 35 | 
            +
              s.add_runtime_dependency('bio-bgzf', ["~> 0.2.1"])
         | 
| 36 36 | 
             
              s.add_runtime_dependency('bio-genomic-interval', ["~> 0.1.2"])
         | 
| 37 37 | 
             
              s.add_runtime_dependency('bio-logger', ["~> 1.0.1"])
         | 
| 38 38 | 
             
              if RUBY_PLATFORM == 'java'
         | 
    
        data/features/bgzf.feature
    CHANGED
    
    | @@ -60,3 +60,48 @@ Feature: BGZF compression | |
| 60 60 | 
             
                And a file named "mm8_chr7_tiny.maf.bgz" should exist
         | 
| 61 61 | 
             
                And a file named "mm8.chrM.maf.bgz" should exist
         | 
| 62 62 |  | 
| 63 | 
            +
              @no_jruby
         | 
| 64 | 
            +
              Scenario: Don't overwrite MAF files
         | 
| 65 | 
            +
                Given test files:
         | 
| 66 | 
            +
                | mm8.chrM.maf      |
         | 
| 67 | 
            +
                | mm8.chrM.maf.bgz  |
         | 
| 68 | 
            +
                When I run `maf_bgzip mm8.chrM.maf`
         | 
| 69 | 
            +
                Then it should fail with:
         | 
| 70 | 
            +
                """
         | 
| 71 | 
            +
                exists
         | 
| 72 | 
            +
                """
         | 
| 73 | 
            +
             | 
| 74 | 
            +
              @no_jruby
         | 
| 75 | 
            +
              Scenario: Don't overwrite indexes
         | 
| 76 | 
            +
                Given test files:
         | 
| 77 | 
            +
                | mm8_chr7_tiny.maf |
         | 
| 78 | 
            +
                When I run `maf_bgzip --index mm8_chr7_tiny.maf`
         | 
| 79 | 
            +
                 And I run `rm mm8_chr7_tiny.maf.bgz`
         | 
| 80 | 
            +
                 And I run `maf_bgzip --index mm8_chr7_tiny.maf`
         | 
| 81 | 
            +
                Then it should fail with:
         | 
| 82 | 
            +
                """
         | 
| 83 | 
            +
                exists
         | 
| 84 | 
            +
                """
         | 
| 85 | 
            +
             | 
| 86 | 
            +
              @no_jruby
         | 
| 87 | 
            +
              Scenario: Overwrite MAF files with --force
         | 
| 88 | 
            +
                Given test files:
         | 
| 89 | 
            +
                | mm8.chrM.maf      |
         | 
| 90 | 
            +
                | mm8.chrM.maf.bgz  |
         | 
| 91 | 
            +
                When I run `maf_bgzip --force mm8.chrM.maf`
         | 
| 92 | 
            +
                Then it should pass with:
         | 
| 93 | 
            +
                """
         | 
| 94 | 
            +
                """
         | 
| 95 | 
            +
             | 
| 96 | 
            +
              @no_jruby
         | 
| 97 | 
            +
              Scenario: Overwrite indexes with --force
         | 
| 98 | 
            +
                Given test files:
         | 
| 99 | 
            +
                | mm8_chr7_tiny.maf |
         | 
| 100 | 
            +
                When I run `maf_bgzip --index mm8_chr7_tiny.maf`
         | 
| 101 | 
            +
                 And I run `rm mm8_chr7_tiny.maf.bgz`
         | 
| 102 | 
            +
                 And I run `maf_bgzip --force --index mm8_chr7_tiny.maf`
         | 
| 103 | 
            +
                Then it should pass with:
         | 
| 104 | 
            +
                """
         | 
| 105 | 
            +
                """
         | 
| 106 | 
            +
             | 
| 107 | 
            +
                
         | 
| @@ -39,6 +39,12 @@ Feature: Indexed access to MAF files | |
| 39 39 | 
             
                And sequence mm8.chr7 of block 0 has start 80082368
         | 
| 40 40 | 
             
                And sequence mm8.chr7 of block 1 has start 80082471
         | 
| 41 41 |  | 
| 42 | 
            +
              Scenario: Index MAF file with extended bin positions
         | 
| 43 | 
            +
                Given a MAF source file "ext-bin.maf"
         | 
| 44 | 
            +
                When I open it with a MAF reader
         | 
| 45 | 
            +
                And build an index on all sequences
         | 
| 46 | 
            +
                Then the index has at least 18 entries
         | 
| 47 | 
            +
                
         | 
| 42 48 | 
             
              @no_jruby
         | 
| 43 49 | 
             
              Scenario: Build MAF index with CLI tool
         | 
| 44 50 | 
             
                Given test files:
         | 
| @@ -42,3 +42,20 @@ Feature: Parse MAF files | |
| 42 42 | 
             
                And sequence 0 has text "ACA-TTACT"
         | 
| 43 43 | 
             
                And sequence 1 has strand :-
         | 
| 44 44 |  | 
| 45 | 
            +
              Scenario: Read alignment block, folded to upper case
         | 
| 46 | 
            +
                Given MAF data:
         | 
| 47 | 
            +
                """
         | 
| 48 | 
            +
                ##maf version=1 scoring=humor.v4
         | 
| 49 | 
            +
                # humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet300/chr1.maf
         | 
| 50 | 
            +
                # /cluster/data/hg15/bed/blastz.rn3/axtNet300/chr1.maf
         | 
| 51 | 
            +
             | 
| 52 | 
            +
                a score=0.128
         | 
| 53 | 
            +
                s human_hoxa 100  8 + 100257 aca-ttact
         | 
| 54 | 
            +
                s horse_hoxa 120  9 -  98892 acaattgct
         | 
| 55 | 
            +
                s fugu_hoxa   88  7  + 90788 aca--tgct
         | 
| 56 | 
            +
                """
         | 
| 57 | 
            +
                When I enable the :upcase parser option
         | 
| 58 | 
            +
                And I open it with a MAF reader
         | 
| 59 | 
            +
                Then an alignment block can be obtained
         | 
| 60 | 
            +
                And the alignment block has 3 sequences
         | 
| 61 | 
            +
                And sequence 0 has text "ACA-TTACT"
         | 
| @@ -82,3 +82,14 @@ Feature: Filter results from MAF files | |
| 82 82 | 
             
                And I run `maf_extract -m mm8.chrM.maf.bgz --interval mm8.chrM:6938-13030 -o m2.maf`
         | 
| 83 83 | 
             
                And I run `diff m1.maf m2.maf`
         | 
| 84 84 | 
             
                Then the exit status should be 0
         | 
| 85 | 
            +
             | 
| 86 | 
            +
              @no_jruby
         | 
| 87 | 
            +
              Scenario: One-based indexing with maf_extract
         | 
| 88 | 
            +
                Given test files:
         | 
| 89 | 
            +
                | mm8_chr7_tiny.maf |
         | 
| 90 | 
            +
                | mm8_chr7_tiny.kct |
         | 
| 91 | 
            +
                When I run `sh -c 'maf_extract -d . --one-based --interval mm8.chr7:80082592-80082713 | grep "^a" | wc -l'`
         | 
| 92 | 
            +
                Then it should pass with:
         | 
| 93 | 
            +
                """
         | 
| 94 | 
            +
                2
         | 
| 95 | 
            +
                """
         | 
    
        data/features/slice.feature
    CHANGED
    
    | @@ -18,6 +18,17 @@ Feature: MAF slicing | |
| 18 18 | 
             
                And write all the matched blocks
         | 
| 19 19 | 
             
                Then the output should match, except whitespace, "mm8_chr7_tiny_slice1.maf"
         | 
| 20 20 |  | 
| 21 | 
            +
              Scenario: Interval covering two blocks, using directory access, counting
         | 
| 22 | 
            +
                Given indexed MAF files in "test/data"
         | 
| 23 | 
            +
                When I enable the :remove_gaps parser option
         | 
| 24 | 
            +
                And filter for only the species
         | 
| 25 | 
            +
                  | mm8 |
         | 
| 26 | 
            +
                  | rn4 |
         | 
| 27 | 
            +
                And I extract a slice over the genomic interval
         | 
| 28 | 
            +
                  | chrom    |    start |      end |
         | 
| 29 | 
            +
                  | mm8.chr7 | 80082350 | 80082380 |
         | 
| 30 | 
            +
                Then 2 blocks are obtained
         | 
| 31 | 
            +
             | 
| 21 32 | 
             
              Scenario: Interval covering two blocks, using directory access
         | 
| 22 33 | 
             
                Given indexed MAF files in "test/data"
         | 
| 23 34 | 
             
                When I enable the :remove_gaps parser option
         | 
    
        data/features/tiling.feature
    CHANGED
    
    | @@ -160,7 +160,7 @@ Feature: Join alignment blocks with reference data | |
| 160 160 | 
             
                | gap-sp1.fa.gz |
         | 
| 161 161 | 
             
                | gap-1.maf     |
         | 
| 162 162 | 
             
                | gap-1.kct     |
         | 
| 163 | 
            -
                When I run `maf_tile --reference gap-sp1.fa.gz --interval 0 | 
| 163 | 
            +
                When I run `maf_tile --reference gap-sp1.fa.gz --interval 0-50 -s sp1:mouse -s sp2:nautilus -s sp3:jaguar gap-1.maf gap-1.kct`
         | 
| 164 164 | 
             
                Then it should pass with:
         | 
| 165 165 | 
             
                """
         | 
| 166 166 | 
             
                >mouse
         | 
| @@ -176,7 +176,7 @@ Feature: Join alignment blocks with reference data | |
| 176 176 | 
             
                Given test files:
         | 
| 177 177 | 
             
                | gap-1.maf     |
         | 
| 178 178 | 
             
                | gap-1.kct     |
         | 
| 179 | 
            -
                When I run `maf_tile --interval 0 | 
| 179 | 
            +
                When I run `maf_tile --interval 0-50 -s sp1:mouse -s sp2:nautilus -s sp3:jaguar gap-1.maf gap-1.kct`
         | 
| 180 180 | 
             
                Then it should pass with:
         | 
| 181 181 | 
             
                """
         | 
| 182 182 | 
             
                >mouse
         | 
| @@ -198,7 +198,10 @@ Feature: Join alignment blocks with reference data | |
| 198 198 | 
             
                sp1.chr1 12 36
         | 
| 199 199 | 
             
                """
         | 
| 200 200 | 
             
                When I run `maf_tile -s sp1:mouse -s sp2:nautilus -s sp3:jaguar --output-base selected --bed example.bed --reference gap-sp1.fa.gz gap-1.maf gap-1.kct`
         | 
| 201 | 
            -
                Then  | 
| 201 | 
            +
                Then it should pass with:
         | 
| 202 | 
            +
                """
         | 
| 203 | 
            +
                """
         | 
| 204 | 
            +
                And the file "selected_12-36.fa" should contain exactly:
         | 
| 202 205 | 
             
                """
         | 
| 203 206 | 
             
                >mouse
         | 
| 204 207 | 
             
                GCTGAGGGC--AGTTGTGTCAGGGCG
         | 
| @@ -214,7 +217,7 @@ Feature: Join alignment blocks with reference data | |
| 214 217 | 
             
                Given test files:
         | 
| 215 218 | 
             
                | mm8_chr7_tiny.maf |
         | 
| 216 219 | 
             
                | mm8_chr7_tiny.kct |
         | 
| 217 | 
            -
                When I run `maf_tile -s mm8 -s rn4 -s hg18 --interval 80082334 | 
| 220 | 
            +
                When I run `maf_tile -s mm8 -s rn4 -s hg18 --interval 80082334-80082344 mm8_chr7_tiny.maf`
         | 
| 218 221 | 
             
                Then it should pass with:
         | 
| 219 222 | 
             
                """
         | 
| 220 223 | 
             
                >mm8
         | 
| @@ -230,7 +233,7 @@ Feature: Join alignment blocks with reference data | |
| 230 233 | 
             
                Given test files:
         | 
| 231 234 | 
             
                | mm8_chr7_tiny.maf |
         | 
| 232 235 | 
             
                | mm8_chr7_tiny.kct |
         | 
| 233 | 
            -
                When I run `maf_tile -s mm8 -s rn4 -s hg18 --interval mm8.chr7:80082334 | 
| 236 | 
            +
                When I run `maf_tile -s mm8 -s rn4 -s hg18 --interval mm8.chr7:80082334-80082344 .`
         | 
| 234 237 | 
             
                Then it should pass with:
         | 
| 235 238 | 
             
                """
         | 
| 236 239 | 
             
                >mm8
         | 
| @@ -241,3 +244,18 @@ Feature: Join alignment blocks with reference data | |
| 241 244 | 
             
                --------GG
         | 
| 242 245 | 
             
                """
         | 
| 243 246 |  | 
| 247 | 
            +
              @no_jruby
         | 
| 248 | 
            +
              Scenario: Tile with CLI tool and directory, 1-based
         | 
| 249 | 
            +
                Given test files:
         | 
| 250 | 
            +
                | mm8_chr7_tiny.maf |
         | 
| 251 | 
            +
                | mm8_chr7_tiny.kct |
         | 
| 252 | 
            +
                When I run `maf_tile -s mm8 -s rn4 -s hg18 --one-based --interval mm8.chr7:80082335-80082344 .`
         | 
| 253 | 
            +
                Then it should pass with:
         | 
| 254 | 
            +
                """
         | 
| 255 | 
            +
                >mm8
         | 
| 256 | 
            +
                GGGCTGAGGG
         | 
| 257 | 
            +
                >rn4
         | 
| 258 | 
            +
                GGGCTGAGGG
         | 
| 259 | 
            +
                >hg18
         | 
| 260 | 
            +
                --------GG
         | 
| 261 | 
            +
                """
         | 
    
        data/lib/bio-maf.rb
    CHANGED
    
    | @@ -11,7 +11,11 @@ | |
| 11 11 | 
             
            require 'bio-logger'
         | 
| 12 12 | 
             
            log = Bio::Log::LoggerPlus.new('bio-maf')
         | 
| 13 13 | 
             
            log.outputters = Bio::Log::Outputter.stderr
         | 
| 14 | 
            -
            log.level =  | 
| 14 | 
            +
            log.level = if ENV['BIO_MAF_DEBUG']
         | 
| 15 | 
            +
                          Bio::Log::DEBUG
         | 
| 16 | 
            +
                        else
         | 
| 17 | 
            +
                          Bio::Log::WARN
         | 
| 18 | 
            +
                        end
         | 
| 15 19 |  | 
| 16 20 | 
             
            require 'bio/ucsc'
         | 
| 17 21 | 
             
            require 'bio/maf'
         | 
    
        data/lib/bio/maf.rb
    CHANGED
    
    
    
        data/lib/bio/maf/index.rb
    CHANGED
    
    | @@ -125,7 +125,7 @@ module Bio | |
| 125 125 | 
             
                  # @param [Enumerable<Bio::GenomicInterval>] intervals genomic
         | 
| 126 126 | 
             
                  #  intervals to parse.
         | 
| 127 127 | 
             
                  # @yield [block] each {Block} matched, in turn
         | 
| 128 | 
            -
                  # @return [ | 
| 128 | 
            +
                  # @return [Array<Block>] each matching {Block}, if no block given
         | 
| 129 129 | 
             
                  # @api public
         | 
| 130 130 | 
             
                  # @see KyotoIndex#find
         | 
| 131 131 | 
             
                  def find(intervals, &blk)
         | 
| @@ -137,13 +137,16 @@ module Bio | |
| 137 137 | 
             
                        end
         | 
| 138 138 | 
             
                      end
         | 
| 139 139 | 
             
                      by_chrom.each do |chrom, c_intervals|
         | 
| 140 | 
            -
                         | 
| 141 | 
            -
             | 
| 142 | 
            -
             | 
| 140 | 
            +
                        with_index(chrom) do |index|
         | 
| 141 | 
            +
                          with_parser(chrom) do |parser|
         | 
| 142 | 
            +
                            index.find(c_intervals, parser, block_filter, &blk)
         | 
| 143 | 
            +
                          end
         | 
| 143 144 | 
             
                        end
         | 
| 144 145 | 
             
                      end
         | 
| 145 146 | 
             
                    else
         | 
| 146 | 
            -
                       | 
| 147 | 
            +
                      acc = []
         | 
| 148 | 
            +
                      self.find(intervals) { |block| acc << block }
         | 
| 149 | 
            +
                      acc
         | 
| 147 150 | 
             
                    end
         | 
| 148 151 | 
             
                  end
         | 
| 149 152 |  | 
| @@ -156,13 +159,14 @@ module Bio | |
| 156 159 | 
             
                  # @yield [tiler] a {Tiler} ready to operate on the given interval
         | 
| 157 160 | 
             
                  # @api public
         | 
| 158 161 | 
             
                  def tile(interval)
         | 
| 159 | 
            -
                     | 
| 160 | 
            -
             | 
| 161 | 
            -
             | 
| 162 | 
            -
             | 
| 163 | 
            -
             | 
| 164 | 
            -
             | 
| 165 | 
            -
             | 
| 162 | 
            +
                    with_index(interval.chrom) do |index|
         | 
| 163 | 
            +
                      with_parser(interval.chrom) do |parser|
         | 
| 164 | 
            +
                        tiler = Tiler.new
         | 
| 165 | 
            +
                        tiler.index = index
         | 
| 166 | 
            +
                        tiler.parser = parser
         | 
| 167 | 
            +
                        tiler.interval = interval
         | 
| 168 | 
            +
                        yield tiler
         | 
| 169 | 
            +
                      end
         | 
| 166 170 | 
             
                    end
         | 
| 167 171 | 
             
                  end
         | 
| 168 172 |  | 
| @@ -172,13 +176,15 @@ module Bio | |
| 172 176 | 
             
                  #
         | 
| 173 177 | 
             
                  # @param [Bio::GenomicInterval] interval interval to search
         | 
| 174 178 | 
             
                  # @yield [block] each {Block} matched, in turn
         | 
| 175 | 
            -
                  # @return [ | 
| 179 | 
            +
                  # @return [Array<Block>] each matching {Block}, if no block given
         | 
| 176 180 | 
             
                  # @api public
         | 
| 177 181 | 
             
                  # @see KyotoIndex#slice
         | 
| 178 182 | 
             
                  def slice(interval, &blk)
         | 
| 179 | 
            -
                     | 
| 180 | 
            -
             | 
| 181 | 
            -
             | 
| 183 | 
            +
                    with_index(interval.chrom) do |index|
         | 
| 184 | 
            +
                      with_parser(interval.chrom) do |parser|
         | 
| 185 | 
            +
                        s = index.slice(interval, parser, block_filter, &blk)
         | 
| 186 | 
            +
                        block_given? ? s : s.to_a
         | 
| 187 | 
            +
                      end
         | 
| 182 188 | 
             
                    end
         | 
| 183 189 | 
             
                  end
         | 
| 184 190 |  | 
| @@ -193,12 +199,17 @@ module Bio | |
| 193 199 | 
             
                      scan_dir(options[:dir])
         | 
| 194 200 | 
             
                    elsif options[:maf]
         | 
| 195 201 | 
             
                      if options[:index]
         | 
| 196 | 
            -
                         | 
| 202 | 
            +
                        LOG.debug { "Opening index file #{options[:index]}" }
         | 
| 203 | 
            +
                        index = KyotoIndex.open(options[:index])
         | 
| 204 | 
            +
                        register_index(index,
         | 
| 197 205 | 
             
                                       options[:maf])
         | 
| 206 | 
            +
                        index.close
         | 
| 198 207 | 
             
                      else
         | 
| 199 | 
            -
                         | 
| 200 | 
            -
                        if  | 
| 201 | 
            -
                           | 
| 208 | 
            +
                        idx_f = find_index_file(options[:maf])
         | 
| 209 | 
            +
                        if idx_f
         | 
| 210 | 
            +
                          index = KyotoIndex.open(idx_f)
         | 
| 211 | 
            +
                          register_index(index, options[:maf])
         | 
| 212 | 
            +
                          index.close
         | 
| 202 213 | 
             
                        end
         | 
| 203 214 | 
             
                      end
         | 
| 204 215 | 
             
                    else
         | 
| @@ -229,7 +240,11 @@ module Bio | |
| 229 240 | 
             
                    unless index.maf_file == File.basename(maf)
         | 
| 230 241 | 
             
                      raise "Index #{index.path} was created for #{index.maf_file}, not #{File.basename(maf)}!"
         | 
| 231 242 | 
             
                    end
         | 
| 232 | 
            -
                     | 
| 243 | 
            +
                    if index.path.to_s.start_with? '%'
         | 
| 244 | 
            +
                      @indices[index.ref_seq] = index
         | 
| 245 | 
            +
                    else
         | 
| 246 | 
            +
                      @indices[index.ref_seq] = index.path.to_s
         | 
| 247 | 
            +
                    end
         | 
| 233 248 | 
             
                    @maf_by_chrom[index.ref_seq] = maf
         | 
| 234 249 | 
             
                  end
         | 
| 235 250 |  | 
| @@ -241,6 +256,7 @@ module Bio | |
| 241 256 | 
             
                      if File.exist? maf
         | 
| 242 257 | 
             
                        register_index(index, maf)
         | 
| 243 258 | 
             
                      end
         | 
| 259 | 
            +
                      index.close
         | 
| 244 260 | 
             
                    end
         | 
| 245 261 | 
             
                  end
         | 
| 246 262 |  | 
| @@ -249,7 +265,23 @@ module Bio | |
| 249 265 | 
             
                    unless @indices.has_key? chrom
         | 
| 250 266 | 
             
                      raise "No index available for chromosome #{chrom}!"
         | 
| 251 267 | 
             
                    end
         | 
| 252 | 
            -
                    @indices[chrom]
         | 
| 268 | 
            +
                    index = @indices[chrom]
         | 
| 269 | 
            +
                    if index.is_a? KyotoIndex
         | 
| 270 | 
            +
                      # temporary
         | 
| 271 | 
            +
                      index
         | 
| 272 | 
            +
                    else
         | 
| 273 | 
            +
                      KyotoIndex.open(index)
         | 
| 274 | 
            +
                    end
         | 
| 275 | 
            +
                  end
         | 
| 276 | 
            +
             | 
| 277 | 
            +
                  def with_index(chrom)
         | 
| 278 | 
            +
                    index = chrom_index(chrom)
         | 
| 279 | 
            +
                    LOG.debug { "Selected index #{index} for sequence #{chrom}." }
         | 
| 280 | 
            +
                    begin
         | 
| 281 | 
            +
                      yield index
         | 
| 282 | 
            +
                    ensure
         | 
| 283 | 
            +
                      index.close unless index.path.to_s.start_with? '%'
         | 
| 284 | 
            +
                    end
         | 
| 253 285 | 
             
                  end
         | 
| 254 286 |  | 
| 255 287 | 
             
                  # @api private
         | 
| @@ -403,7 +435,7 @@ module Bio | |
| 403 435 | 
             
                  def find(intervals, parser, filter={}, &blk)
         | 
| 404 436 | 
             
                    start = Time.now
         | 
| 405 437 | 
             
                    fl = fetch_list(intervals, filter)
         | 
| 406 | 
            -
                    LOG.debug { sprintf("Built fetch list of %d items in %.3fs | 
| 438 | 
            +
                    LOG.debug { sprintf("Built fetch list of %d items in %.3fs.",
         | 
| 407 439 | 
             
                                        fl.size,
         | 
| 408 440 | 
             
                                        Time.now - start) }
         | 
| 409 441 | 
             
                    if ! fl.empty?
         | 
| @@ -426,6 +458,7 @@ module Bio | |
| 426 458 | 
             
                        yield block.slice(interval)
         | 
| 427 459 | 
             
                      end
         | 
| 428 460 | 
             
                    else
         | 
| 461 | 
            +
                      LOG.debug { "accumulating results of #slice" }
         | 
| 429 462 | 
             
                      enum_for(:slice, interval, parser, filter)
         | 
| 430 463 | 
             
                    end
         | 
| 431 464 | 
             
                  end
         | 
| @@ -436,6 +469,7 @@ module Bio | |
| 436 469 | 
             
                  def initialize(path, db_arg=nil)
         | 
| 437 470 | 
             
                    @species = {}
         | 
| 438 471 | 
             
                    @species_max_id = -1
         | 
| 472 | 
            +
                    @index_sequences = {}
         | 
| 439 473 | 
             
                    @max_sid = -1
         | 
| 440 474 | 
             
                    if db_arg || ((path.size > 1) and File.exist?(path))
         | 
| 441 475 | 
             
                      mode = KyotoCabinet::DB::OREADER
         | 
| @@ -444,15 +478,25 @@ module Bio | |
| 444 478 | 
             
                    end
         | 
| 445 479 | 
             
                    @db = db_arg || KyotoCabinet::DB.new
         | 
| 446 480 | 
             
                    @path = path
         | 
| 447 | 
            -
                     | 
| 481 | 
            +
                    path_str = "#{path.to_s}#opts=ls#dfunit=100000"
         | 
| 482 | 
            +
                    unless db_arg || db.open(path_str, mode)
         | 
| 448 483 | 
             
                      raise "Could not open DB file!"
         | 
| 449 484 | 
             
                    end
         | 
| 450 485 | 
             
                    if mode == KyotoCabinet::DB::OREADER
         | 
| 486 | 
            +
                      version = db[FORMAT_VERSION_KEY].to_i
         | 
| 487 | 
            +
                      if version != FORMAT_VERSION
         | 
| 488 | 
            +
                        raise "Index #{path} is version #{version}, expecting version #{FORMAT_VERSION}!"
         | 
| 489 | 
            +
                      end
         | 
| 451 490 | 
             
                      @maf_file = db[FILE_KEY]
         | 
| 452 491 | 
             
                      self.ref_seq = db[REF_SEQ_KEY]
         | 
| 453 492 | 
             
                      load_index_sequences
         | 
| 454 493 | 
             
                      load_species
         | 
| 455 494 | 
             
                    end
         | 
| 495 | 
            +
                    @mutex = Mutex.new
         | 
| 496 | 
            +
                  end
         | 
| 497 | 
            +
             | 
| 498 | 
            +
                  def to_s
         | 
| 499 | 
            +
                    "#<KyotoIndex path=#{path}>"
         | 
| 456 500 | 
             
                  end
         | 
| 457 501 |  | 
| 458 502 | 
             
                  # Reopen the same DB handle read-only. Only useful for unit tests.
         | 
| @@ -576,6 +620,11 @@ module Bio | |
| 576 620 | 
             
                  end
         | 
| 577 621 |  | 
| 578 622 | 
             
                  def scan_bins_parallel(chrom_id, bin_intervals, filters)
         | 
| 623 | 
            +
                    LOG.debug {
         | 
| 624 | 
            +
                      sprintf("Beginning scan of %d bin intervals %s filters.",
         | 
| 625 | 
            +
                              bin_intervals.size,
         | 
| 626 | 
            +
                              filters.empty? ? "without" : "with")
         | 
| 627 | 
            +
                    }
         | 
| 579 628 | 
             
                    start = Time.now
         | 
| 580 629 | 
             
                    n_threads = ENV['profile'] ? 1 : java.lang.Runtime.runtime.availableProcessors
         | 
| 581 630 | 
             
                    jobs = java.util.concurrent.ConcurrentLinkedQueue.new(bin_intervals.to_a)
         | 
| @@ -603,7 +652,7 @@ module Bio | |
| 603 652 | 
             
                      n_completed += 1
         | 
| 604 653 | 
             
                    end
         | 
| 605 654 | 
             
                    threads.each { |t| t.join }
         | 
| 606 | 
            -
                    LOG.debug { sprintf("Matched %d index records with %d threads in %.3f seconds | 
| 655 | 
            +
                    LOG.debug { sprintf("Matched %d index records with %d threads in %.3f seconds.",
         | 
| 607 656 | 
             
                                        to_fetch.size, n_threads, Time.now - start) }
         | 
| 608 657 | 
             
                    to_fetch
         | 
| 609 658 | 
             
                  end
         | 
| @@ -676,30 +725,55 @@ module Bio | |
| 676 725 | 
             
                     || gi.include?(i_start)
         | 
| 677 726 | 
             
                  end
         | 
| 678 727 |  | 
| 679 | 
            -
                   | 
| 680 | 
            -
             | 
| 728 | 
            +
                  CHUNK_THRESHOLD_BYTES = 50 * 1024 * 1024
         | 
| 729 | 
            +
                  CHUNK_THRESHOLD_BLOCKS = 1000
         | 
| 730 | 
            +
             | 
| 731 | 
            +
                  def prep(file_spec, compression, ref_only)
         | 
| 732 | 
            +
                    db[FORMAT_VERSION_KEY] = FORMAT_VERSION
         | 
| 733 | 
            +
                    db[FILE_KEY] = File.basename(file_spec)
         | 
| 681 734 | 
             
                    @maf_file = db[FILE_KEY]
         | 
| 682 | 
            -
                    if  | 
| 683 | 
            -
                      db[COMPRESSION_KEY] =  | 
| 735 | 
            +
                    if compression
         | 
| 736 | 
            +
                      db[COMPRESSION_KEY] = compression.to_s
         | 
| 684 737 | 
             
                    end
         | 
| 685 | 
            -
                    first_block = parser.parse_block
         | 
| 686 | 
            -
                    self.ref_seq = first_block.sequences.first.source
         | 
| 687 738 | 
             
                    @ref_only = ref_only
         | 
| 688 | 
            -
                     | 
| 689 | 
            -
             | 
| 690 | 
            -
                     | 
| 691 | 
            -
             | 
| 739 | 
            +
                    @seen_first = false
         | 
| 740 | 
            +
                  end
         | 
| 741 | 
            +
                    
         | 
| 742 | 
            +
                  def build(parser, ref_only=true)
         | 
| 743 | 
            +
                    prep(parser.file_spec,
         | 
| 744 | 
            +
                         parser.compression,
         | 
| 745 | 
            +
                         ref_only)
         | 
| 746 | 
            +
             | 
| 692 747 | 
             
                    n = 0
         | 
| 693 | 
            -
                     | 
| 694 | 
            -
             | 
| 695 | 
            -
             | 
| 748 | 
            +
                    acc = []
         | 
| 749 | 
            +
                    acc_bytes = 0
         | 
| 750 | 
            +
                    parser.each_block do |block|
         | 
| 751 | 
            +
                      acc << block
         | 
| 752 | 
            +
                      acc_bytes += block.size
         | 
| 753 | 
            +
                      if acc_bytes > CHUNK_THRESHOLD_BYTES \
         | 
| 754 | 
            +
                        || acc.size > CHUNK_THRESHOLD_BLOCKS
         | 
| 755 | 
            +
                        index_blocks(acc)
         | 
| 756 | 
            +
                        acc = []
         | 
| 757 | 
            +
                        acc_bytes = 0
         | 
| 758 | 
            +
                      end
         | 
| 759 | 
            +
                      n += 1
         | 
| 696 760 | 
             
                    end
         | 
| 761 | 
            +
                    index_blocks(acc)
         | 
| 697 762 | 
             
                    LOG.debug { "Created index for #{n} blocks and #{@index_sequences.size} sequences." }
         | 
| 698 763 | 
             
                    db.synchronize(true)
         | 
| 699 764 | 
             
                  end
         | 
| 700 765 |  | 
| 701 766 | 
             
                  def index_blocks(blocks)
         | 
| 702 | 
            -
                    h =  | 
| 767 | 
            +
                    h = @mutex.synchronize do
         | 
| 768 | 
            +
                      if ! @seen_first
         | 
| 769 | 
            +
                        # set the reference sequence from the first block
         | 
| 770 | 
            +
                        first_block = blocks.first
         | 
| 771 | 
            +
                        self.ref_seq = first_block.sequences.first.source
         | 
| 772 | 
            +
                        db[REF_SEQ_KEY] = ref_seq
         | 
| 773 | 
            +
                        @seen_first = true
         | 
| 774 | 
            +
                      end
         | 
| 775 | 
            +
                      blocks.map { |b| entries_for(b) }.reduce(:merge!)
         | 
| 776 | 
            +
                    end
         | 
| 703 777 | 
             
                    db.set_bulk(h, false)
         | 
| 704 778 | 
             
                  end
         | 
| 705 779 |  | 
| @@ -719,8 +793,11 @@ module Bio | |
| 719 793 | 
             
                    if ! sid
         | 
| 720 794 | 
             
                      @max_sid += 1
         | 
| 721 795 | 
             
                      sid = @max_sid
         | 
| 722 | 
            -
                       | 
| 723 | 
            -
                       | 
| 796 | 
            +
                      # "" << foo is hideous but apparently what it takes to get a
         | 
| 797 | 
            +
                      # non-shared copy of a string on JRuby...
         | 
| 798 | 
            +
                      name_copy = "" << name
         | 
| 799 | 
            +
                      db.set("sequence:#{name_copy}", sid.to_s)
         | 
| 800 | 
            +
                      index_sequences[name_copy] = sid
         | 
| 724 801 | 
             
                    end
         | 
| 725 802 | 
             
                    return sid
         | 
| 726 803 | 
             
                  end
         | 
| @@ -739,22 +816,24 @@ module Bio | |
| 739 816 | 
             
                    # example: otoGar1.scaffold_104707.1-93001
         | 
| 740 817 | 
             
                    parts = seq.split('.', 2)
         | 
| 741 818 | 
             
                    if parts.size == 2
         | 
| 742 | 
            -
                       | 
| 743 | 
            -
                       | 
| 744 | 
            -
             | 
| 745 | 
            -
                      else
         | 
| 746 | 
            -
                        species_id = @species_max_id + 1
         | 
| 747 | 
            -
                        if species_id >= MAX_SPECIES
         | 
| 748 | 
            -
                          raise "cannot index MAF file with more than #{MAX_SPECIES} species"
         | 
| 749 | 
            -
                        end
         | 
| 750 | 
            -
                        species[species_name] = species_id
         | 
| 751 | 
            -
                        db["species:#{species_name}"] = species_id
         | 
| 752 | 
            -
                        @species_max_id = species_id
         | 
| 753 | 
            -
                        return species_id
         | 
| 754 | 
            -
                      end
         | 
| 819 | 
            +
                      # "" << foo is hideous but apparently what it takes to get a
         | 
| 820 | 
            +
                      # non-shared copy of a string on JRuby...
         | 
| 821 | 
            +
                      species_name = "" << parts[0]
         | 
| 755 822 | 
             
                    else
         | 
| 756 823 | 
             
                      # not in species.sequence format, apparently
         | 
| 757 | 
            -
                       | 
| 824 | 
            +
                      species_name = "" << seq
         | 
| 825 | 
            +
                    end
         | 
| 826 | 
            +
                    if species.has_key? species_name
         | 
| 827 | 
            +
                      return species[species_name]
         | 
| 828 | 
            +
                    else
         | 
| 829 | 
            +
                      species_id = @species_max_id + 1
         | 
| 830 | 
            +
                      if species_id >= MAX_SPECIES
         | 
| 831 | 
            +
                        raise "cannot index MAF file with more than #{MAX_SPECIES} species"
         | 
| 832 | 
            +
                      end
         | 
| 833 | 
            +
                      species[species_name] = species_id
         | 
| 834 | 
            +
                      db["species:#{species_name}"] = species_id
         | 
| 835 | 
            +
                      @species_max_id = species_id
         | 
| 836 | 
            +
                      return species_id
         | 
| 758 837 | 
             
                    end
         | 
| 759 838 | 
             
                  end
         | 
| 760 839 |  | 
| @@ -769,20 +848,27 @@ module Bio | |
| 769 848 | 
             
                  end
         | 
| 770 849 |  | 
| 771 850 | 
             
                  def entries_for(block)
         | 
| 772 | 
            -
                     | 
| 773 | 
            -
                       | 
| 774 | 
            -
             | 
| 775 | 
            -
             | 
| 776 | 
            -
             | 
| 777 | 
            -
             | 
| 778 | 
            -
             | 
| 779 | 
            -
                       | 
| 780 | 
            -
             | 
| 781 | 
            -
             | 
| 782 | 
            -
             | 
| 783 | 
            -
             | 
| 851 | 
            +
                    begin
         | 
| 852 | 
            +
                      unless block.ref_seq.source == @ref_seq
         | 
| 853 | 
            +
                        raise "Inconsistent reference sequence: expected #{@ref_seq}, got #{block.ref_seq.source}"
         | 
| 854 | 
            +
                      end
         | 
| 855 | 
            +
                      h = {}
         | 
| 856 | 
            +
                      val = build_block_value(block)
         | 
| 857 | 
            +
                      to_index = ref_only ? [block.sequences.first] : block.sequences
         | 
| 858 | 
            +
                      to_index.each do |seq|
         | 
| 859 | 
            +
                        seq_id = seq_id_for(seq.source)
         | 
| 860 | 
            +
                        # size 0 occurs in e.g. upstream1000.maf.gz
         | 
| 861 | 
            +
                        next if seq.size == 0
         | 
| 862 | 
            +
                        seq_end = seq.start + seq.size
         | 
| 863 | 
            +
                        bin = Bio::Ucsc::UcscBin.bin_from_range(seq.start, seq_end)
         | 
| 864 | 
            +
                        key = [255, seq_id, bin, seq.start, seq_end].pack(KEY_FMT)
         | 
| 865 | 
            +
                        h[key] = val
         | 
| 866 | 
            +
                      end
         | 
| 867 | 
            +
                      return h
         | 
| 868 | 
            +
                    rescue Exception => e
         | 
| 869 | 
            +
                      LOG.error "Failed to index block at offset #{block.offset}:\n#{block}"
         | 
| 870 | 
            +
                      raise e
         | 
| 784 871 | 
             
                    end
         | 
| 785 | 
            -
                    return h
         | 
| 786 872 | 
             
                  end
         | 
| 787 873 | 
             
                end # class KyotoIndex
         | 
| 788 874 |  | 
| @@ -861,6 +947,10 @@ module Bio | |
| 861 947 | 
             
                    @l = l
         | 
| 862 948 | 
             
                  end
         | 
| 863 949 |  | 
| 950 | 
            +
                  def empty?
         | 
| 951 | 
            +
                    @l.empty?
         | 
| 952 | 
            +
                  end
         | 
| 953 | 
            +
             | 
| 864 954 | 
             
                  def match(entry)
         | 
| 865 955 | 
             
                    return ! @l.find { |f| ! f.call(entry) }
         | 
| 866 956 | 
             
                  end
         |