bio-maf 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
 - data/.simplecov +1 -0
 - data/.travis.yml +16 -0
 - data/.yardopts +3 -0
 - data/DEVELOPMENT.md +40 -0
 - data/Gemfile +23 -0
 - data/LICENSE.txt +20 -0
 - data/README.md +209 -0
 - data/Rakefile +76 -0
 - data/VERSION +1 -0
 - data/benchmarks/dispatch_bench +53 -0
 - data/benchmarks/iter_bench +44 -0
 - data/benchmarks/read_bench +40 -0
 - data/benchmarks/sort_bench +33 -0
 - data/benchmarks/split_bench +33 -0
 - data/bin/maf_count +82 -0
 - data/bin/maf_dump_blocks +27 -0
 - data/bin/maf_extract_ranges_count +44 -0
 - data/bin/maf_index +88 -0
 - data/bin/maf_parse_bench +94 -0
 - data/bin/maf_to_fasta +68 -0
 - data/bin/maf_write +84 -0
 - data/bin/random_ranges +35 -0
 - data/features/maf-indexing.feature +31 -0
 - data/features/maf-output.feature +29 -0
 - data/features/maf-parsing.feature +44 -0
 - data/features/maf-querying.feature +75 -0
 - data/features/maf-to-fasta.feature +50 -0
 - data/features/step_definitions/convert_steps.rb +45 -0
 - data/features/step_definitions/index_steps.rb +20 -0
 - data/features/step_definitions/output_steps.rb +27 -0
 - data/features/step_definitions/parse_steps.rb +63 -0
 - data/features/step_definitions/query_steps.rb +31 -0
 - data/features/step_definitions/ucsc_bin_steps.rb +14 -0
 - data/features/support/env.rb +16 -0
 - data/features/ucsc-bins.feature +24 -0
 - data/lib/bio/maf/index.rb +620 -0
 - data/lib/bio/maf/parser.rb +888 -0
 - data/lib/bio/maf/struct.rb +63 -0
 - data/lib/bio/maf/writer.rb +63 -0
 - data/lib/bio/maf.rb +4 -0
 - data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
 - data/lib/bio/ucsc/ucsc_bin.rb +117 -0
 - data/lib/bio/ucsc.rb +2 -0
 - data/lib/bio-maf/maf.rb +3 -0
 - data/lib/bio-maf.rb +12 -0
 - data/man/.gitignore +1 -0
 - data/man/maf_index.1 +105 -0
 - data/man/maf_index.1.markdown +97 -0
 - data/man/maf_index.1.ronn +83 -0
 - data/man/maf_to_fasta.1 +53 -0
 - data/man/maf_to_fasta.1.ronn +51 -0
 - data/spec/bio/maf/index_spec.rb +363 -0
 - data/spec/bio/maf/parser_spec.rb +354 -0
 - data/spec/bio/maf/struct_spec.rb +75 -0
 - data/spec/spec_helper.rb +14 -0
 - data/test/data/big-block.maf +15999 -0
 - data/test/data/chr22_ieq.maf +11 -0
 - data/test/data/chrY-1block.maf +6 -0
 - data/test/data/empty +0 -0
 - data/test/data/empty.db +0 -0
 - data/test/data/mm8_chr7_tiny.kct +0 -0
 - data/test/data/mm8_chr7_tiny.maf +76 -0
 - data/test/data/mm8_mod_a.maf +7 -0
 - data/test/data/mm8_single.maf +13 -0
 - data/test/data/mm8_subset_a.maf +23 -0
 - data/test/data/t1-bad1.maf +15 -0
 - data/test/data/t1.fasta +12 -0
 - data/test/data/t1.maf +15 -0
 - data/test/data/t1a.maf +17 -0
 - data/test/helper.rb +18 -0
 - data/test/test_bio-maf.rb +7 -0
 - data/travis-ci/install_kc +13 -0
 - data/travis-ci/install_kc_java +13 -0
 - data/travis-ci/report_errors +4 -0
 - metadata +181 -0
 
| 
         @@ -0,0 +1,620 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require 'kyotocabinet'
         
     | 
| 
      
 2 
     | 
    
         
            +
            require 'jruby/profiler' if RUBY_PLATFORM == 'java'
         
     | 
| 
      
 3 
     | 
    
         
            +
             
     | 
| 
      
 4 
     | 
    
         
            +
            #require 'bio-ucsc-api'
         
     | 
| 
      
 5 
     | 
    
         
            +
            require 'bio-genomic-interval'
         
     | 
| 
      
 6 
     | 
    
         
            +
             
     | 
| 
      
 7 
     | 
    
         
            +
            module Bio
         
     | 
| 
      
 8 
     | 
    
         
            +
             
     | 
| 
      
 9 
     | 
    
         
            +
              module MAF
         
     | 
| 
      
 10 
     | 
    
         
            +
             
     | 
| 
      
 11 
     | 
    
         
            +
                # Binary record packing and unpacking.
         
     | 
| 
      
 12 
     | 
    
         
            +
                # @api private
         
     | 
| 
      
 13 
     | 
    
         
            +
                module KVHelpers

                  # Field layout of an index key. The index format notes in
                  # KyotoIndex describe it as 12 bytes: marker, sequence ID,
                  # UCSC bin, sequence start and sequence end.
                  KEY = Struct.new([
                    [:marker,    :uint8],
                    [:seq_id,    :uint8],
                    [:bin,       :uint16],
                    [:seq_start, :uint32],
                    [:seq_end,   :uint32]
                  ])

                  # Field layout of an index value. The index format notes in
                  # KyotoIndex describe it as 25 bytes: file offset, block
                  # length, text size, sequence count and species bit vector.
                  VAL = Struct.new([
                    [:offset,      :uint64],
                    [:length,      :uint32],
                    [:text_size,   :uint32],
                    [:n_seq,       :uint8],
                    [:species_vec, :uint64]
                  ])

                  # Pack/unpack format strings derived from the layouts above.
                  # NOTE(review): #fmt and #extractor_fmt are defined outside
                  # this file; presumably #extractor_fmt yields a format that
                  # covers only the named fields -- confirm against struct.rb.
                  KEY_FMT = KEY.fmt
                  KEY_SCAN_FMT = KEY.extractor_fmt(:seq_id, :bin, :seq_start, :seq_end)
                  CHROM_BIN_PREFIX_FMT = KEY.extractor_fmt(:marker, :seq_id, :bin)

                  VAL_FMT = VAL.fmt
                  VAL_IDX_OFFSET_FMT = VAL.extractor_fmt(:offset, :length)
                  VAL_TEXT_SIZE_FMT = VAL.extractor_fmt(:text_size)
                  VAL_N_SEQ_FMT = VAL.extractor_fmt(:n_seq)
                  VAL_SPECIES_FMT = VAL.extractor_fmt(:species_vec)

                  module_function

                  # First item unpacked from the value half of `entry`
                  # (`entry[1]`) using VAL_SPECIES_FMT.
                  def extract_species_vec(entry)
                    packed_value = entry[1]
                    packed_value.unpack(VAL_SPECIES_FMT).first
                  end

                  # First item unpacked from the value half of `entry`
                  # (`entry[1]`) using VAL_N_SEQ_FMT.
                  def extract_n_sequences(entry)
                    packed_value = entry[1]
                    packed_value.unpack(VAL_N_SEQ_FMT).first
                  end

                  # Everything unpacked from the value half of `entry`
                  # (`entry[1]`) using VAL_IDX_OFFSET_FMT, returned as an
                  # Array.
                  def extract_index_offset(entry)
                    packed_value = entry[1]
                    packed_value.unpack(VAL_IDX_OFFSET_FMT)
                  end

                  # First item unpacked from the value half of `entry`
                  # (`entry[1]`) using VAL_TEXT_SIZE_FMT.
                  def extract_text_size(entry)
                    packed_value = entry[1]
                    packed_value.unpack(VAL_TEXT_SIZE_FMT).first
                  end

                  # Unpack a complete packed key string into an Array of its
                  # fields, in KEY order.
                  def unpack_key(ks)
                    ks.unpack(KEY_FMT)
                  end

                  # Pack the 0xFF index-entry marker followed by `chrom_id`
                  # and `bin`, giving the leading bytes of a key for that
                  # sequence and bin.
                  def bin_start_prefix(chrom_id, bin)
                    prefix_fields = [0xFF, chrom_id, bin]
                    prefix_fields.pack(CHROM_BIN_PREFIX_FMT)
                  end
                end
         
     | 
| 
      
 63 
     | 
    
         
            +
             
     | 
| 
      
 64 
     | 
    
         
            +
                class KyotoIndex
         
     | 
| 
      
 65 
     | 
    
         
            +
                  include KVHelpers
         
     | 
| 
      
 66 
     | 
    
         
            +
             
     | 
| 
      
 67 
     | 
    
         
            +
                  attr_reader :db, :species, :species_max_id
         
     | 
| 
      
 68 
     | 
    
         
            +
                  attr_accessor :index_sequences
         
     | 
| 
      
 69 
     | 
    
         
            +
             
     | 
| 
      
 70 
     | 
    
         
            +
                  FORMAT_VERSION_KEY = 'bio-maf:index-format-version'
         
     | 
| 
      
 71 
     | 
    
         
            +
                  FORMAT_VERSION = 2
         
     | 
| 
      
 72 
     | 
    
         
            +
                  MAX_SPECIES = 64
         
     | 
| 
      
 73 
     | 
    
         
            +
             
     | 
| 
      
 74 
     | 
    
         
            +
                  ## Key-value store index format
         
     | 
| 
      
 75 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 76 
     | 
    
         
            +
                  ## This format is designed for Kyoto Cabinet but should work on
         
     | 
| 
      
 77 
     | 
    
         
            +
                  ## other key-value databases allowing binary data.
         
     | 
| 
      
 78 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 79 
     | 
    
         
            +
                  ## Index metadata is stored as ASCII text, but index data is
         
     | 
| 
      
 80 
     | 
    
         
            +
                  ## stored as packed binary values.
         
     | 
| 
      
 81 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 82 
     | 
    
         
            +
                  ## Index metadata:
         
     | 
| 
      
 83 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 84 
     | 
    
         
            +
                  ##   Sequence IDs:
         
     | 
| 
      
 85 
     | 
    
         
            +
                  ##     sequence:<name> => <id>
         
     | 
| 
      
 86 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 87 
     | 
    
         
            +
                  ##     Each indexed sequence has a corresponding entry of this
         
     | 
| 
      
 88 
     | 
    
         
            +
                  ##     kind. The <name> parameter is the sequence or chromosome
         
     | 
| 
      
 89 
     | 
    
         
            +
                  ##     name as found in the MAF file, e.g. mm8.chr7. The <id>
         
     | 
| 
      
 90 
     | 
    
         
            +
                  ##     parameter is assigned when the sequence is indexed, and
         
     | 
| 
      
 91 
     | 
    
         
            +
                  ##     can be from 0 to 255.
         
     | 
| 
      
 92 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 93 
     | 
    
         
            +
                  ##   Species IDs:
         
     | 
| 
      
 94 
     | 
    
         
            +
                  ##     species:<name> => <id>
         
     | 
| 
      
 95 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 96 
     | 
    
         
            +
                  ##     Each indexed species has a corresponding entry of this
         
     | 
| 
      
 97 
     | 
    
         
            +
                  ##     kind. The <name> parameter is the species part of the
         
     | 
| 
      
 98 
     | 
    
         
            +
                  ##     sequence name as found in the MAF file, e.g. 'mm8' for
         
     | 
| 
      
 99 
     | 
    
         
            +
                  ##     'mm8.chr7'. The <id> parameter is assigned when the
         
     | 
| 
      
 100 
     | 
    
         
            +
                  ##     species is indexed, and can be from 0 to 63 (MAX_SPECIES - 1).
         
     | 
| 
      
 101 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 102 
     | 
    
         
            +
                  ## Index data:
         
     | 
| 
      
 103 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 104 
     | 
    
         
            +
                  ##   For each sequence upon which an index is built, one index
         
     | 
| 
      
 105 
     | 
    
         
            +
                  ##   entry is generated per MAF alignment block. The key
         
     | 
| 
      
 106 
     | 
    
         
            +
                  ##   identifies the sequence, the UCSC index bin, and the
         
     | 
| 
      
 107 
     | 
    
         
            +
                  ##   zero-based start and end positions of the sequence. The
         
     | 
| 
      
 108 
     | 
    
         
            +
                  ##   value gives the offset and size of the alignment block
         
     | 
| 
      
 109 
     | 
    
         
            +
                  ##   within the MAF file.
         
     | 
| 
      
 110 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 111 
     | 
    
         
            +
                  ##   All values are stored as big-endian, unsigned packed binary
         
     | 
| 
      
 112 
     | 
    
         
            +
                  ##   data.
         
     | 
| 
      
 113 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 114 
     | 
    
         
            +
                  ## Keys: (12 bytes) [CCS>L>L>]
         
     | 
| 
      
 115 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 116 
     | 
    
         
            +
                  ##   0xFF (1 byte):
         
     | 
| 
      
 117 
     | 
    
         
            +
                  ##      index entry prefix
         
     | 
| 
      
 118 
     | 
    
         
            +
                  ##   Sequence chromosome ID (1 byte):
         
     | 
| 
      
 119 
     | 
    
         
            +
                  ##      corresponds to sequence:<name> entries
         
     | 
| 
      
 120 
     | 
    
         
            +
                  ##   UCSC bin (16 bits)
         
     | 
| 
      
 121 
     | 
    
         
            +
                  ##   Sequence start, zero-based, inclusive (32 bits)
         
     | 
| 
      
 122 
     | 
    
         
            +
                  ##   Sequence end, zero-based, exclusive (32 bits)
         
     | 
| 
      
 123 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 124 
     | 
    
         
            +
                  ## Values: (25 bytes) [Q>L>L>CQ>]
         
     | 
| 
      
 125 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 126 
     | 
    
         
            +
                  ##   MAF file offset (64 bits)
         
     | 
| 
      
 127 
     | 
    
         
            +
                  ##   MAF alignment block length (32 bits)
         
     | 
| 
      
 128 
     | 
    
         
            +
                  ##   Block text size (32 bits)
         
     | 
| 
      
 129 
     | 
    
         
            +
                  ##   Number of sequences in block (8 bits)
         
     | 
| 
      
 130 
     | 
    
         
            +
                  ##   Species bit vector (64 bits)
         
     | 
| 
      
 131 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 132 
     | 
    
         
            +
                  ## Example:
         
     | 
| 
      
 133 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 134 
     | 
    
         
            +
                  ##  For a block with sequence 0, bin 1195, start 80082334, end
         
     | 
| 
      
 135 
     | 
    
         
            +
                  ##       80082368, MAF offset 16, and MAF block length 1087:
         
     | 
| 
      
 136 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 137 
     | 
    
         
            +
                  ##     |  |id| bin | seq_start | seq_end   |
         
     | 
| 
      
 138 
     | 
    
         
            +
                  ## key: FF 00 04 AB 04 C5 F5 9E 04 C5 F5 C0
         
     | 
| 
      
 139 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 140 
     | 
    
         
            +
                  ##     |         offset        |  length   |   ts   |ns|  species_vec  |
         
     | 
| 
      
 141 
     | 
    
         
            +
                  ## val: 00 00 00 00 00 00 00 10 00 00 04 3F  [TODO: ts, ns and species_vec bytes]
         
     | 
| 
      
 142 
     | 
    
         
            +
             
     | 
| 
      
 143 
     | 
    
         
            +
                  #### Public API
         
     | 
| 
      
 144 
     | 
    
         
            +
             
     | 
| 
      
 145 
     | 
    
         
            +
                  # Open an existing index for reading.
         
     | 
| 
      
 146 
     | 
    
         
            +
                  # @param [String] path path to existing Kyoto Cabinet index
         
     | 
| 
      
 147 
     | 
    
         
            +
                  # @return [KyotoIndex]
         
     | 
| 
      
 148 
     | 
    
         
            +
                  def self.open(path)
                    # All of the work happens in #initialize.
                    KyotoIndex.new(path)
                  end
         
     | 
| 
      
 151 
     | 
    
         
            +
             
     | 
| 
      
 152 
     | 
    
         
            +
                  # Build a new index from the MAF file being parsed by `parser`,
         
     | 
| 
      
 153 
     | 
    
         
            +
                  # and store it in `path`.
         
     | 
| 
      
 154 
     | 
    
         
            +
                  # @param [Parser] parser MAF parser for file to index
         
     | 
| 
      
 155 
     | 
    
         
            +
                  # @param [String] path path to index file to create
         
     | 
| 
      
 156 
     | 
    
         
            +
                  # @return [KyotoIndex]
         
     | 
| 
      
 157 
     | 
    
         
            +
                  def self.build(parser, path)
                    # Create the index object, populate it from the parser,
                    # and hand the populated object back to the caller.
                    new(path).tap { |index| index.build_default(parser) }
                  end
         
     | 
| 
      
 162 
     | 
    
         
            +
             
     | 
| 
      
 163 
     | 
    
         
            +
                  # Find all alignment blocks in the genomic regions in the list
         
     | 
| 
      
 164 
     | 
    
         
            +
                  # of Bio::GenomicInterval objects, and parse them with the given
         
     | 
| 
      
 165 
     | 
    
         
            +
                  # parser.
         
     | 
| 
      
 166 
     | 
    
         
            +
                  #
         
     | 
| 
      
 167 
     | 
    
         
            +
                  # An optional Hash of filters may be passed in. The following
         
     | 
| 
      
 168 
     | 
    
         
            +
                  # keys are used:
         
     | 
| 
      
 169 
     | 
    
         
            +
                  #
         
     | 
| 
      
 170 
     | 
    
         
            +
                  #  * `:with_all_species => ["sp1", "sp2", ...]`
         
     | 
| 
      
 171 
     | 
    
         
            +
                  #
         
     | 
| 
      
 172 
     | 
    
         
            +
                  #      Only match alignment blocks containing all given species.
         
     | 
| 
      
 173 
     | 
    
         
            +
                  #
         
     | 
| 
      
 174 
     | 
    
         
            +
                  #  * `:at_least_n_sequences => n`
         
     | 
| 
      
 175 
     | 
    
         
            +
                  #
         
     | 
| 
      
 176 
     | 
    
         
            +
                  #      Only match alignment blocks with at least N sequences.
         
     | 
| 
      
 177 
     | 
    
         
            +
                  #
         
     | 
| 
      
 178 
     | 
    
         
            +
                  #  * `:min_size => n`
         
     | 
| 
      
 179 
     | 
    
         
            +
                  #
         
     | 
| 
      
 180 
     | 
    
         
            +
                  #      Only match alignment blocks with text size at least N.
         
     | 
| 
      
 181 
     | 
    
         
            +
                  #
         
     | 
| 
      
 182 
     | 
    
         
            +
                  #  * `:max_size => n`
         
     | 
| 
      
 183 
     | 
    
         
            +
                  #
         
     | 
| 
      
 184 
     | 
    
         
            +
                  #      Only match alignment blocks with text size at most N.
         
     | 
| 
      
 185 
     | 
    
         
            +
                  #
         
     | 
| 
      
 186 
     | 
    
         
            +
                  # @param [Enumerable<Bio::GenomicInterval>] intervals genomic
         
     | 
| 
      
 187 
     | 
    
         
            +
                  #  intervals to parse.
         
     | 
| 
      
 188 
     | 
    
         
            +
                  # @param [Parser] parser MAF parser for file to fetch blocks
         
     | 
| 
      
 189 
     | 
    
         
            +
                  #  from.
         
     | 
| 
      
 190 
     | 
    
         
            +
                  # @param [Hash] filter Block filter expression.
         
     | 
| 
      
 191 
     | 
    
         
            +
                  # @return [Array<Block>]
         
     | 
| 
      
 192 
     | 
    
         
            +
                  # @api public
         
     | 
| 
      
 193 
     | 
    
         
            +
                  def find(intervals, parser, filter={})
                    started_at = Time.now
                    fetch_items = fetch_list(intervals, filter)
                    elapsed = Time.now - started_at
                    # Timing of the index lookup is reported on stderr.
                    $stderr.printf("Built fetch list of %d items in %.3fs.\n",
                                   fetch_items.size,
                                   elapsed)
                    parser.fetch_blocks(fetch_items)
                  end
         
     | 
| 
      
 201 
     | 
    
         
            +
             
     | 
| 
      
 202 
     | 
    
         
            +
                  # Close the underlying Kyoto Cabinet database handle.
         
     | 
| 
      
 203 
     | 
    
         
            +
                  def close
                    @db.close
                  end
         
     | 
| 
      
 206 
     | 
    
         
            +
             
     | 
| 
      
 207 
     | 
    
         
            +
                  #### KyotoIndex Internals
         
     | 
| 
      
 208 
     | 
    
         
            +
                  # @api private
         
     | 
| 
      
 209 
     | 
    
         
            +
             
     | 
| 
      
 210 
     | 
    
         
            +
                  # Open or create the Kyoto Cabinet index at `path`.
                  #
                  # The database is treated as read-only when an already-open
                  # handle is passed as `db_arg`, or when `path` is longer
                  # than one character and names an existing file; in that
                  # case the index sequences and species are loaded. In every
                  # other case the database is opened for writing and created
                  # if needed.
                  #
                  # @param [String, #to_s] path path to the index file
                  # @param [KyotoCabinet::DB, nil] db_arg already-open handle
                  #   to use instead of opening `path`
                  # @raise [RuntimeError] if the database cannot be opened
                  def initialize(path, db_arg=nil)
                    @species = {}
                    @species_max_id = -1
                    # Work on the string form throughout. Pathname#size is
                    # the file size and raises for a missing file, whereas
                    # the check below is about the length of the path text.
                    # NOTE(review): the "> 1" test presumably keeps Kyoto
                    # Cabinet's one-character in-memory database names from
                    # being treated as files -- confirm.
                    path_str = path.to_s
                    if db_arg || (path_str.size > 1 && File.exist?(path_str))
                      mode = KyotoCabinet::DB::OREADER
                    else
                      mode = KyotoCabinet::DB::OWRITER | KyotoCabinet::DB::OCREATE
                    end
                    @db = db_arg || KyotoCabinet::DB.new
                    @path = path
                    # A handle supplied by the caller is used as it is and is
                    # never opened again here.
                    unless db_arg || db.open(path_str, mode)
                      raise "Could not open DB file!"
                    end
                    if mode == KyotoCabinet::DB::OREADER
                      load_index_sequences
                      load_species
                    end
                  end
         
     | 
| 
      
 228 
     | 
    
         
            +
             
     | 
| 
      
 229 
     | 
    
         
            +
                  # Reopen the same DB handle read-only. Only useful for unit tests.
         
     | 
| 
      
 230 
     | 
    
         
            +
                  # @return [KyotoIndex] a new index object built from this
                  #   instance's path and DB handle
                  def reopen
                    KyotoIndex.new(@path, db)
                  end
         
     | 
| 
      
 233 
     | 
    
         
            +
             
     | 
| 
      
 234 
     | 
    
         
            +
                  # Write a human-readable listing of the index to `stream`.
                  #
                  # Records are read in cursor order starting from the empty
                  # key. Each record is printed as a `key: value` metadata
                  # line until the first key whose leading byte is 0xFF; that
                  # record and all following ones are unpacked with KEY_FMT
                  # and VAL_FMT and printed as index entries.
                  #
                  # @param [#puts, #printf] stream destination for the listing
                  # @raise [RuntimeError] if the records run out before an
                  #   index entry is reached
                  def dump(stream=$stdout)
                    stream.puts "KyotoIndex dump: #{@path}"
                    stream.puts
                    if db.count == 0
                      stream.puts "Empty database!"
                      return
                    end
                    db.cursor_process do |cur|
                      stream.puts "== Metadata =="
                      cur.jump('')
                      loop do
                        k, v = cur.get(false)
                        raise "unexpected end of records!" unless k
                        # Test the raw byte rather than comparing against a
                        # "\xff" literal: a binary key and a literal from a
                        # UTF-8 source file never compare equal, so the end
                        # of the metadata section would go unnoticed.
                        break if k.getbyte(0) == 0xFF
                        stream.puts "#{k}: #{v}"
                        raise "could not advance cursor!" unless cur.step
                      end
                      stream.puts "== Index records =="
                      # get(true) hands back the current record and advances
                      # the cursor; the loop stops once it returns nothing.
                      while (pair = cur.get(true))
                        _, chr, bin, s_start, s_end = pair[0].unpack(KEY_FMT)
                        offset, len, text_size, n_seq, species_vec = pair[1].unpack(VAL_FMT)
                        stream.puts "#{chr} [bin #{bin}] #{s_start}:#{s_end}"
                        stream.puts "  offset #{offset}, length #{len}"
                        stream.puts "  text size: #{text_size}"
                        stream.puts "  sequences in block: #{n_seq}"
                        stream.printf("  species vector: %016x\n", species_vec)
                      end
                    end
                  end
         
     | 
| 
      
 265 
     | 
    
         
            +
             
     | 
| 
      
 266 
     | 
    
         
            +
                  ## Retrieval:
         
     | 
| 
      
 267 
     | 
    
         
            +
                  ##  1. merge the intervals of interest
         
     | 
| 
      
 268 
     | 
    
         
            +
                  ##  2. for each interval, compute the bins with #bin_all
         
     | 
| 
      
 269 
     | 
    
         
            +
                  ##  3. for each bin to search, make a list of intervals of
         
     | 
| 
      
 270 
     | 
    
         
            +
                  ##     interest
         
     | 
| 
      
 271 
     | 
    
         
            +
                  ##  4. compute the spanning interval for that bin
         
     | 
| 
      
 272 
     | 
    
         
            +
                  ##  5. start at the beginning of the bin
         
     | 
| 
      
 273 
     | 
    
         
            +
                  ##  6. if a record intersects the spanning interval: 
         
     | 
| 
      
 274 
     | 
    
         
            +
                  ##    A. #find an interval it intersects
         
     | 
| 
      
 275 
     | 
    
         
            +
                  ##    B. if found, add to the fetch list
         
     | 
| 
      
 276 
     | 
    
         
            +
                  ##  7. if a record starts past the end of the spanning interval,
         
     | 
| 
      
 277 
     | 
    
         
            +
                  ##     we are done scanning this bin.
         
     | 
| 
      
 278 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 279 
     | 
    
         
            +
                  ## Optimizations:
         
     | 
| 
      
 280 
     | 
    
         
            +
                  ##  * once we reach the start of the spanning interval,
         
     | 
| 
      
 281 
     | 
    
         
            +
                  ##    all records start in it until we see a record starting
         
     | 
| 
      
 282 
     | 
    
         
            +
                  ##    past it.
         
     | 
| 
      
 283 
     | 
    
         
            +
                  ##  * as record starts pass the start of intervals of interest,
         
     | 
| 
      
 284 
     | 
    
         
            +
                  ##    pull those intervals off the list
         
     | 
| 
      
 285 
     | 
    
         
            +
             
     | 
| 
      
 286 
     | 
    
         
            +
                  # Build a fetch list of alignment blocks to read, given an array
         
     | 
| 
      
 287 
     | 
    
         
            +
                  # of Bio::GenomicInterval objects
         
     | 
| 
      
 288 
     | 
    
         
            +
                  # Compute which index records must be read to cover a set of
                  # genomic intervals on a single chromosome.
                  #
                  # Each interval is turned into a half-open zero-based range
                  # (zero_start...zero_end) and filed under every bin reported by
                  # its #bin_all; the per-bin lists are sorted by range start and
                  # then handed to #scan_bins_parallel (when RUBY_PLATFORM is
                  # 'java') or #scan_bins.
                  #
                  # @param intervals [Array] interval objects responding to
                  #   #chrom, #zero_start, #zero_end and #bin_all
                  # @param filter_spec [Hash, nil] passed to Filters.build; nil is
                  #   treated as an empty hash
                  # @return [Array] whatever the bin scan collected; an empty
                  #   array when no intervals were given
                  # @raise [RuntimeError] if the chromosome is not present in
                  #   index_sequences, or the intervals name different chromosomes
                  def fetch_list(intervals, filter_spec={})
                    # Nothing requested means nothing to fetch; without this guard
                    # intervals.first would be nil and #chrom would blow up.
                    return [] if intervals.empty?
                    start = Time.now
                    filters = Filters.build(filter_spec || {}, self)
                    chrom = intervals.first.chrom
                    chrom_id = index_sequences[chrom]
                    unless chrom_id
                      raise "chromosome #{chrom} not indexed!"
                    end
                    if intervals.any? { |i| i.chrom != chrom }
                      raise "all intervals must be for the same chromosome!"
                    end
                    # for each bin, build a list of the intervals to look for there
                    bin_intervals = Hash.new { |h, k| h[k] = [] }
                    intervals.each do |i|
                      range = i.zero_start...i.zero_end
                      i.bin_all.each do |bin|
                        bin_intervals[bin] << range
                      end
                    end
                    # #scan_bin relies on each list being ordered by start
                    bin_intervals.each_value do |ranges|
                      ranges.sort_by! { |r| r.begin }
                    end
                    ready = Time.now
                    $stderr.puts "bin intervals computed after #{ready - start} seconds."
                    if RUBY_PLATFORM == 'java'
                      scan_bins_parallel(chrom_id, bin_intervals, filters)
                    else
                      scan_bins(chrom_id, bin_intervals, filters)
                    end
                  end # #fetch_list
         
     | 
| 
      
 318 
     | 
    
         
            +
             
     | 
| 
      
 319 
     | 
    
         
            +
                  # Scan the index for blocks matching the given bins and intervals.
         
     | 
| 
      
 320 
     | 
    
         
            +
                  # Sequentially scan every requested bin with a single database
                  # cursor and gather the matches.
                  #
                  # @param chrom_id [Integer] id of the chromosome in the index
                  # @param bin_intervals [Hash] bin number => list of ranges to
                  #   look for in that bin
                  # @param filters [#match] filter set passed through to #scan_bin
                  # @return [Array] concatenation of the #scan_bin results, in
                  #   the iteration order of bin_intervals
                  def scan_bins(chrom_id, bin_intervals, filters)
                    hits = []
                    db.cursor_process do |cursor|
                      bin_intervals.each do |bin, wanted|
                        hits.concat(scan_bin(cursor, chrom_id, bin, wanted, filters))
                      end
                    end
                    hits
                  end
         
     | 
| 
      
 330 
     | 
    
         
            +
             
     | 
| 
      
 331 
     | 
    
         
            +
                  # Run the given block, optionally under the JRuby profiler.
                  #
                  # Profiling happens only when RUBY_PLATFORM is 'java' and the
                  # 'profile' environment variable is set; the flat profile is
                  # then printed to STDERR. In every case the block's value is
                  # returned.
                  def with_profiling
                    return yield unless RUBY_PLATFORM == 'java' && ENV['profile']

                    result = nil
                    profile_data = JRuby::Profiler.profile { result = yield }
                    JRuby::Profiler::FlatProfilePrinter.new(profile_data).printProfile(STDERR)
                    result
                  end
         
     | 
| 
      
 344 
     | 
    
         
            +
             
     | 
| 
      
 345 
     | 
    
         
            +
                  # Scan the requested bins with several worker threads, using
                  # java.util.concurrent queues to hand out jobs and collect
                  # results (so this only works on JRuby).
                  #
                  # One job is queued per entry of bin_intervals. Four workers are
                  # started, or a single one when the 'profile' environment
                  # variable is set. The calling thread drains the results queue
                  # until one result per job has arrived.
                  #
                  # @param chrom_id [Integer] id of the chromosome in the index
                  # @param bin_intervals [Hash] bin number => list of ranges
                  # @param filters [#match] filter set passed through to #scan_bin
                  # @return [Array] concatenation of all worker results (order
                  #   depends on worker completion order)
                  # @raise [RuntimeError] if a worker reports an exception, or all
                  #   workers have exited before every job produced a result
                  def scan_bins_parallel(chrom_id, bin_intervals, filters)
                    start = Time.now
                    n_threads = ENV['profile'] ? 1 : 4
                    n_jobs = bin_intervals.size
                    jobs = java.util.concurrent.ConcurrentLinkedQueue.new(bin_intervals.to_a)
                    # bounded, so workers block in #put if the consumer falls behind
                    completed = java.util.concurrent.LinkedBlockingQueue.new(128)
                    threads = Array.new(n_threads) do
                      make_scan_worker(jobs, completed) do |cur, req|
                        bin, intervals = req
                        scan_bin(cur, chrom_id, bin, intervals, filters)
                      end
                    end
                    n_completed = 0
                    to_fetch = []
                    while n_completed < n_jobs
                      c = completed.poll(5, java.util.concurrent.TimeUnit::SECONDS)
                      if c.nil?
                        next if threads.any? { |t| t.alive? }
                        # A worker may have queued its final result and exited
                        # between the poll timing out and the liveness check
                        # above, so look once more (non-blocking) before
                        # declaring the job lost.
                        c = completed.poll
                        if c.nil?
                          raise "No threads alive, completed #{n_completed}/#{n_jobs} jobs!"
                        end
                      end
                      raise "worker failed: #{c}" if c.is_a? Exception
                      to_fetch.concat(c)
                      n_completed += 1
                    end
                    threads.each { |t| t.join }
                    $stderr.printf("Matched %d index records with %d threads in %.3f seconds.\n",
                                   to_fetch.size, n_threads, Time.now - start)
                    to_fetch
                  end
         
     | 
| 
      
 377 
     | 
    
         
            +
             
     | 
| 
      
 378 
     | 
    
         
            +
                  # Start a worker thread that drains a job queue.
                  #
                  # The thread opens its own database cursor (inside
                  # #with_profiling) and, for every request taken from +jobs+,
                  # yields the cursor and the request to the caller's block and
                  # puts the block's value on +completed+. It stops when
                  # +jobs.poll+ returns nothing.
                  #
                  # If anything is raised while handling a request, the exception
                  # object itself is put on +completed+ so the consumer can see
                  # it, the error and backtrace are written to $stderr, and the
                  # exception is re-raised in the worker thread.
                  #
                  # @param jobs [#poll] source of requests
                  # @param completed [#put] sink for results and exceptions
                  # @yield [cur, req] the worker's cursor and one request
                  # @return [Thread] the started worker
                  def make_scan_worker(jobs, completed)
                    Thread.new do
                      with_profiling do
                        db.cursor_process do |cursor|
                          while (request = jobs.poll)
                            begin
                              outcome = yield(cursor, request)
                              completed.put(outcome)
                            rescue Exception => err
                              # deliberately broad: report whatever happened to
                              # the consumer, then let the thread die with it
                              completed.put(err)
                              $stderr.puts "Worker failing: #{err.class}: #{err}"
                              $stderr.puts err.backtrace.join("\n")
                              raise err
                            end
                          end
                        end
                      end
                    end
                  end
         
     | 
| 
      
 399 
     | 
    
         
            +
             
     | 
| 
      
 400 
     | 
    
         
            +
                  # Scan one bin of one chromosome for index records overlapping
                  # any of the given ranges.
                  #
                  # The cursor is positioned at the bin's key prefix and records
                  # are read in key order until the chromosome or bin changes, or
                  # a record starts at or beyond the largest range end. A record
                  # is reported at most once, for the first range it overlaps,
                  # and only if +filters.match+ accepts it.
                  #
                  # NOTE: +bin_intervals+ is consumed destructively; ranges that
                  # end before the current record's start are shifted off the
                  # front.
                  #
                  # @param cur [#jump, #get] database cursor
                  # @param chrom_id [Integer] id of the chromosome in the index
                  # @param bin [Integer] bin number to scan
                  # @param bin_intervals [Array<Range>] ranges of interest,
                  #   sorted by start
                  # @param filters [#match] called with the raw key/value pair
                  # @return [Array] extract_index_offset(pair) for each match
                  def scan_bin(cur, chrom_id, bin, bin_intervals, filters)
                    # extent covered by all ranges of interest
                    span_lo = bin_intervals.first.begin
                    span_hi = bin_intervals.map { |r| r.end }.max
                    cur.jump(bin_start_prefix(chrom_id, bin))
                    hits = []
                    while (record = cur.get(true))
                      r_chrom, r_bin, r_start, r_end = record[0].unpack(KEY_SCAN_FMT)
                      # stop at the next chromosome or bin, or once records
                      # start past everything we are looking for
                      break unless r_chrom == chrom_id && r_bin == bin && r_start < span_hi
                      # record lies entirely before the first range of interest
                      next if r_end < span_lo
                      # ranges ending before this record can never match again
                      bin_intervals.shift while bin_intervals.first.end < r_start
                      bin_intervals.each do |wanted|
                        w_start = wanted.begin
                        # remaining ranges start even later; none can overlap
                        break if w_start > r_end
                        overlapping = (r_start <= w_start && w_start < r_end) ||
                                      wanted.include?(r_start)
                        next unless overlapping && filters.match(record)
                        hits << extract_index_offset(record)
                        break
                      end
                    end
                    hits
                  end
         
     | 
| 
      
 438 
     | 
    
         
            +
             
     | 
| 
      
 439 
     | 
    
         
            +
                  # Test whether a range overlaps the half-open span
                  # [i_start, i_end).
                  #
                  # True when the range's start falls inside the span, or when
                  # the span's start falls inside the range.
                  #
                  # @param gi [Range] range to test
                  # @param i_start [Integer] start of the span (inclusive)
                  # @param i_end [Integer] end of the span (exclusive)
                  # @return [Boolean]
                  def overlaps?(gi, i_start, i_end)
                    range_start = gi.begin
                    span_holds_range_start = i_start <= range_start && range_start < i_end
                    span_holds_range_start || gi.include?(i_start)
                  end
         
     | 
| 
      
 445 
     | 
    
         
            +
             
     | 
| 
      
 446 
     | 
    
         
            +
                  # Build the index from a parser positioned at the start of its
                  # blocks.
                  #
                  # The source of the first sequence of the first block becomes
                  # the single indexed sequence (id 0). The format version is
                  # recorded, the first block is indexed on its own, and the
                  # remaining blocks are indexed in batches of 1000 before the
                  # database is synchronized.
                  #
                  # @param parser [#parse_block, #each_block] block source
                  # @return the result of db.synchronize(true)
                  def build_default(parser)
                    head = parser.parse_block
                    reference_name = head.sequences.first.source
                    db[FORMAT_VERSION_KEY] = FORMAT_VERSION
                    @index_sequences = { reference_name => 0 }
                    store_index_sequences!
                    index_blocks([head])
                    remaining = parser.enum_for(:each_block)
                    remaining.each_slice(1000) { |batch| index_blocks(batch) }
                    db.synchronize(true)
                  end
         
     | 
| 
      
 458 
     | 
    
         
            +
             
     | 
| 
      
 459 
     | 
    
         
            +
                  # Write the index entries for a batch of blocks in one bulk
                  # store.
                  #
                  # The per-block hashes from #entries_for are merged into a
                  # single hash. The merge starts from an empty hash so that an
                  # empty batch stores an empty hash instead of passing nil to
                  # set_bulk.
                  #
                  # @param blocks [Array] blocks accepted by #entries_for
                  # @return the result of db.set_bulk
                  def index_blocks(blocks)
                    entries = blocks.each_with_object({}) do |block, merged|
                      merged.merge!(entries_for(block))
                    end
                    db.set_bulk(entries, false)
                  end
         
     | 
| 
      
 463 
     | 
    
         
            +
             
     | 
| 
      
 464 
     | 
    
         
            +
                  # Read the indexed sequence names and ids back from the
                  # database.
                  #
                  # Every key starting with "sequence:" is taken; the text after
                  # the first colon is the sequence name and the stored value,
                  # converted with #to_i, is its id. The resulting hash replaces
                  # @index_sequences.
                  #
                  # @return [Hash] sequence name => integer id
                  def load_index_sequences
                    @index_sequences =
                      db.match_prefix("sequence:").each_with_object({}) do |key, by_name|
                        by_name[key.split(':', 2)[1]] = db[key].to_i
                      end
                  end
         
     | 
| 
      
 473 
     | 
    
         
            +
             
     | 
| 
      
 474 
     | 
    
         
            +
                  # Persist the indexed sequence names: each name/id pair of
                  # index_sequences is written under the key "sequence:<name>"
                  # with the id converted to a string.
                  def store_index_sequences!
                    index_sequences.each do |seq_name, seq_id|
                      record_key = "sequence:#{seq_name}"
                      db.set(record_key, seq_id.to_s)
                    end
                  end
         
     | 
| 
      
 479 
     | 
    
         
            +
             
     | 
| 
      
 480 
     | 
    
         
            +
                  def load_species
         
     | 
| 
      
 481 
     | 
    
         
            +
                    db.match_prefix("species:").each do |key|
         
     | 
| 
      
 482 
     | 
    
         
            +
                      _, name = key.split(':', 2)
         
     | 
| 
      
 483 
     | 
    
         
            +
                      id = db[key].to_i
         
     | 
| 
      
 484 
     | 
    
         
            +
                      @species[name] = id
         
     | 
| 
      
 485 
     | 
    
         
            +
                    end
         
     | 
| 
      
 486 
     | 
    
         
            +
                    @species_max_id = @species.values.sort.last || -1
         
     | 
| 
      
 487 
     | 
    
         
            +
                  end
         
     | 
| 
      
 488 
     | 
    
         
            +
             
     | 
| 
      
 489 
     | 
    
         
            +
                  # Look up, or allocate, the species id for a sequence source.
                  #
                  # The species name is the part of the source before the first
                  # dot; the remainder may itself contain dots, e.g.
                  # otoGar1.scaffold_104707.1-93001. A species not seen before
                  # gets the next id after @species_max_id, which is recorded in
                  # the species table and in the database under
                  # "species:<name>".
                  #
                  # @param seq [String] sequence source name
                  # @return [Integer, nil] the species id, or nil when the source
                  #   contains no dot
                  # @raise [RuntimeError] if a new id would reach MAX_SPECIES
                  def species_id_for_seq(seq)
                    species_name, remainder = seq.split('.', 2)
                    # not in species.sequence format, apparently
                    return nil if remainder.nil?
                    return species[species_name] if species.has_key?(species_name)

                    new_id = @species_max_id + 1
                    if new_id >= MAX_SPECIES
                      raise "cannot index MAF file with more than #{MAX_SPECIES} species"
                    end
                    species[species_name] = new_id
                    db["species:#{species_name}"] = new_id
                    @species_max_id = new_id
                    new_id
                  end
         
     | 
| 
      
 512 
     | 
    
         
            +
             
     | 
| 
      
 513 
     | 
    
         
            +
                  # Pack the index value stored for a block.
                  #
                  # The value holds the block's offset, size, text size and
                  # number of sequences, followed by a species bit vector with
                  # bit N set when a sequence belonging to species id N is
                  # present, all packed with VAL_FMT.
                  #
                  # Sequences for which #species_id_for_seq returns nil (sources
                  # without a species prefix) contribute no bit; previously they
                  # caused a TypeError from `1 << nil`. They are still counted in
                  # the number of sequences.
                  #
                  # @param block [#offset, #size, #text_size, #sequences]
                  # @return [String] packed value
                  def build_block_value(block)
                    vec = block.sequences.reduce(0) do |acc, seq|
                      species_id = species_id_for_seq(seq.source)
                      species_id ? acc | (1 << species_id) : acc
                    end
                    [block.offset,
                     block.size,
                     block.text_size,
                     block.sequences.size,
                     vec].pack(VAL_FMT)
                  end
         
     | 
| 
      
 522 
     | 
    
         
            +
             
     | 
| 
      
 523 
     | 
    
         
            +
                  # Build the index entries for one block.
                  #
                  # One entry is produced for each sequence of the block whose
                  # source appears in index_sequences; other sequences are
                  # skipped. The key packs the constant 255, the sequence id, the
                  # UCSC bin for [start, start + size) and those two coordinates
                  # with KEY_FMT. All entries share the same value, built once by
                  # #build_block_value.
                  #
                  # @param block [#sequences] block to index
                  # @return [Hash] packed key => packed value
                  def entries_for(block)
                    packed_value = build_block_value(block)
                    block.sequences.each_with_object({}) do |seq, entries|
                      seq_id = index_sequences[seq.source]
                      next unless seq_id
                      first = seq.start
                      past_end = first + seq.size
                      bin = Bio::Ucsc::UcscBin.bin_from_range(first, past_end)
                      packed_key = [255, seq_id, bin, first, past_end].pack(KEY_FMT)
                      entries[packed_key] = packed_value
                    end
                  end
         
     | 
| 
      
 536 
     | 
    
         
            +
                end # class KyotoIndex
         
     | 
| 
      
 537 
     | 
    
         
            +
             
     | 
| 
      
 538 
     | 
    
         
            +
                # Base class for index-entry filters. Subclasses supply
                # #match(entry); KVHelpers is mixed in so they can use its
                # helper methods.
                class Filter
                  include KVHelpers

                  # Lets a filter be invoked like a callable: simply forwards
                  # to #match.
                  #
                  # @param e the index entry to test
                  # @return whatever the subclass's #match returns for e
                  def call(e)
                    match(e)
                  end
                end
         
     | 
| 
      
 545 
     | 
    
         
            +
             
     | 
| 
      
 546 
     | 
    
         
            +
                # Filter that accepts an entry only when its species bit vector
                # has every requested species' bit set.
                class AllSpeciesFilter < Filter
                  # @return [Integer] bit mask with one bit set per requested
                  #   species
                  attr_reader :mask

                  # Retained for backward compatibility. It used to be declared
                  # with attr_reader although @bs was never assigned, so it
                  # always returned nil; it now exposes the mask actually used
                  # for matching.
                  alias_method :bs, :mask

                  # @param species [Enumerable] the species that must all be
                  #   present
                  # @param idx index object; idx.species.fetch(s) must give the
                  #   bit position for species s (fetch raises if s is unknown,
                  #   e.g. KeyError when idx.species is a Hash)
                  def initialize(species, idx)
                    @mask = species.inject(0) do |acc, s|
                      acc | (1 << idx.species.fetch(s))
                    end
                  end

                  # @param entry the index entry to test
                  # @return [Boolean] true when every bit of the mask is also
                  #   set in the entry's species vector
                  def match(entry)
                    vec = extract_species_vec(entry)
                    (@mask & vec) == @mask
                  end
                end
         
     | 
| 
      
 558 
     | 
    
         
            +
             
     | 
| 
      
 559 
     | 
    
         
            +
                # Filter that accepts entries whose sequence count (as read by
                # extract_n_sequences) is at least n.
                class AtLeastNSequencesFilter < Filter
                  # @return the minimum number of sequences required
                  attr_reader :n

                  # @param n minimum number of sequences
                  # @param idx unused here; accepted because Filters.build
                  #   constructs every filter with (value, index)
                  def initialize(n, idx)
                    @n = n
                  end

                  # @param entry the index entry to test
                  # @return [Boolean] whether the entry has at least n sequences
                  def match(entry)
                    sequence_count = extract_n_sequences(entry)
                    sequence_count >= @n
                  end
                end
         
     | 
| 
      
 569 
     | 
    
         
            +
             
     | 
| 
      
 570 
     | 
    
         
            +
                # Filter that accepts entries whose text size (as read by
                # extract_text_size) is no greater than n.
                class MaxSizeFilter < Filter
                  # @param n maximum text size
                  # @param idx unused here; accepted because Filters.build
                  #   constructs every filter with (value, index)
                  def initialize(n, idx)
                    @n = n
                  end

                  # @param entry the index entry to test
                  # @return [Boolean] whether the entry's text size is <= n
                  def match(entry)
                    text_size = extract_text_size(entry)
                    text_size <= @n
                  end
                end
         
     | 
| 
      
 578 
     | 
    
         
            +
             
     | 
| 
      
 579 
     | 
    
         
            +
                # Filter that accepts entries whose text size (as read by
                # extract_text_size) is no smaller than n.
                class MinSizeFilter < Filter
                  # @param n minimum text size
                  # @param idx unused here; accepted because Filters.build
                  #   constructs every filter with (value, index)
                  def initialize(n, idx)
                    @n = n
                  end

                  # @param entry the index entry to test
                  # @return [Boolean] whether the entry's text size is >= n
                  def match(entry)
                    text_size = extract_text_size(entry)
                    text_size >= @n
                  end
                end
         
     | 
| 
      
 587 
     | 
    
         
            +
             
     | 
| 
      
 588 
     | 
    
         
            +
                # A conjunction of filters: an entry matches only if every
                # contained filter accepts it.
                class Filters
                  include KVHelpers

                  # Maps each supported filter-spec key to the class that
                  # implements it.
                  FILTER_CLASSES = {
                    :with_all_species => MAF::AllSpeciesFilter,
                    :at_least_n_sequences => MAF::AtLeastNSequencesFilter,
                    :min_size => MAF::MinSizeFilter,
                    :max_size => MAF::MaxSizeFilter
                  }

                  # Builds a Filters instance from a specification.
                  #
                  # @param spec enumerable of (key, value) pairs, e.g. a Hash;
                  #   each key must be present in FILTER_CLASSES
                  # @param idx the index, handed to every filter constructor
                  # @return [Filters]
                  # @raise [RuntimeError] if a key is not in FILTER_CLASSES
                  def self.build(spec, idx)
                    filters = spec.collect do |key, val|
                      filter_class = FILTER_CLASSES.fetch(key) do
                        raise "Unsupported filter key #{key}!"
                      end
                      filter_class.new(val, idx)
                    end
                    Filters.new(filters)
                  end

                  # @param l list of filter objects responding to #call
                  def initialize(l)
                    @l = l
                  end

                  # @param entry the index entry to test
                  # @return [Boolean] true when no filter rejects the entry
                  #   (hence true for an empty filter list)
                  def match(entry)
                    @l.all? { |filter| filter.call(entry) }
                  end
                end
         
     | 
| 
      
 617 
     | 
    
         
            +
             
     | 
| 
      
 618 
     | 
    
         
            +
              end # module MAF
         
     | 
| 
      
 619 
     | 
    
         
            +
              
         
     | 
| 
      
 620 
     | 
    
         
            +
            end
         
     |