bio-ucsc-api 0.0.5 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +36 -16
- data/Rakefile +3 -3
- data/VERSION +1 -1
- data/bio-ucsc-api.gemspec +9 -7
- data/lib/bio-ucsc/hg18.rb +2 -4
- data/lib/bio-ucsc/hg18/activerecord.rb +1 -1
- data/lib/bio-ucsc/hg18/all_bacends.rb +0 -1
- data/lib/bio-ucsc/hg18/cnpiafrate2.rb +0 -1
- data/lib/bio-ucsc/hg18/cnplocke.rb +0 -1
- data/lib/bio-ucsc/hg18/cnpredon.rb +0 -1
- data/lib/bio-ucsc/hg18/cnpsebat2.rb +0 -1
- data/lib/bio-ucsc/hg18/cnpsharp2.rb +0 -1
- data/lib/bio-ucsc/hg18/cnptuzun.rb +0 -1
- data/lib/bio-ucsc/hg18/cytoband.rb +0 -1
- data/lib/bio-ucsc/hg18/db_connection.rb +1 -1
- data/lib/bio-ucsc/hg18/delconrad2.rb +0 -1
- data/lib/bio-ucsc/hg18/delhinds2.rb +0 -1
- data/lib/bio-ucsc/hg18/delmccarroll.rb +0 -1
- data/lib/bio-ucsc/hg18/dgv.rb +0 -1
- data/lib/bio-ucsc/hg18/ensgene.rb +0 -1
- data/lib/bio-ucsc/hg18/exaptedrepeats.rb +0 -1
- data/lib/bio-ucsc/hg18/hgcentral_wikitrack.rb +0 -1
- data/lib/bio-ucsc/hg18/kgprotmap2.rb +0 -1
- data/lib/bio-ucsc/hg18/kgtargetali.rb +0 -1
- data/lib/bio-ucsc/hg18/kiddeichlerdiscabc10.rb +0 -1
- data/lib/bio-ucsc/hg18/kiddeichlerdiscabc11.rb +0 -1
- data/lib/bio-ucsc/hg18/kiddeichlerdiscabc12.rb +0 -1
- data/lib/bio-ucsc/hg18/kiddeichlerdiscabc13.rb +0 -1
- data/lib/bio-ucsc/hg18/kiddeichlerdiscabc14.rb +0 -1
- data/lib/bio-ucsc/hg18/kiddeichlerdiscabc8.rb +0 -1
- data/lib/bio-ucsc/hg18/kiddeichlerdiscabc9.rb +0 -1
- data/lib/bio-ucsc/hg18/kiddeichlerdiscg248.rb +0 -1
- data/lib/bio-ucsc/hg18/reference.rb +4 -163
- data/lib/bio-ucsc/hg18/reference_sequence.rb +171 -0
- data/lib/bio-ucsc/hg18/refgene.rb +0 -1
- data/lib/bio-ucsc/hg18/snp130.rb +0 -1
- data/lib/bio-ucsc/hg19.rb +2 -1
- data/lib/bio-ucsc/hg19/activerecord.rb +1 -1
- data/lib/bio-ucsc/hg19/cytoband.rb +0 -1
- data/lib/bio-ucsc/hg19/cytobandideo.rb +0 -1
- data/lib/bio-ucsc/hg19/db_connection.rb +1 -1
- data/lib/bio-ucsc/hg19/dgv.rb +0 -1
- data/lib/bio-ucsc/hg19/ensgene.rb +0 -1
- data/lib/bio-ucsc/hg19/reference.rb +4 -163
- data/lib/bio-ucsc/hg19/reference_sequence.rb +171 -0
- data/lib/bio-ucsc/hg19/refgene.rb +0 -1
- data/lib/bio-ucsc/hg19/snp131.rb +0 -1
- data/lib/bio-ucsc/hg19/wgrna.rb +0 -1
- data/samples/hg19-2bit-retrieve.rb +2 -2
- data/spec/hg18/{reference_spec.rb → reference_sequence_spec.rb} +26 -26
- data/spec/hg19/{reference_spec.rb → reference_sequence_spec.rb} +26 -26
- metadata +11 -10
| @@ -0,0 +1,171 @@ | |
| 1 | 
            +
            #
         | 
| 2 | 
            +
            # = reference_sequence.rb
         | 
| 3 | 
            +
            # handle UCSC's 2bit file (locally stored) to retrieve the reference sequence
         | 
| 4 | 
            +
            #
         | 
| 5 | 
            +
            # Copyright::   Copyright (C) 2011
         | 
| 6 | 
            +
            #               MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp> 
         | 
| 7 | 
            +
            # License::     Ruby license (Ruby's / GPLv2 dual)
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            # require 'bio'
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            module Bio
              module Ucsc
                module Hg18
                  # Header fields read from the first 16 bytes of a .2bit file.
                  TwoBitHeader =
                    Struct.new(:signature, :version, :sequence_count, :reserved)

                  # Per-sequence record: the sequence length, its N-block and
                  # mask-block intervals, a reserved word, and the byte offset at
                  # which the packed (2 bits per base) DNA starts.
                  TwoBitRecord =
                    Struct.new(:dna_size,
                               :n_block_intervals, :mask_block_intervals,
                               :reserved, :packed_dna_offset)

                  # Read cursor over an in-memory byte string.
                  class ByteQueue
                    # Current read position (byte offset into the string).
                    attr_accessor :index

                    # str:: the string to read from; reading starts at offset 0.
                    def initialize(str)
                      @str = str
                      @index = 0
                    end

                    # Returns the +n+ bytes at the current position and advances
                    # the position by +n+. The result is whatever String#[] gives
                    # for (index, n): it is shorter than +n+ near the end of the
                    # string and nil when the position is past the end.
                    def next(n)
                      result = @str[@index, n]
                      @index += n
                      result
                    end
                  end # class ByteQueue

                  # Retrieves reference sequence from a locally stored .2bit file.
                  # All state is class-level: call ReferenceSequence.load once,
                  # then query with find_by_interval / find_by_interval_raw.
                  class ReferenceSequence
                    # 2-bit code => nucleotide.
                    BINCODE = {0b00 => "T", 0b01 => "C", 0b10 => "A", 0b11 => "G"}

                    # Magic number stored in the first 32-bit word of a .2bit file.
                    SIGNATURE = 0x1A412743

                    # Class-level readers (also available on instances) for the
                    # loaded file name, its TwoBitHeader and the name => record
                    # offset index. They are nil until ReferenceSequence.load runs.
                    # Defined explicitly so that this file does not depend on
                    # ActiveSupport's cattr_reader being loaded beforehand.
                    @@filename = nil unless defined?(@@filename)
                    @@header   = nil unless defined?(@@header)
                    @@offsets  = nil unless defined?(@@offsets)

                    def self.filename; @@filename; end
                    def self.header;   @@header;   end
                    def self.offsets;  @@offsets;  end

                    def filename; @@filename; end
                    def header;   @@header;   end
                    def offsets;  @@offsets;  end

                    # Reads the whole .2bit file +filename+ into memory and parses
                    # its header and sequence index. Any previously cached records
                    # are discarded. Returns the (empty) record cache.
                    def self.load(filename)
                      # File.open rather than Kernel#open: the latter would run a
                      # command if the name started with "|".
                      two_bit = File.open(filename, 'rb') { |f| f.read }
                      @@tbq = ByteQueue.new(two_bit)
                      @@filename = filename

                      # The signature tells which byte order the file was written
                      # in; every later 32-bit field is decoded the same way.
                      signature_bytes = @@tbq.next(4)
                      @@uint32_format = detect_uint32_format(signature_bytes)

                      twobit_header = TwoBitHeader.new
                      twobit_header.signature      = signature_bytes.unpack(@@uint32_format).first
                      twobit_header.version        = read_uint32
                      twobit_header.sequence_count = read_uint32
                      twobit_header.reserved       = read_uint32
                      @@header = twobit_header

                      # Index entries: 1-byte name length, name, 32-bit record offset.
                      @@offsets = {}
                      @@header.sequence_count.times do
                        name_length = @@tbq.next(1).unpack('C').first
                        name = @@tbq.next(name_length)
                        @@offsets[name] = read_uint32
                      end
                      @@records = {}
                    end

                    # Returns the TwoBitRecord for sequence +chrom+, parsing it on
                    # first use and caching it afterwards. The record is cached
                    # only once it is complete, so a failed parse leaves no
                    # half-filled entry behind.
                    # NOTE(review): an unknown +chrom+ is not checked here; the nil
                    # offset fails inside ByteQueue#next.
                    def self.records(chrom)
                      cached = @@records[chrom]
                      return cached if cached

                      @@tbq.index = @@offsets[chrom]
                      record = TwoBitRecord.new
                      record.dna_size             = read_uint32
                      record.n_block_intervals    = read_block_intervals(chrom)
                      record.mask_block_intervals = read_block_intervals(chrom)
                      record.reserved             = read_uint32
                      record.packed_dna_offset    = @@tbq.index

                      @@records[chrom] = record
                    end

                    # Returns the sequence string for +interval+ with N-blocks
                    # applied on top of the raw 2-bit decoded bases.
                    #
                    # NOTE(review): the masking arithmetic is carried over
                    # unchanged. It relies on the meaning of
                    # Bio::GenomicInterval#compare / #overlap, which is not visible
                    # here. In particular the :contains branch overwrites both ends
                    # of the sequence rather than the span of the N-block; confirm
                    # against the interval library.
                    def self.find_by_interval(interval)
                      seq = find_by_interval_raw(interval)
                      @@records[interval.chrom].n_block_intervals.each do |nb|
                        next unless interval.overlapped?(nb)

                        case interval.compare(nb)
                        when :equal, :contained_by
                          seq = 'N' * interval.overlap(nb)
                        when :contains
                          left_len  = nb.chr_start - interval.chr_start + 1
                          right_len = interval.chr_end - nb.chr_end + 1
                          seq[0, left_len] = 'N' * left_len
                          seq[-right_len, right_len] = 'N' * right_len
                        when :left_overlapped
                          left_len = nb.chr_end - interval.chr_start + 1
                          seq[0, left_len] = 'N' * left_len
                        when :right_overlapped
                          right_len = interval.chr_end - nb.chr_start + 1
                          seq[-right_len, right_len] = 'N' * right_len
                        end
                      end
                      #Bio::Sequence::NA.new(seq)
                      seq
                    end

                    # Returns the bases of +interval+ (zero-based, half-open:
                    # zero_start...zero_end) decoded straight from the packed DNA,
                    # ignoring N-blocks and mask-blocks.
                    def self.find_by_interval_raw(interval)
                      chrom_top = records(interval.chrom).packed_dna_offset
                      first_byte, skip = interval.zero_start.divmod(4)

                      # Read exactly the bytes that hold the requested bases (four
                      # bases per byte). Reading one byte more and trimming from
                      # the right loses real bases when that extra byte lies past
                      # the end of the file.
                      byte_length = (interval.zero_end + 3) / 4 - first_byte
                      @@tbq.index = chrom_top + first_byte
                      bytes = @@tbq.next(byte_length).unpack('C*')

                      seq = bytes_to_nucleotides(bytes)
                      seq[skip, interval.zero_end - interval.zero_start]
                    end

                    # Decodes an array of byte values into one nucleotide string,
                    # four bases per byte.
                    def self.bytes_to_nucleotides(bytes)
                      bytes.map { |byte| byte_to_nucleotides(byte) }.join
                    end

                    # Decodes one byte value (0..255) into its four nucleotides,
                    # most significant bit pair first.
                    def self.byte_to_nucleotides(byte)
                      BINCODE[byte >> 6] +
                        BINCODE[(byte >> 4) & 0b11] +
                        BINCODE[(byte >> 2) & 0b11] +
                        BINCODE[byte & 0b11]
                    end

                    # Picks the String#unpack directive for 32-bit words: 'V'
                    # (little-endian) or 'N' (big-endian) according to which one
                    # decodes the signature correctly, else native 'L'.
                    def self.detect_uint32_format(signature_bytes)
                      %w[V N].find do |format|
                        signature_bytes.unpack(format).first == SIGNATURE
                      end || 'L'
                    end

                    # Reads one unsigned 32-bit word at the cursor.
                    def self.read_uint32
                      @@tbq.next(4).unpack(@@uint32_format).first
                    end

                    # Reads +count+ consecutive unsigned 32-bit words at the cursor.
                    def self.read_uint32s(count)
                      @@tbq.next(4 * count).unpack("#{@@uint32_format}#{count}")
                    end

                    # Reads a block table at the cursor (count, then all starts,
                    # then all sizes) and returns it as zero-based intervals on
                    # +chrom+.
                    def self.read_block_intervals(chrom)
                      count  = read_uint32
                      starts = read_uint32s(count)
                      sizes  = read_uint32s(count)
                      starts.zip(sizes).map do |start, size|
                        Bio::GenomicInterval.zero_based(chrom, start, start + size)
                      end
                    end

                    private_class_method :detect_uint32_format, :read_uint32,
                                         :read_uint32s, :read_block_intervals
                  end # class ReferenceSequence

                end # module Hg18
              end # module Ucsc
            end # module Bio
         | 
| @@ -2,7 +2,6 @@ | |
| 2 2 | 
             
            # = hg18/refgene.rb
         | 
| 3 3 | 
             
            # Copyright::
         | 
| 4 4 | 
             
            #  Copyright (C) 2011 MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp> 
         | 
| 5 | 
            -
            #  Copyright (C) 2008 Jan Aerts <jan.aerts@gmail.com>
         | 
| 6 5 | 
             
            # License::     The Ruby licence (Ruby's / GPLv2 dual)
         | 
| 7 6 | 
             
            #
         | 
| 8 7 | 
             
            # = Table description in UCSC Table Browser
         | 
    
        data/lib/bio-ucsc/hg18/snp130.rb
    CHANGED
    
    | @@ -2,7 +2,6 @@ | |
| 2 2 | 
             
            # = hg18/snp130.rb
         | 
| 3 3 | 
             
            # Copyright::
         | 
| 4 4 | 
             
            #  Copyright (C) 2011 MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp> 
         | 
| 5 | 
            -
            #  Copyright (C) 2008 Jan Aerts <jan.aerts@gmail.com>
         | 
| 6 5 | 
             
            # License::     The Ruby licence (Ruby's / GPLv2 dual)
         | 
| 7 6 | 
             
            #
         | 
| 8 7 | 
             
            # = Table description in UCSC Table Browser
         | 
    
        data/lib/bio-ucsc/hg19.rb
    CHANGED
    
    | @@ -16,7 +16,8 @@ module Bio | |
| 16 16 |  | 
| 17 17 | 
             
                  # Reference sequence retrieval via the 2bit file
         | 
| 18 18 | 
             
                  #
         | 
| 19 | 
            -
                  autoload :Reference, | 
| 19 | 
            +
                  autoload :Reference, "#{base}/reference" # OBSOLETE
         | 
| 20 | 
            +
                  autoload :ReferenceSequence, "#{base}/reference_sequence"
         | 
| 20 21 |  | 
| 21 22 | 
             
                  # group: Mapping and Sequencing Tracks ----------
         | 
| 22 23 | 
             
                  #
         | 
| @@ -3,7 +3,7 @@ | |
| 3 3 | 
             
            # 
         | 
| 4 4 | 
             
            # Copyright::
         | 
| 5 5 | 
             
            #   Copyright (C) 2011 MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
         | 
| 6 | 
            -
            #   Copyright (C)  | 
| 6 | 
            +
            #   Copyright (C) 2010 Jan Aerts <jan.aerts@gmail.com>
         | 
| 7 7 | 
             
            # License::     The Ruby licence (Ruby's / GPLv2 dual)
         | 
| 8 8 | 
             
            #
         | 
| 9 9 | 
             
            # = DESCRIPTION
         | 
| @@ -2,7 +2,6 @@ | |
| 2 2 | 
             
            # = hg19/cytoband.rb
         | 
| 3 3 | 
             
            # Copyright::
         | 
| 4 4 | 
             
            #  Copyright (C) 2011 MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp> 
         | 
| 5 | 
            -
            #  Copyright (C) 2008 Jan Aerts <jan.aerts@gmail.com>
         | 
| 6 5 | 
             
            # License::     The Ruby licence (Ruby's / GPLv2 dual)
         | 
| 7 6 | 
             
            #
         | 
| 8 7 | 
             
            # = Table description in UCSC Table Browser
         | 
| @@ -2,7 +2,6 @@ | |
| 2 2 | 
             
            # = hg19/cytobandideo.rb
         | 
| 3 3 | 
             
            # Copyright::
         | 
| 4 4 | 
             
            #  Copyright (C) 2011 MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp> 
         | 
| 5 | 
            -
            #  Copyright (C) 2008 Jan Aerts <jan.aerts@gmail.com>
         | 
| 6 5 | 
             
            # License::     The Ruby licence (Ruby's / GPLv2 dual)
         | 
| 7 6 | 
             
            #
         | 
| 8 7 | 
             
            # = Table description in UCSC Table Browser
         | 
    
        data/lib/bio-ucsc/hg19/dgv.rb
    CHANGED
    
    | @@ -2,7 +2,6 @@ | |
| 2 2 | 
             
            # = hg19/dgv.rb
         | 
| 3 3 | 
             
            # Copyright::
         | 
| 4 4 | 
             
            #  Copyright (C) 2011 MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp> 
         | 
| 5 | 
            -
            #  Copyright (C) 2008 Jan Aerts <jan.aerts@gmail.com>
         | 
| 6 5 | 
             
            # License::     The Ruby licence (Ruby's / GPLv2 dual)
         | 
| 7 6 | 
             
            #
         | 
| 8 7 | 
             
            # = Table description in UCSC Table Browser
         | 
| @@ -2,7 +2,6 @@ | |
| 2 2 | 
             
            # = hg19/ensgene.rb
         | 
| 3 3 | 
             
            # Copyright::
         | 
| 4 4 | 
             
            #  Copyright (C) 2011 MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp> 
         | 
| 5 | 
            -
            #  Copyright (C) 2008 Jan Aerts <jan.aerts@gmail.com>
         | 
| 6 5 | 
             
            # License::     The Ruby licence (Ruby's / GPLv2 dual)
         | 
| 7 6 | 
             
            #
         | 
| 8 7 | 
             
            # = Table description in UCSC Table Browser
         | 
| @@ -2,170 +2,11 @@ | |
| 2 2 | 
             
            # = reference.rb
         | 
| 3 3 | 
             
            # handle UCSC's 2bit file (locally stored) to retrieve the reference sequence
         | 
| 4 4 | 
             
            #
         | 
| 5 | 
            +
            # In version 0.1.0, this file is OBSOLETE.
         | 
| 6 | 
            +
            # Use Bio::Ucsc::Hg19::ReferenceSequence instead.
         | 
| 7 | 
            +
            #
         | 
| 5 8 | 
             
            # Copyright::   Copyright (C) 2011
         | 
| 6 9 | 
             
            #               MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp> 
         | 
| 7 10 | 
             
            # License::     Ruby license (Ruby's / GPLv2 dual)
         | 
| 8 11 |  | 
| 9 | 
            -
             | 
| 10 | 
            -
             | 
| 11 | 
            -
            module Bio
         | 
| 12 | 
            -
              module Ucsc
         | 
| 13 | 
            -
                module Hg19
         | 
| 14 | 
            -
                  TwoBitHeader = 
         | 
| 15 | 
            -
                    Struct.new(:signature, :version, :sequence_count, :reserved)
         | 
| 16 | 
            -
                  TwoBitRecord =
         | 
| 17 | 
            -
                    Struct.new(:dna_size,
         | 
| 18 | 
            -
                               :n_block_intervals, :mask_block_intervals,
         | 
| 19 | 
            -
                               :reserved, :packed_dna_offset)
         | 
| 20 | 
            -
             | 
| 21 | 
            -
                  class ByteQueue
         | 
| 22 | 
            -
                    def initialize(str)
         | 
| 23 | 
            -
                      @str = str
         | 
| 24 | 
            -
                      @index = 0
         | 
| 25 | 
            -
                    end
         | 
| 26 | 
            -
             | 
| 27 | 
            -
                    attr_accessor :index
         | 
| 28 | 
            -
             | 
| 29 | 
            -
                    def next(n)
         | 
| 30 | 
            -
                      result = @str[@index, n]
         | 
| 31 | 
            -
                      @index += n
         | 
| 32 | 
            -
                      result
         | 
| 33 | 
            -
                    end
         | 
| 34 | 
            -
                  end # class ByteQueue
         | 
| 35 | 
            -
             | 
| 36 | 
            -
                  class Reference
         | 
| 37 | 
            -
                    BINCODE = {0b00 => "T", 0b01 => "C", 0b10 => "A", 0b11 => "G"}
         | 
| 38 | 
            -
             | 
| 39 | 
            -
                    cattr_reader :filename, :header, :offsets
         | 
| 40 | 
            -
             | 
| 41 | 
            -
                    def self.load(filename)
         | 
| 42 | 
            -
                      two_bit = nil
         | 
| 43 | 
            -
                      open(filename, 'rb') {|f| two_bit = f.read}
         | 
| 44 | 
            -
                      @@tbq = ByteQueue.new(two_bit)
         | 
| 45 | 
            -
                      @@filename = filename
         | 
| 46 | 
            -
             | 
| 47 | 
            -
                      twobit_header = TwoBitHeader.new
         | 
| 48 | 
            -
                      twobit_header.signature      = @@tbq.next(4).unpack('L').first
         | 
| 49 | 
            -
                      twobit_header.version        = @@tbq.next(4).unpack('L').first
         | 
| 50 | 
            -
                      twobit_header.sequence_count = @@tbq.next(4).unpack('L').first
         | 
| 51 | 
            -
                      twobit_header.reserved       = @@tbq.next(4).unpack('L').first
         | 
| 52 | 
            -
                      @@header = twobit_header
         | 
| 53 | 
            -
             | 
| 54 | 
            -
                      @@offsets = Hash.new
         | 
| 55 | 
            -
                      @@header.sequence_count.times do
         | 
| 56 | 
            -
                        name_length = @@tbq.next(1).unpack('C').first
         | 
| 57 | 
            -
                        @@offsets[@@tbq.next(name_length).unpack('a*').first] =
         | 
| 58 | 
            -
                          @@tbq.next(4).unpack('L').first
         | 
| 59 | 
            -
                      end
         | 
| 60 | 
            -
                      @@records = Hash.new
         | 
| 61 | 
            -
                    end
         | 
| 62 | 
            -
             | 
| 63 | 
            -
                    def self.records(chrom)
         | 
| 64 | 
            -
                      return @@records[chrom] if @@records[chrom]
         | 
| 65 | 
            -
                      
         | 
| 66 | 
            -
                      @@tbq.index = @@offsets[chrom]
         | 
| 67 | 
            -
                      @@records[chrom] = TwoBitRecord.new
         | 
| 68 | 
            -
                      @@records[chrom].dna_size = @@tbq.next(4).unpack('L').first
         | 
| 69 | 
            -
             
         | 
| 70 | 
            -
                      n_block_count = @@tbq.next(4).unpack('L').first
         | 
| 71 | 
            -
                      n_block_starts = Array.new
         | 
| 72 | 
            -
                      n_block_count.times do
         | 
| 73 | 
            -
                        n_block_starts << @@tbq.next(4).unpack('L').first
         | 
| 74 | 
            -
                      end
         | 
| 75 | 
            -
                      n_block_sizes = Array.new
         | 
| 76 | 
            -
                      n_block_count.times do
         | 
| 77 | 
            -
                        n_block_sizes << @@tbq.next(4).unpack('L').first
         | 
| 78 | 
            -
                      end
         | 
| 79 | 
            -
                      @@records[chrom].n_block_intervals = Array.new
         | 
| 80 | 
            -
                      n_block_count.times do |idx|
         | 
| 81 | 
            -
                        @@records[chrom].n_block_intervals << 
         | 
| 82 | 
            -
                          Bio::GenomicInterval.zero_based(chrom,
         | 
| 83 | 
            -
                                                         n_block_starts[idx],
         | 
| 84 | 
            -
                                                         n_block_starts[idx]+n_block_sizes[idx])
         | 
| 85 | 
            -
                      end
         | 
| 86 | 
            -
                      
         | 
| 87 | 
            -
                      mask_block_count = @@tbq.next(4).unpack('L').first
         | 
| 88 | 
            -
                      mask_block_starts = Array.new
         | 
| 89 | 
            -
                      mask_block_count.times do
         | 
| 90 | 
            -
                        mask_block_starts << @@tbq.next(4).unpack('L').first
         | 
| 91 | 
            -
                      end
         | 
| 92 | 
            -
                      mask_block_sizes = Array.new
         | 
| 93 | 
            -
                      mask_block_count.times do
         | 
| 94 | 
            -
                        mask_block_sizes << @@tbq.next(4).unpack('L').first
         | 
| 95 | 
            -
                      end
         | 
| 96 | 
            -
                      @@records[chrom].mask_block_intervals = Array.new
         | 
| 97 | 
            -
                      mask_block_count.times do |idx|
         | 
| 98 | 
            -
                        @@records[chrom].mask_block_intervals << 
         | 
| 99 | 
            -
                          Bio::GenomicInterval.zero_based(chrom,
         | 
| 100 | 
            -
                                                         mask_block_starts[idx],
         | 
| 101 | 
            -
                                                         mask_block_starts[idx]+mask_block_sizes[idx])
         | 
| 102 | 
            -
                      end
         | 
| 103 | 
            -
              
         | 
| 104 | 
            -
                      @@records[chrom].reserved = @@tbq.next(4).unpack('L').first
         | 
| 105 | 
            -
                      @@records[chrom].packed_dna_offset = @@tbq.index
         | 
| 106 | 
            -
             | 
| 107 | 
            -
                      @@records[chrom]
         | 
| 108 | 
            -
                    end
         | 
| 109 | 
            -
             | 
| 110 | 
            -
                    def self.find_by_interval(interval)
         | 
| 111 | 
            -
                      seq = self.find_by_interval_raw(interval)
         | 
| 112 | 
            -
                      @@records[interval.chrom].n_block_intervals.map do |nb|
         | 
| 113 | 
            -
                        if interval.overlapped?(nb)
         | 
| 114 | 
            -
                          case interval.compare(nb)
         | 
| 115 | 
            -
                          when :equal,:contained_by
         | 
| 116 | 
            -
                            seq = 'N' * interval.overlap(nb)
         | 
| 117 | 
            -
                          when :contains
         | 
| 118 | 
            -
                            left_len  = nb.chr_start - interval.chr_start + 1
         | 
| 119 | 
            -
                            right_len = interval.chr_end - nb.chr_end + 1
         | 
| 120 | 
            -
                            seq[0, left_len] = 'N' * left_len
         | 
| 121 | 
            -
                            seq[-right_len, right_len] = 'N' * right_len
         | 
| 122 | 
            -
                          when :left_overlapped
         | 
| 123 | 
            -
                            left_len = nb.chr_end - interval.chr_start + 1
         | 
| 124 | 
            -
                            seq[0, left_len] = 'N' * left_len
         | 
| 125 | 
            -
                          when :right_overlapped
         | 
| 126 | 
            -
                            right_len = interval.chr_end - nb.chr_start + 1
         | 
| 127 | 
            -
                            seq[-right_len, right_len] = 'N' * right_len
         | 
| 128 | 
            -
                          when :right_adjacent, :right_off
         | 
| 129 | 
            -
                            # expecting that N-blocks are sorted
         | 
| 130 | 
            -
                            # return Bio::Sequence::NA.new(seq) 
         | 
| 131 | 
            -
                            seq
         | 
| 132 | 
            -
                          end
         | 
| 133 | 
            -
                        end
         | 
| 134 | 
            -
                      end
         | 
| 135 | 
            -
                      #Bio::Sequence::NA.new(seq)
         | 
| 136 | 
            -
                      seq
         | 
| 137 | 
            -
                    end
         | 
| 138 | 
            -
             | 
| 139 | 
            -
                    def self.find_by_interval_raw(interval)
         | 
| 140 | 
            -
                      byte_count, byte_mod = interval.zero_start.divmod 4
         | 
| 141 | 
            -
                      chrom_top = self.records(interval.chrom).packed_dna_offset
         | 
| 142 | 
            -
                      div_start, mod_start = interval.zero_start.divmod 4
         | 
| 143 | 
            -
                      div_end, mod_end     = interval.zero_end.divmod 4
         | 
| 144 | 
            -
                      div_len, mod_len     = interval.length.divmod 4
         | 
| 145 | 
            -
             | 
| 146 | 
            -
                      byte_length = div_end - div_start + 1
         | 
| 147 | 
            -
                      @@tbq.index = chrom_top + div_start
         | 
| 148 | 
            -
                      bytes = @@tbq.next(byte_length).unpack('C*')
         | 
| 149 | 
            -
                      seq = Bio::Ucsc::Hg19::Reference.bytes_to_nucleotides(bytes)
         | 
| 150 | 
            -
                      seq[mod_start..(-1-(4-mod_end))]
         | 
| 151 | 
            -
                    end
         | 
| 152 | 
            -
             | 
| 153 | 
            -
                    def self.bytes_to_nucleotides(bytes) 
         | 
| 154 | 
            -
                      results = ""
         | 
| 155 | 
            -
                      bytes.each do |byte|
         | 
| 156 | 
            -
                        results << Bio::Ucsc::Hg19::Reference.byte_to_nucleotides(byte)
         | 
| 157 | 
            -
                      end
         | 
| 158 | 
            -
                      results
         | 
| 159 | 
            -
                    end
         | 
| 160 | 
            -
             | 
| 161 | 
            -
                    def self.byte_to_nucleotides(byte)
         | 
| 162 | 
            -
                      BINCODE[byte >> 6] +
         | 
| 163 | 
            -
                        BINCODE[(byte >> 4) & 0b11] +
         | 
| 164 | 
            -
                        BINCODE[(byte >> 2) & 0b11] +
         | 
| 165 | 
            -
                        BINCODE[byte & 0b11]
         | 
| 166 | 
            -
                    end
         | 
| 167 | 
            -
                  end # class Reference
         | 
| 168 | 
            -
             | 
| 169 | 
            -
                end # module Hg19
         | 
| 170 | 
            -
              end # module Ucsc
         | 
| 171 | 
            -
            end # module Bio
         | 
| 12 | 
            +
            # Fail as soon as this file is loaded: Bio::Ucsc::Hg19::Reference has been
            # superseded by Bio::Ucsc::Hg19::ReferenceSequence (a String message makes
            # Kernel#raise throw a RuntimeError).
            raise "Bio::Ucsc::Hg19::Reference is OBSOLETE. Use Bio::Ucsc::Hg19::ReferenceSequence instead."
         | 
| @@ -0,0 +1,171 @@ | |
| 1 | 
            +
            #
         | 
| 2 | 
            +
            # = reference_sequence.rb
         | 
| 3 | 
            +
            # handle UCSC's 2bit file (locally stored) to retrieve the reference sequence
         | 
| 4 | 
            +
            #
         | 
| 5 | 
            +
            # Copyright::   Copyright (C) 2011
         | 
| 6 | 
            +
            #               MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp> 
         | 
| 7 | 
            +
            # License::     Ruby license (Ruby's / GPLv2 dual)
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            # require 'bio'
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            module Bio
         | 
| 12 | 
            +
              module Ucsc
         | 
| 13 | 
            +
                module Hg19
         | 
| 14 | 
            +
                  # The four header fields, in the order ReferenceSequence.load
                  # reads them from the start of the file.
                  TwoBitHeader = Struct.new(:signature, :version, :sequence_count, :reserved)

                  # Per-sequence record: the sequence size, its N-block and
                  # mask-block intervals, a reserved field, and the byte offset
                  # at which the packed DNA starts.
                  TwoBitRecord = Struct.new(
                    :dna_size,
                    :n_block_intervals,
                    :mask_block_intervals,
                    :reserved,
                    :packed_dna_offset
                  )
         | 
| 20 | 
            +
             | 
| 21 | 
            +
                  # Sequential reader over a String with a movable cursor.
                  class ByteQueue
                    # Current read position; assign to it to seek.
                    attr_accessor :index

                    # str:: the String to read from; the cursor starts at 0.
                    def initialize(str)
                      @index = 0
                      @str = str
                    end

                    # Returns the next +n+ bytes starting at the cursor (whatever
                    # String#[] yields for that start/length pair) and advances
                    # the cursor by +n+ regardless of how much was available.
                    def next(n)
                      start = @index
                      @index = start + n
                      @str[start, n]
                    end
                  end # class ByteQueue
         | 
| 35 | 
            +
             | 
| 36 | 
            +
                  # Reads nucleotide sequences out of a locally stored UCSC .2bit file.
                  #
                  # All state is class-level: call ReferenceSequence.load once, then
                  # query with find_by_interval / find_by_interval_raw.
                  class ReferenceSequence
                    # Maps each 2-bit code to its nucleotide.
                    BINCODE = {0b00 => "T", 0b01 => "C", 0b10 => "A", 0b11 => "G"}.freeze

                    # Magic number stored in the first four bytes of a .2bit file.
                    SIGNATURE = 0x1A412743

                    @@tbq      = nil  # ByteQueue over the whole file contents
                    @@filename = nil  # path passed to the last successful load
                    @@header   = nil  # TwoBitHeader of the loaded file
                    @@offsets  = nil  # sequence name => byte offset of its record
                    @@records  = nil  # sequence name => parsed TwoBitRecord (lazy cache)
                    @@uint32   = 'V'  # String#unpack directive for 32-bit fields

                    # Class-level and instance-level readers. These replace the
                    # ActiveSupport cattr_reader macro, which this file never required.
                    def self.filename; @@filename; end
                    def self.header;   @@header;   end
                    def self.offsets;  @@offsets;  end
                    def filename; @@filename; end
                    def header;   @@header;   end
                    def offsets;  @@offsets;  end

                    # Reads the whole 2bit file into memory and parses its header
                    # and sequence index. Per-sequence records are parsed lazily by
                    # ReferenceSequence.records.
                    #
                    # filename:: path to the .2bit file
                    #
                    # Raises ArgumentError when the file does not start with the
                    # 2bit signature in either byte order.
                    # Returns the (empty) record cache.
                    def self.load(filename)
                      # File.open rather than Kernel#open: the latter would execute
                      # a command if the path started with "|".
                      two_bit = File.open(filename, 'rb') { |f| f.read }
                      directive = uint32_directive_for(two_bit)

                      @@tbq = ByteQueue.new(two_bit)
                      @@filename = filename
                      @@uint32 = directive

                      twobit_header = TwoBitHeader.new
                      twobit_header.signature      = read_uint32
                      twobit_header.version        = read_uint32
                      twobit_header.sequence_count = read_uint32
                      twobit_header.reserved       = read_uint32
                      @@header = twobit_header

                      @@offsets = {}
                      @@header.sequence_count.times do
                        name_length = @@tbq.next(1).unpack('C').first
                        name = @@tbq.next(name_length).unpack('a*').first
                        @@offsets[name] = read_uint32
                      end
                      @@records = {}
                    end

                    # Returns the TwoBitRecord for +chrom+, parsing it on first use.
                    #
                    # The record is cached only after it has been parsed completely,
                    # so a failed lookup never leaves a half-built entry behind.
                    #
                    # Raises RuntimeError when no file has been loaded and
                    # ArgumentError when +chrom+ is not in the file's index.
                    def self.records(chrom)
                      raise "2bit file is not loaded; call ReferenceSequence.load first" unless @@tbq

                      cached = @@records[chrom]
                      return cached if cached

                      offset = @@offsets[chrom]
                      raise ArgumentError, "unknown sequence name: #{chrom.inspect}" unless offset

                      @@tbq.index = offset
                      record = TwoBitRecord.new
                      record.dna_size             = read_uint32
                      record.n_block_intervals    = read_block_intervals(chrom)
                      record.mask_block_intervals = read_block_intervals(chrom)
                      record.reserved             = read_uint32
                      record.packed_dna_offset    = @@tbq.index

                      @@records[chrom] = record
                    end

                    # Returns the sequence covered by +interval+ as a String, with
                    # every base that falls inside one of the sequence's N-blocks
                    # replaced by "N".
                    #
                    # The overlap with each N-block is computed directly from the
                    # 1-based inclusive chr_start/chr_end coordinates, so an N-block
                    # in the middle of the interval masks only its own positions.
                    def self.find_by_interval(interval)
                      seq = find_by_interval_raw(interval)
                      first = interval.chr_start
                      last  = interval.chr_end

                      @@records[interval.chrom].n_block_intervals.each do |nb|
                        overlap_start = [first, nb.chr_start].max
                        overlap_end   = [last, nb.chr_end].min
                        next if overlap_start > overlap_end

                        overlap_len = overlap_end - overlap_start + 1
                        seq[overlap_start - first, overlap_len] = 'N' * overlap_len
                      end
                      seq
                    end

                    # Returns the packed bases covered by +interval+ as a String of
                    # T/C/A/G, without applying N-blocks.
                    #
                    # Uses the interval's zero_start (inclusive) and zero_end
                    # (exclusive) coordinates. Only the bytes that hold requested
                    # bases are read, and the result is cut out with a positive
                    # offset/length so a read ending at the end of the file cannot
                    # shift the slice.
                    def self.find_by_interval_raw(interval)
                      chrom_top = records(interval.chrom).packed_dna_offset
                      z_start = interval.zero_start
                      z_end   = interval.zero_end

                      first_byte, skip = z_start.divmod(4)
                      byte_length = (z_end + 3) / 4 - first_byte # ceil(z_end / 4) - first_byte

                      @@tbq.index = chrom_top + first_byte
                      bytes = @@tbq.next(byte_length).unpack('C*')
                      bytes_to_nucleotides(bytes)[skip, z_end - z_start]
                    end

                    # Decodes an Array of byte values into one String, four
                    # nucleotides per byte.
                    def self.bytes_to_nucleotides(bytes)
                      bytes.inject("") { |acc, byte| acc << byte_to_nucleotides(byte) }
                    end

                    # Decodes one byte value (0..255) into four nucleotides, most
                    # significant bit pair first.
                    def self.byte_to_nucleotides(byte)
                      BINCODE[byte >> 6] +
                        BINCODE[(byte >> 4) & 0b11] +
                        BINCODE[(byte >> 2) & 0b11] +
                        BINCODE[byte & 0b11]
                    end

                    # Picks the unpack directive for 32-bit fields from the file's
                    # first four bytes: 'V' (little-endian) or 'N' (big-endian),
                    # whichever makes them equal SIGNATURE.
                    def self.uint32_directive_for(data)
                      magic = data[0, 4].to_s
                      return 'V' if magic.unpack('V').first == SIGNATURE
                      return 'N' if magic.unpack('N').first == SIGNATURE
                      raise ArgumentError, "not a 2bit file (signature mismatch)"
                    end

                    # Reads one 32-bit unsigned integer at the cursor.
                    def self.read_uint32
                      @@tbq.next(4).unpack(@@uint32).first
                    end

                    # Reads +count+ consecutive 32-bit unsigned integers at the cursor.
                    def self.read_uint32_array(count)
                      @@tbq.next(4 * count).unpack(@@uint32 + '*')
                    end

                    # Reads one block table at the cursor (a count, then that many
                    # starts, then that many sizes) and returns it as an Array of
                    # zero-based Bio::GenomicInterval objects on +chrom+.
                    def self.read_block_intervals(chrom)
                      count  = read_uint32
                      starts = read_uint32_array(count)
                      sizes  = read_uint32_array(count)
                      starts.zip(sizes).map do |start, size|
                        Bio::GenomicInterval.zero_based(chrom, start, start + size)
                      end
                    end

                    private_class_method :uint32_directive_for, :read_uint32,
                                         :read_uint32_array, :read_block_intervals
                  end # class ReferenceSequence
         | 
| 168 | 
            +
             | 
| 169 | 
            +
                end # module Hg19
         | 
| 170 | 
            +
              end # module Ucsc
         | 
| 171 | 
            +
            end # module Bio
         |