bio-ucsc-api 0.0.5 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +36 -16
- data/Rakefile +3 -3
- data/VERSION +1 -1
- data/bio-ucsc-api.gemspec +9 -7
- data/lib/bio-ucsc/hg18.rb +2 -4
- data/lib/bio-ucsc/hg18/activerecord.rb +1 -1
- data/lib/bio-ucsc/hg18/all_bacends.rb +0 -1
- data/lib/bio-ucsc/hg18/cnpiafrate2.rb +0 -1
- data/lib/bio-ucsc/hg18/cnplocke.rb +0 -1
- data/lib/bio-ucsc/hg18/cnpredon.rb +0 -1
- data/lib/bio-ucsc/hg18/cnpsebat2.rb +0 -1
- data/lib/bio-ucsc/hg18/cnpsharp2.rb +0 -1
- data/lib/bio-ucsc/hg18/cnptuzun.rb +0 -1
- data/lib/bio-ucsc/hg18/cytoband.rb +0 -1
- data/lib/bio-ucsc/hg18/db_connection.rb +1 -1
- data/lib/bio-ucsc/hg18/delconrad2.rb +0 -1
- data/lib/bio-ucsc/hg18/delhinds2.rb +0 -1
- data/lib/bio-ucsc/hg18/delmccarroll.rb +0 -1
- data/lib/bio-ucsc/hg18/dgv.rb +0 -1
- data/lib/bio-ucsc/hg18/ensgene.rb +0 -1
- data/lib/bio-ucsc/hg18/exaptedrepeats.rb +0 -1
- data/lib/bio-ucsc/hg18/hgcentral_wikitrack.rb +0 -1
- data/lib/bio-ucsc/hg18/kgprotmap2.rb +0 -1
- data/lib/bio-ucsc/hg18/kgtargetali.rb +0 -1
- data/lib/bio-ucsc/hg18/kiddeichlerdiscabc10.rb +0 -1
- data/lib/bio-ucsc/hg18/kiddeichlerdiscabc11.rb +0 -1
- data/lib/bio-ucsc/hg18/kiddeichlerdiscabc12.rb +0 -1
- data/lib/bio-ucsc/hg18/kiddeichlerdiscabc13.rb +0 -1
- data/lib/bio-ucsc/hg18/kiddeichlerdiscabc14.rb +0 -1
- data/lib/bio-ucsc/hg18/kiddeichlerdiscabc8.rb +0 -1
- data/lib/bio-ucsc/hg18/kiddeichlerdiscabc9.rb +0 -1
- data/lib/bio-ucsc/hg18/kiddeichlerdiscg248.rb +0 -1
- data/lib/bio-ucsc/hg18/reference.rb +4 -163
- data/lib/bio-ucsc/hg18/reference_sequence.rb +171 -0
- data/lib/bio-ucsc/hg18/refgene.rb +0 -1
- data/lib/bio-ucsc/hg18/snp130.rb +0 -1
- data/lib/bio-ucsc/hg19.rb +2 -1
- data/lib/bio-ucsc/hg19/activerecord.rb +1 -1
- data/lib/bio-ucsc/hg19/cytoband.rb +0 -1
- data/lib/bio-ucsc/hg19/cytobandideo.rb +0 -1
- data/lib/bio-ucsc/hg19/db_connection.rb +1 -1
- data/lib/bio-ucsc/hg19/dgv.rb +0 -1
- data/lib/bio-ucsc/hg19/ensgene.rb +0 -1
- data/lib/bio-ucsc/hg19/reference.rb +4 -163
- data/lib/bio-ucsc/hg19/reference_sequence.rb +171 -0
- data/lib/bio-ucsc/hg19/refgene.rb +0 -1
- data/lib/bio-ucsc/hg19/snp131.rb +0 -1
- data/lib/bio-ucsc/hg19/wgrna.rb +0 -1
- data/samples/hg19-2bit-retrieve.rb +2 -2
- data/spec/hg18/{reference_spec.rb → reference_sequence_spec.rb} +26 -26
- data/spec/hg19/{reference_spec.rb → reference_sequence_spec.rb} +26 -26
- metadata +11 -10
@@ -0,0 +1,171 @@
|
|
1
|
+
#
|
2
|
+
# = reference_sequence.rb
|
3
|
+
# handle UCSC's 2bit file (locally stored) to retrieve the reference sequence
|
4
|
+
#
|
5
|
+
# Copyright:: Copyright (C) 2011
|
6
|
+
# MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
|
7
|
+
# License:: Ruby license (Ruby's / GPLv2 dual)
|
8
|
+
|
9
|
+
# require 'bio'
|
10
|
+
|
11
|
+
module Bio
|
12
|
+
module Ucsc
|
13
|
+
module Hg18
|
14
|
+
TwoBitHeader =
|
15
|
+
Struct.new(:signature, :version, :sequence_count, :reserved)
|
16
|
+
TwoBitRecord =
|
17
|
+
Struct.new(:dna_size,
|
18
|
+
:n_block_intervals, :mask_block_intervals,
|
19
|
+
:reserved, :packed_dna_offset)
|
20
|
+
|
21
|
+
class ByteQueue
|
22
|
+
def initialize(str)
|
23
|
+
@str = str
|
24
|
+
@index = 0
|
25
|
+
end
|
26
|
+
|
27
|
+
attr_accessor :index
|
28
|
+
|
29
|
+
def next(n)
|
30
|
+
result = @str[@index, n]
|
31
|
+
@index += n
|
32
|
+
result
|
33
|
+
end
|
34
|
+
end # class ByteQueue
|
35
|
+
|
36
|
+
class ReferenceSequence
|
37
|
+
BINCODE = {0b00 => "T", 0b01 => "C", 0b10 => "A", 0b11 => "G"}
|
38
|
+
|
39
|
+
cattr_reader :filename, :header, :offsets
|
40
|
+
|
41
|
+
def self.load(filename)
|
42
|
+
two_bit = nil
|
43
|
+
open(filename, 'rb') {|f| two_bit = f.read}
|
44
|
+
@@tbq = ByteQueue.new(two_bit)
|
45
|
+
@@filename = filename
|
46
|
+
|
47
|
+
twobit_header = TwoBitHeader.new
|
48
|
+
twobit_header.signature = @@tbq.next(4).unpack('L').first
|
49
|
+
twobit_header.version = @@tbq.next(4).unpack('L').first
|
50
|
+
twobit_header.sequence_count = @@tbq.next(4).unpack('L').first
|
51
|
+
twobit_header.reserved = @@tbq.next(4).unpack('L').first
|
52
|
+
@@header = twobit_header
|
53
|
+
|
54
|
+
@@offsets = Hash.new
|
55
|
+
@@header.sequence_count.times do
|
56
|
+
name_length = @@tbq.next(1).unpack('C').first
|
57
|
+
@@offsets[@@tbq.next(name_length).unpack('a*').first] =
|
58
|
+
@@tbq.next(4).unpack('L').first
|
59
|
+
end
|
60
|
+
@@records = Hash.new
|
61
|
+
end
|
62
|
+
|
63
|
+
def self.records(chrom)
|
64
|
+
return @@records[chrom] if @@records[chrom]
|
65
|
+
|
66
|
+
@@tbq.index = @@offsets[chrom]
|
67
|
+
@@records[chrom] = TwoBitRecord.new
|
68
|
+
@@records[chrom].dna_size = @@tbq.next(4).unpack('L').first
|
69
|
+
|
70
|
+
n_block_count = @@tbq.next(4).unpack('L').first
|
71
|
+
n_block_starts = Array.new
|
72
|
+
n_block_count.times do
|
73
|
+
n_block_starts << @@tbq.next(4).unpack('L').first
|
74
|
+
end
|
75
|
+
n_block_sizes = Array.new
|
76
|
+
n_block_count.times do
|
77
|
+
n_block_sizes << @@tbq.next(4).unpack('L').first
|
78
|
+
end
|
79
|
+
@@records[chrom].n_block_intervals = Array.new
|
80
|
+
n_block_count.times do |idx|
|
81
|
+
@@records[chrom].n_block_intervals <<
|
82
|
+
Bio::GenomicInterval.zero_based(chrom,
|
83
|
+
n_block_starts[idx],
|
84
|
+
n_block_starts[idx]+n_block_sizes[idx])
|
85
|
+
end
|
86
|
+
|
87
|
+
mask_block_count = @@tbq.next(4).unpack('L').first
|
88
|
+
mask_block_starts = Array.new
|
89
|
+
mask_block_count.times do
|
90
|
+
mask_block_starts << @@tbq.next(4).unpack('L').first
|
91
|
+
end
|
92
|
+
mask_block_sizes = Array.new
|
93
|
+
mask_block_count.times do
|
94
|
+
mask_block_sizes << @@tbq.next(4).unpack('L').first
|
95
|
+
end
|
96
|
+
@@records[chrom].mask_block_intervals = Array.new
|
97
|
+
mask_block_count.times do |idx|
|
98
|
+
@@records[chrom].mask_block_intervals <<
|
99
|
+
Bio::GenomicInterval.zero_based(chrom,
|
100
|
+
mask_block_starts[idx],
|
101
|
+
mask_block_starts[idx]+mask_block_sizes[idx])
|
102
|
+
end
|
103
|
+
|
104
|
+
@@records[chrom].reserved = @@tbq.next(4).unpack('L').first
|
105
|
+
@@records[chrom].packed_dna_offset = @@tbq.index
|
106
|
+
|
107
|
+
@@records[chrom]
|
108
|
+
end
|
109
|
+
|
110
|
+
def self.find_by_interval(interval)
|
111
|
+
seq = self.find_by_interval_raw(interval)
|
112
|
+
@@records[interval.chrom].n_block_intervals.map do |nb|
|
113
|
+
if interval.overlapped?(nb)
|
114
|
+
case interval.compare(nb)
|
115
|
+
when :equal,:contained_by
|
116
|
+
seq = 'N' * interval.overlap(nb)
|
117
|
+
when :contains
|
118
|
+
left_len = nb.chr_start - interval.chr_start + 1
|
119
|
+
right_len = interval.chr_end - nb.chr_end + 1
|
120
|
+
seq[0, left_len] = 'N' * left_len
|
121
|
+
seq[-right_len, right_len] = 'N' * right_len
|
122
|
+
when :left_overlapped
|
123
|
+
left_len = nb.chr_end - interval.chr_start + 1
|
124
|
+
seq[0, left_len] = 'N' * left_len
|
125
|
+
when :right_overlapped
|
126
|
+
right_len = interval.chr_end - nb.chr_start + 1
|
127
|
+
seq[-right_len, right_len] = 'N' * right_len
|
128
|
+
when :right_adjacent, :right_off
|
129
|
+
# expecting that N-blocks are sorted
|
130
|
+
# return Bio::Sequence::NA.new(seq)
|
131
|
+
seq
|
132
|
+
end
|
133
|
+
end
|
134
|
+
end
|
135
|
+
#Bio::Sequence::NA.new(seq)
|
136
|
+
seq
|
137
|
+
end
|
138
|
+
|
139
|
+
def self.find_by_interval_raw(interval)
|
140
|
+
byte_count, byte_mod = interval.zero_start.divmod 4
|
141
|
+
chrom_top = self.records(interval.chrom).packed_dna_offset
|
142
|
+
div_start, mod_start = interval.zero_start.divmod 4
|
143
|
+
div_end, mod_end = interval.zero_end.divmod 4
|
144
|
+
div_len, mod_len = interval.length.divmod 4
|
145
|
+
|
146
|
+
byte_length = div_end - div_start + 1
|
147
|
+
@@tbq.index = chrom_top + div_start
|
148
|
+
bytes = @@tbq.next(byte_length).unpack('C*')
|
149
|
+
seq = Bio::Ucsc::Hg18::ReferenceSequence.bytes_to_nucleotides(bytes)
|
150
|
+
seq[mod_start..(-1-(4-mod_end))]
|
151
|
+
end
|
152
|
+
|
153
|
+
def self.bytes_to_nucleotides(bytes)
|
154
|
+
results = ""
|
155
|
+
bytes.each do |byte|
|
156
|
+
results << Bio::Ucsc::Hg18::ReferenceSequence.byte_to_nucleotides(byte)
|
157
|
+
end
|
158
|
+
results
|
159
|
+
end
|
160
|
+
|
161
|
+
def self.byte_to_nucleotides(byte)
|
162
|
+
BINCODE[byte >> 6] +
|
163
|
+
BINCODE[(byte >> 4) & 0b11] +
|
164
|
+
BINCODE[(byte >> 2) & 0b11] +
|
165
|
+
BINCODE[byte & 0b11]
|
166
|
+
end
|
167
|
+
end # class ReferenceSequence
|
168
|
+
|
169
|
+
end # module Hg18
|
170
|
+
end # module Ucsc
|
171
|
+
end # module Bio
|
@@ -2,7 +2,6 @@
|
|
2
2
|
# = hg18/refgene.rb
|
3
3
|
# Copyright::
|
4
4
|
# Copyright (C) 2011 MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
|
5
|
-
# Copyright (C) 2008 Jan Aerts <jan.aerts@gmail.com>
|
6
5
|
# License:: The Ruby licence (Ruby's / GPLv2 dual)
|
7
6
|
#
|
8
7
|
# = Table description in UCSC Table Browser
|
data/lib/bio-ucsc/hg18/snp130.rb
CHANGED
@@ -2,7 +2,6 @@
|
|
2
2
|
# = hg18/snp130.rb
|
3
3
|
# Copyright::
|
4
4
|
# Copyright (C) 2011 MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
|
5
|
-
# Copyright (C) 2008 Jan Aerts <jan.aerts@gmail.com>
|
6
5
|
# License:: The Ruby licence (Ruby's / GPLv2 dual)
|
7
6
|
#
|
8
7
|
# = Table description in UCSC Table Browser
|
data/lib/bio-ucsc/hg19.rb
CHANGED
@@ -16,7 +16,8 @@ module Bio
|
|
16
16
|
|
17
17
|
# Reference sequence retrieval via the 2bit file
|
18
18
|
#
|
19
|
-
autoload :Reference,
|
19
|
+
autoload :Reference, "#{base}/reference" # OBSOLETE
|
20
|
+
autoload :ReferenceSequence, "#{base}/reference_sequence"
|
20
21
|
|
21
22
|
# group: Mapping and Sequencing Tracks ----------
|
22
23
|
#
|
@@ -3,7 +3,7 @@
|
|
3
3
|
#
|
4
4
|
# Copyright::
|
5
5
|
# Copyright (C) 2011 MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
|
6
|
-
# Copyright (C)
|
6
|
+
# Copyright (C) 2010 Jan Aerts <jan.aerts@gmail.com>
|
7
7
|
# License:: The Ruby licence (Ruby's / GPLv2 dual)
|
8
8
|
#
|
9
9
|
# = DESCRIPTION
|
@@ -2,7 +2,6 @@
|
|
2
2
|
# = hg19/cytoband.rb
|
3
3
|
# Copyright::
|
4
4
|
# Copyright (C) 2011 MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
|
5
|
-
# Copyright (C) 2008 Jan Aerts <jan.aerts@gmail.com>
|
6
5
|
# License:: The Ruby licence (Ruby's / GPLv2 dual)
|
7
6
|
#
|
8
7
|
# = Table description in UCSC Table Browser
|
@@ -2,7 +2,6 @@
|
|
2
2
|
# = hg19/cytobandideo.rb
|
3
3
|
# Copyright::
|
4
4
|
# Copyright (C) 2011 MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
|
5
|
-
# Copyright (C) 2008 Jan Aerts <jan.aerts@gmail.com>
|
6
5
|
# License:: The Ruby licence (Ruby's / GPLv2 dual)
|
7
6
|
#
|
8
7
|
# = Table description in UCSC Table Browser
|
data/lib/bio-ucsc/hg19/dgv.rb
CHANGED
@@ -2,7 +2,6 @@
|
|
2
2
|
# = hg19/dgv.rb
|
3
3
|
# Copyright::
|
4
4
|
# Copyright (C) 2011 MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
|
5
|
-
# Copyright (C) 2008 Jan Aerts <jan.aerts@gmail.com>
|
6
5
|
# License:: The Ruby licence (Ruby's / GPLv2 dual)
|
7
6
|
#
|
8
7
|
# = Table description in UCSC Table Browser
|
@@ -2,7 +2,6 @@
|
|
2
2
|
# = hg19/ensgene.rb
|
3
3
|
# Copyright::
|
4
4
|
# Copyright (C) 2011 MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
|
5
|
-
# Copyright (C) 2008 Jan Aerts <jan.aerts@gmail.com>
|
6
5
|
# License:: The Ruby licence (Ruby's / GPLv2 dual)
|
7
6
|
#
|
8
7
|
# = Table description in UCSC Table Browser
|
@@ -2,170 +2,11 @@
|
|
2
2
|
# = reference.rb
|
3
3
|
# handle UCSC's 2bit file (locally stored) to retrieve the reference sequence
|
4
4
|
#
|
5
|
+
# In version 0.1.0, this file is OBSOLETE.
|
6
|
+
# Use Ucsc::Hg18::ReferenceSequence instead.
|
7
|
+
#
|
5
8
|
# Copyright:: Copyright (C) 2011
|
6
9
|
# MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
|
7
10
|
# License:: Ruby license (Ruby's / GPLv2 dual)
|
8
11
|
|
9
|
-
|
10
|
-
|
11
|
-
module Bio
|
12
|
-
module Ucsc
|
13
|
-
module Hg19
|
14
|
-
TwoBitHeader =
|
15
|
-
Struct.new(:signature, :version, :sequence_count, :reserved)
|
16
|
-
TwoBitRecord =
|
17
|
-
Struct.new(:dna_size,
|
18
|
-
:n_block_intervals, :mask_block_intervals,
|
19
|
-
:reserved, :packed_dna_offset)
|
20
|
-
|
21
|
-
class ByteQueue
|
22
|
-
def initialize(str)
|
23
|
-
@str = str
|
24
|
-
@index = 0
|
25
|
-
end
|
26
|
-
|
27
|
-
attr_accessor :index
|
28
|
-
|
29
|
-
def next(n)
|
30
|
-
result = @str[@index, n]
|
31
|
-
@index += n
|
32
|
-
result
|
33
|
-
end
|
34
|
-
end # class ByteQueue
|
35
|
-
|
36
|
-
class Reference
|
37
|
-
BINCODE = {0b00 => "T", 0b01 => "C", 0b10 => "A", 0b11 => "G"}
|
38
|
-
|
39
|
-
cattr_reader :filename, :header, :offsets
|
40
|
-
|
41
|
-
def self.load(filename)
|
42
|
-
two_bit = nil
|
43
|
-
open(filename, 'rb') {|f| two_bit = f.read}
|
44
|
-
@@tbq = ByteQueue.new(two_bit)
|
45
|
-
@@filename = filename
|
46
|
-
|
47
|
-
twobit_header = TwoBitHeader.new
|
48
|
-
twobit_header.signature = @@tbq.next(4).unpack('L').first
|
49
|
-
twobit_header.version = @@tbq.next(4).unpack('L').first
|
50
|
-
twobit_header.sequence_count = @@tbq.next(4).unpack('L').first
|
51
|
-
twobit_header.reserved = @@tbq.next(4).unpack('L').first
|
52
|
-
@@header = twobit_header
|
53
|
-
|
54
|
-
@@offsets = Hash.new
|
55
|
-
@@header.sequence_count.times do
|
56
|
-
name_length = @@tbq.next(1).unpack('C').first
|
57
|
-
@@offsets[@@tbq.next(name_length).unpack('a*').first] =
|
58
|
-
@@tbq.next(4).unpack('L').first
|
59
|
-
end
|
60
|
-
@@records = Hash.new
|
61
|
-
end
|
62
|
-
|
63
|
-
def self.records(chrom)
|
64
|
-
return @@records[chrom] if @@records[chrom]
|
65
|
-
|
66
|
-
@@tbq.index = @@offsets[chrom]
|
67
|
-
@@records[chrom] = TwoBitRecord.new
|
68
|
-
@@records[chrom].dna_size = @@tbq.next(4).unpack('L').first
|
69
|
-
|
70
|
-
n_block_count = @@tbq.next(4).unpack('L').first
|
71
|
-
n_block_starts = Array.new
|
72
|
-
n_block_count.times do
|
73
|
-
n_block_starts << @@tbq.next(4).unpack('L').first
|
74
|
-
end
|
75
|
-
n_block_sizes = Array.new
|
76
|
-
n_block_count.times do
|
77
|
-
n_block_sizes << @@tbq.next(4).unpack('L').first
|
78
|
-
end
|
79
|
-
@@records[chrom].n_block_intervals = Array.new
|
80
|
-
n_block_count.times do |idx|
|
81
|
-
@@records[chrom].n_block_intervals <<
|
82
|
-
Bio::GenomicInterval.zero_based(chrom,
|
83
|
-
n_block_starts[idx],
|
84
|
-
n_block_starts[idx]+n_block_sizes[idx])
|
85
|
-
end
|
86
|
-
|
87
|
-
mask_block_count = @@tbq.next(4).unpack('L').first
|
88
|
-
mask_block_starts = Array.new
|
89
|
-
mask_block_count.times do
|
90
|
-
mask_block_starts << @@tbq.next(4).unpack('L').first
|
91
|
-
end
|
92
|
-
mask_block_sizes = Array.new
|
93
|
-
mask_block_count.times do
|
94
|
-
mask_block_sizes << @@tbq.next(4).unpack('L').first
|
95
|
-
end
|
96
|
-
@@records[chrom].mask_block_intervals = Array.new
|
97
|
-
mask_block_count.times do |idx|
|
98
|
-
@@records[chrom].mask_block_intervals <<
|
99
|
-
Bio::GenomicInterval.zero_based(chrom,
|
100
|
-
mask_block_starts[idx],
|
101
|
-
mask_block_starts[idx]+mask_block_sizes[idx])
|
102
|
-
end
|
103
|
-
|
104
|
-
@@records[chrom].reserved = @@tbq.next(4).unpack('L').first
|
105
|
-
@@records[chrom].packed_dna_offset = @@tbq.index
|
106
|
-
|
107
|
-
@@records[chrom]
|
108
|
-
end
|
109
|
-
|
110
|
-
def self.find_by_interval(interval)
|
111
|
-
seq = self.find_by_interval_raw(interval)
|
112
|
-
@@records[interval.chrom].n_block_intervals.map do |nb|
|
113
|
-
if interval.overlapped?(nb)
|
114
|
-
case interval.compare(nb)
|
115
|
-
when :equal,:contained_by
|
116
|
-
seq = 'N' * interval.overlap(nb)
|
117
|
-
when :contains
|
118
|
-
left_len = nb.chr_start - interval.chr_start + 1
|
119
|
-
right_len = interval.chr_end - nb.chr_end + 1
|
120
|
-
seq[0, left_len] = 'N' * left_len
|
121
|
-
seq[-right_len, right_len] = 'N' * right_len
|
122
|
-
when :left_overlapped
|
123
|
-
left_len = nb.chr_end - interval.chr_start + 1
|
124
|
-
seq[0, left_len] = 'N' * left_len
|
125
|
-
when :right_overlapped
|
126
|
-
right_len = interval.chr_end - nb.chr_start + 1
|
127
|
-
seq[-right_len, right_len] = 'N' * right_len
|
128
|
-
when :right_adjacent, :right_off
|
129
|
-
# expecting that N-blocks are sorted
|
130
|
-
# return Bio::Sequence::NA.new(seq)
|
131
|
-
seq
|
132
|
-
end
|
133
|
-
end
|
134
|
-
end
|
135
|
-
#Bio::Sequence::NA.new(seq)
|
136
|
-
seq
|
137
|
-
end
|
138
|
-
|
139
|
-
def self.find_by_interval_raw(interval)
|
140
|
-
byte_count, byte_mod = interval.zero_start.divmod 4
|
141
|
-
chrom_top = self.records(interval.chrom).packed_dna_offset
|
142
|
-
div_start, mod_start = interval.zero_start.divmod 4
|
143
|
-
div_end, mod_end = interval.zero_end.divmod 4
|
144
|
-
div_len, mod_len = interval.length.divmod 4
|
145
|
-
|
146
|
-
byte_length = div_end - div_start + 1
|
147
|
-
@@tbq.index = chrom_top + div_start
|
148
|
-
bytes = @@tbq.next(byte_length).unpack('C*')
|
149
|
-
seq = Bio::Ucsc::Hg19::Reference.bytes_to_nucleotides(bytes)
|
150
|
-
seq[mod_start..(-1-(4-mod_end))]
|
151
|
-
end
|
152
|
-
|
153
|
-
def self.bytes_to_nucleotides(bytes)
|
154
|
-
results = ""
|
155
|
-
bytes.each do |byte|
|
156
|
-
results << Bio::Ucsc::Hg19::Reference.byte_to_nucleotides(byte)
|
157
|
-
end
|
158
|
-
results
|
159
|
-
end
|
160
|
-
|
161
|
-
def self.byte_to_nucleotides(byte)
|
162
|
-
BINCODE[byte >> 6] +
|
163
|
-
BINCODE[(byte >> 4) & 0b11] +
|
164
|
-
BINCODE[(byte >> 2) & 0b11] +
|
165
|
-
BINCODE[byte & 0b11]
|
166
|
-
end
|
167
|
-
end # class Reference
|
168
|
-
|
169
|
-
end # module Hg19
|
170
|
-
end # module Ucsc
|
171
|
-
end # module Bio
|
12
|
+
raise "Bio::Ucsc::Hg19::Reference is OBSOLETE. Use Bio::Ucsc::Hg19::ReferenceSequence instead."
|
@@ -0,0 +1,171 @@
|
|
1
|
+
#
|
2
|
+
# = reference_sequence.rb
|
3
|
+
# handle UCSC's 2bit file (locally stored) to retrieve the reference sequence
|
4
|
+
#
|
5
|
+
# Copyright:: Copyright (C) 2011
|
6
|
+
# MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
|
7
|
+
# License:: Ruby license (Ruby's / GPLv2 dual)
|
8
|
+
|
9
|
+
# require 'bio'
|
10
|
+
|
11
|
+
module Bio
|
12
|
+
module Ucsc
|
13
|
+
module Hg19
|
14
|
+
TwoBitHeader =
|
15
|
+
Struct.new(:signature, :version, :sequence_count, :reserved)
|
16
|
+
TwoBitRecord =
|
17
|
+
Struct.new(:dna_size,
|
18
|
+
:n_block_intervals, :mask_block_intervals,
|
19
|
+
:reserved, :packed_dna_offset)
|
20
|
+
|
21
|
+
class ByteQueue
|
22
|
+
def initialize(str)
|
23
|
+
@str = str
|
24
|
+
@index = 0
|
25
|
+
end
|
26
|
+
|
27
|
+
attr_accessor :index
|
28
|
+
|
29
|
+
def next(n)
|
30
|
+
result = @str[@index, n]
|
31
|
+
@index += n
|
32
|
+
result
|
33
|
+
end
|
34
|
+
end # class ByteQueue
|
35
|
+
|
36
|
+
class ReferenceSequence
|
37
|
+
BINCODE = {0b00 => "T", 0b01 => "C", 0b10 => "A", 0b11 => "G"}
|
38
|
+
|
39
|
+
cattr_reader :filename, :header, :offsets
|
40
|
+
|
41
|
+
def self.load(filename)
|
42
|
+
two_bit = nil
|
43
|
+
open(filename, 'rb') {|f| two_bit = f.read}
|
44
|
+
@@tbq = ByteQueue.new(two_bit)
|
45
|
+
@@filename = filename
|
46
|
+
|
47
|
+
twobit_header = TwoBitHeader.new
|
48
|
+
twobit_header.signature = @@tbq.next(4).unpack('L').first
|
49
|
+
twobit_header.version = @@tbq.next(4).unpack('L').first
|
50
|
+
twobit_header.sequence_count = @@tbq.next(4).unpack('L').first
|
51
|
+
twobit_header.reserved = @@tbq.next(4).unpack('L').first
|
52
|
+
@@header = twobit_header
|
53
|
+
|
54
|
+
@@offsets = Hash.new
|
55
|
+
@@header.sequence_count.times do
|
56
|
+
name_length = @@tbq.next(1).unpack('C').first
|
57
|
+
@@offsets[@@tbq.next(name_length).unpack('a*').first] =
|
58
|
+
@@tbq.next(4).unpack('L').first
|
59
|
+
end
|
60
|
+
@@records = Hash.new
|
61
|
+
end
|
62
|
+
|
63
|
+
def self.records(chrom)
|
64
|
+
return @@records[chrom] if @@records[chrom]
|
65
|
+
|
66
|
+
@@tbq.index = @@offsets[chrom]
|
67
|
+
@@records[chrom] = TwoBitRecord.new
|
68
|
+
@@records[chrom].dna_size = @@tbq.next(4).unpack('L').first
|
69
|
+
|
70
|
+
n_block_count = @@tbq.next(4).unpack('L').first
|
71
|
+
n_block_starts = Array.new
|
72
|
+
n_block_count.times do
|
73
|
+
n_block_starts << @@tbq.next(4).unpack('L').first
|
74
|
+
end
|
75
|
+
n_block_sizes = Array.new
|
76
|
+
n_block_count.times do
|
77
|
+
n_block_sizes << @@tbq.next(4).unpack('L').first
|
78
|
+
end
|
79
|
+
@@records[chrom].n_block_intervals = Array.new
|
80
|
+
n_block_count.times do |idx|
|
81
|
+
@@records[chrom].n_block_intervals <<
|
82
|
+
Bio::GenomicInterval.zero_based(chrom,
|
83
|
+
n_block_starts[idx],
|
84
|
+
n_block_starts[idx]+n_block_sizes[idx])
|
85
|
+
end
|
86
|
+
|
87
|
+
mask_block_count = @@tbq.next(4).unpack('L').first
|
88
|
+
mask_block_starts = Array.new
|
89
|
+
mask_block_count.times do
|
90
|
+
mask_block_starts << @@tbq.next(4).unpack('L').first
|
91
|
+
end
|
92
|
+
mask_block_sizes = Array.new
|
93
|
+
mask_block_count.times do
|
94
|
+
mask_block_sizes << @@tbq.next(4).unpack('L').first
|
95
|
+
end
|
96
|
+
@@records[chrom].mask_block_intervals = Array.new
|
97
|
+
mask_block_count.times do |idx|
|
98
|
+
@@records[chrom].mask_block_intervals <<
|
99
|
+
Bio::GenomicInterval.zero_based(chrom,
|
100
|
+
mask_block_starts[idx],
|
101
|
+
mask_block_starts[idx]+mask_block_sizes[idx])
|
102
|
+
end
|
103
|
+
|
104
|
+
@@records[chrom].reserved = @@tbq.next(4).unpack('L').first
|
105
|
+
@@records[chrom].packed_dna_offset = @@tbq.index
|
106
|
+
|
107
|
+
@@records[chrom]
|
108
|
+
end
|
109
|
+
|
110
|
+
def self.find_by_interval(interval)
|
111
|
+
seq = self.find_by_interval_raw(interval)
|
112
|
+
@@records[interval.chrom].n_block_intervals.map do |nb|
|
113
|
+
if interval.overlapped?(nb)
|
114
|
+
case interval.compare(nb)
|
115
|
+
when :equal,:contained_by
|
116
|
+
seq = 'N' * interval.overlap(nb)
|
117
|
+
when :contains
|
118
|
+
left_len = nb.chr_start - interval.chr_start + 1
|
119
|
+
right_len = interval.chr_end - nb.chr_end + 1
|
120
|
+
seq[0, left_len] = 'N' * left_len
|
121
|
+
seq[-right_len, right_len] = 'N' * right_len
|
122
|
+
when :left_overlapped
|
123
|
+
left_len = nb.chr_end - interval.chr_start + 1
|
124
|
+
seq[0, left_len] = 'N' * left_len
|
125
|
+
when :right_overlapped
|
126
|
+
right_len = interval.chr_end - nb.chr_start + 1
|
127
|
+
seq[-right_len, right_len] = 'N' * right_len
|
128
|
+
when :right_adjacent, :right_off
|
129
|
+
# expecting that N-blocks are sorted
|
130
|
+
# return Bio::Sequence::NA.new(seq)
|
131
|
+
seq
|
132
|
+
end
|
133
|
+
end
|
134
|
+
end
|
135
|
+
#Bio::Sequence::NA.new(seq)
|
136
|
+
seq
|
137
|
+
end
|
138
|
+
|
139
|
+
def self.find_by_interval_raw(interval)
|
140
|
+
byte_count, byte_mod = interval.zero_start.divmod 4
|
141
|
+
chrom_top = self.records(interval.chrom).packed_dna_offset
|
142
|
+
div_start, mod_start = interval.zero_start.divmod 4
|
143
|
+
div_end, mod_end = interval.zero_end.divmod 4
|
144
|
+
div_len, mod_len = interval.length.divmod 4
|
145
|
+
|
146
|
+
byte_length = div_end - div_start + 1
|
147
|
+
@@tbq.index = chrom_top + div_start
|
148
|
+
bytes = @@tbq.next(byte_length).unpack('C*')
|
149
|
+
seq = Bio::Ucsc::Hg19::ReferenceSequence.bytes_to_nucleotides(bytes)
|
150
|
+
seq[mod_start..(-1-(4-mod_end))]
|
151
|
+
end
|
152
|
+
|
153
|
+
def self.bytes_to_nucleotides(bytes)
|
154
|
+
results = ""
|
155
|
+
bytes.each do |byte|
|
156
|
+
results << Bio::Ucsc::Hg19::ReferenceSequence.byte_to_nucleotides(byte)
|
157
|
+
end
|
158
|
+
results
|
159
|
+
end
|
160
|
+
|
161
|
+
def self.byte_to_nucleotides(byte)
|
162
|
+
BINCODE[byte >> 6] +
|
163
|
+
BINCODE[(byte >> 4) & 0b11] +
|
164
|
+
BINCODE[(byte >> 2) & 0b11] +
|
165
|
+
BINCODE[byte & 0b11]
|
166
|
+
end
|
167
|
+
end # class ReferenceSequence
|
168
|
+
|
169
|
+
end # module Hg19
|
170
|
+
end # module Ucsc
|
171
|
+
end # module Bio
|