bio-ucsc-api 0.0.5 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. data/README.rdoc +36 -16
  2. data/Rakefile +3 -3
  3. data/VERSION +1 -1
  4. data/bio-ucsc-api.gemspec +9 -7
  5. data/lib/bio-ucsc/hg18.rb +2 -4
  6. data/lib/bio-ucsc/hg18/activerecord.rb +1 -1
  7. data/lib/bio-ucsc/hg18/all_bacends.rb +0 -1
  8. data/lib/bio-ucsc/hg18/cnpiafrate2.rb +0 -1
  9. data/lib/bio-ucsc/hg18/cnplocke.rb +0 -1
  10. data/lib/bio-ucsc/hg18/cnpredon.rb +0 -1
  11. data/lib/bio-ucsc/hg18/cnpsebat2.rb +0 -1
  12. data/lib/bio-ucsc/hg18/cnpsharp2.rb +0 -1
  13. data/lib/bio-ucsc/hg18/cnptuzun.rb +0 -1
  14. data/lib/bio-ucsc/hg18/cytoband.rb +0 -1
  15. data/lib/bio-ucsc/hg18/db_connection.rb +1 -1
  16. data/lib/bio-ucsc/hg18/delconrad2.rb +0 -1
  17. data/lib/bio-ucsc/hg18/delhinds2.rb +0 -1
  18. data/lib/bio-ucsc/hg18/delmccarroll.rb +0 -1
  19. data/lib/bio-ucsc/hg18/dgv.rb +0 -1
  20. data/lib/bio-ucsc/hg18/ensgene.rb +0 -1
  21. data/lib/bio-ucsc/hg18/exaptedrepeats.rb +0 -1
  22. data/lib/bio-ucsc/hg18/hgcentral_wikitrack.rb +0 -1
  23. data/lib/bio-ucsc/hg18/kgprotmap2.rb +0 -1
  24. data/lib/bio-ucsc/hg18/kgtargetali.rb +0 -1
  25. data/lib/bio-ucsc/hg18/kiddeichlerdiscabc10.rb +0 -1
  26. data/lib/bio-ucsc/hg18/kiddeichlerdiscabc11.rb +0 -1
  27. data/lib/bio-ucsc/hg18/kiddeichlerdiscabc12.rb +0 -1
  28. data/lib/bio-ucsc/hg18/kiddeichlerdiscabc13.rb +0 -1
  29. data/lib/bio-ucsc/hg18/kiddeichlerdiscabc14.rb +0 -1
  30. data/lib/bio-ucsc/hg18/kiddeichlerdiscabc8.rb +0 -1
  31. data/lib/bio-ucsc/hg18/kiddeichlerdiscabc9.rb +0 -1
  32. data/lib/bio-ucsc/hg18/kiddeichlerdiscg248.rb +0 -1
  33. data/lib/bio-ucsc/hg18/reference.rb +4 -163
  34. data/lib/bio-ucsc/hg18/reference_sequence.rb +171 -0
  35. data/lib/bio-ucsc/hg18/refgene.rb +0 -1
  36. data/lib/bio-ucsc/hg18/snp130.rb +0 -1
  37. data/lib/bio-ucsc/hg19.rb +2 -1
  38. data/lib/bio-ucsc/hg19/activerecord.rb +1 -1
  39. data/lib/bio-ucsc/hg19/cytoband.rb +0 -1
  40. data/lib/bio-ucsc/hg19/cytobandideo.rb +0 -1
  41. data/lib/bio-ucsc/hg19/db_connection.rb +1 -1
  42. data/lib/bio-ucsc/hg19/dgv.rb +0 -1
  43. data/lib/bio-ucsc/hg19/ensgene.rb +0 -1
  44. data/lib/bio-ucsc/hg19/reference.rb +4 -163
  45. data/lib/bio-ucsc/hg19/reference_sequence.rb +171 -0
  46. data/lib/bio-ucsc/hg19/refgene.rb +0 -1
  47. data/lib/bio-ucsc/hg19/snp131.rb +0 -1
  48. data/lib/bio-ucsc/hg19/wgrna.rb +0 -1
  49. data/samples/hg19-2bit-retrieve.rb +2 -2
  50. data/spec/hg18/{reference_spec.rb → reference_sequence_spec.rb} +26 -26
  51. data/spec/hg19/{reference_spec.rb → reference_sequence_spec.rb} +26 -26
  52. metadata +11 -10
@@ -0,0 +1,171 @@
1
+ #
2
+ # = reference_sequence.rb
3
+ # handle UCSC's 2bit file (locally stored) to retrieve the reference sequence
4
+ #
5
+ # Copyright:: Copyright (C) 2011
6
+ # MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
7
+ # License:: Ruby license (Ruby's / GPLv2 dual)
8
+
9
+ # require 'bio'
10
+
11
+ module Bio
12
+ module Ucsc
13
+ module Hg18
14
# File-level header of a 2bit file. ReferenceSequence.load fills the four
# members, in this order, from the first four 32-bit integers of the file.
TwoBitHeader = Struct.new(:signature, :version, :sequence_count, :reserved)

# Per-sequence record built by ReferenceSequence.records:
# dna_size::             number of bases in the sequence
# n_block_intervals::    Bio::GenomicInterval list built from the N-block table
# mask_block_intervals:: Bio::GenomicInterval list built from the mask-block table
# reserved::             32-bit field read after the mask-block table
# packed_dna_offset::    byte position in the file where the packed DNA starts
TwoBitRecord = Struct.new(
  :dna_size, :n_block_intervals, :mask_block_intervals, :reserved, :packed_dna_offset
)
20
+
21
# Sequential reader over an in-memory byte string with a movable cursor.
class ByteQueue
  # Current read position (byte offset into the wrapped string). It may be
  # assigned directly to seek.
  attr_accessor :index

  # str:: the string to read from; the cursor starts at offset 0.
  def initialize(str)
    @str = str
    @index = 0
  end

  # Returns the slice of +n+ bytes that starts at the cursor and moves the
  # cursor forward by +n+. The cursor always advances by +n+, even when the
  # slice comes back shorter (or nil) because the string has run out.
  def next(n)
    start = @index
    chunk = @str[start, n]
    @index = start + n
    chunk
  end
end # class ByteQueue
35
+
36
# Retrieves reference sequences from a locally stored UCSC 2bit file.
#
# All state is class-level: call ReferenceSequence.load once, then query
# with ReferenceSequence.find_by_interval.
class ReferenceSequence
  # Nucleotide letter for each 2-bit code of the packed DNA.
  BINCODE = {0b00 => "T", 0b01 => "C", 0b10 => "A", 0b11 => "G"}

  # Magic number expected in the first four bytes of a 2bit file.
  SIGNATURE = 0x1A412743

  @@filename = nil
  @@header = nil
  @@offsets = nil

  # Explicit readers replacing the former +cattr_reader+ call, which needs
  # ActiveSupport even though this file never requires it. Both the
  # class-level and the instance-level readers are provided.

  # Path passed to the last successful ReferenceSequence.load (nil before).
  def self.filename; @@filename; end

  # TwoBitHeader of the loaded file (nil before ReferenceSequence.load).
  def self.header; @@header; end

  # Hash of sequence name => byte offset of its record (nil before load).
  def self.offsets; @@offsets; end

  def filename; @@filename; end
  def header; @@header; end
  def offsets; @@offsets; end

  # Reads the whole 2bit file into memory and parses its header and
  # sequence index. Per-sequence records are parsed lazily by
  # ReferenceSequence.records.
  #
  # filename:: path of the 2bit file
  #
  # Raises ArgumentError when the file does not start with the 2bit
  # signature in either byte order; previously loaded state is then left
  # untouched. Returns the (empty) record cache.
  def self.load(filename)
    # File.open instead of Kernel#open: a name starting with "|" must never
    # be run as a command.
    two_bit = File.open(filename, 'rb') { |f| f.read }
    queue = ByteQueue.new(two_bit)
    directive = uint32_directive_for(queue.next(4).to_s)

    @@tbq = queue
    @@uint32 = directive
    @@filename = filename

    version = read_uint32
    sequence_count = read_uint32
    reserved = read_uint32
    @@header = TwoBitHeader.new(SIGNATURE, version, sequence_count, reserved)

    # Index entries: 1-byte name length, the name, 32-bit record offset.
    @@offsets = {}
    sequence_count.times do
      name_length = @@tbq.next(1).unpack('C').first
      name = @@tbq.next(name_length)
      @@offsets[name] = read_uint32
    end
    @@records = {}
  end

  # Returns the TwoBitRecord of sequence +chrom+, parsing it on first use
  # and caching it afterwards.
  #
  # Raises ArgumentError when +chrom+ is not in the file's index.
  def self.records(chrom)
    return @@records[chrom] if @@records[chrom]

    offset = @@offsets[chrom]
    if offset.nil?
      raise ArgumentError, "sequence #{chrom.inspect} is not in #{@@filename}"
    end

    @@tbq.index = offset
    record = TwoBitRecord.new
    record.dna_size = read_uint32
    record.n_block_intervals = read_block_intervals(chrom)
    record.mask_block_intervals = read_block_intervals(chrom)
    record.reserved = read_uint32
    record.packed_dna_offset = @@tbq.index

    # Cache only once the record is complete, so a failed parse never
    # leaves a half-built record behind.
    @@records[chrom] = record
  end

  # Returns the bases covered by +interval+ as a String, with every base
  # that falls inside an N-block replaced by "N".
  #
  # interval:: a Bio::GenomicInterval-like object responding to +chrom+,
  #            +zero_start+ and +zero_end+ (zero-based, end-exclusive).
  def self.find_by_interval(interval)
    seq = find_by_interval_raw(interval)
    origin = interval.zero_start

    records(interval.chrom).n_block_intervals.each do |nb|
      # Overlap of the N-block with the request, relative to the first
      # requested base. No assumption is made about block order.
      from = [nb.zero_start, origin].max - origin
      upto = [[nb.zero_end, interval.zero_end].min - origin, seq.length].min
      next if upto <= from

      seq[from, upto - from] = 'N' * (upto - from)
    end
    seq
  end

  # Returns the bases covered by +interval+ exactly as stored in the packed
  # DNA, i.e. without N-block substitution.
  def self.find_by_interval_raw(interval)
    packed_top = records(interval.chrom).packed_dna_offset
    first_byte, skip = interval.zero_start.divmod(4)
    base_count = interval.zero_end - interval.zero_start
    # Byte holding the last requested base. Deriving it from zero_end - 1
    # avoids reading one byte too many when zero_end is a multiple of 4.
    last_byte = (interval.zero_end - 1) / 4

    @@tbq.index = packed_top + first_byte
    bytes = @@tbq.next(last_byte - first_byte + 1).to_s.unpack('C*')
    bytes_to_nucleotides(bytes)[skip, base_count] || ""
  end

  # Decodes an Array of byte values (0..255) into a nucleotide String,
  # four bases per byte.
  def self.bytes_to_nucleotides(bytes)
    bytes.map { |byte| byte_to_nucleotides(byte) }.join
  end

  # Decodes one byte value (0..255) into its four bases, most significant
  # bit pair first.
  def self.byte_to_nucleotides(byte)
    BINCODE[byte >> 6] +
      BINCODE[(byte >> 4) & 0b11] +
      BINCODE[(byte >> 2) & 0b11] +
      BINCODE[byte & 0b11]
  end

  # Picks the String#unpack directive ("V" little-endian, "N" big-endian)
  # under which +raw_signature+ decodes to SIGNATURE.
  def self.uint32_directive_for(raw_signature)
    %w[V N].each do |directive|
      return directive if raw_signature.unpack(directive).first == SIGNATURE
    end
    raise ArgumentError, "not a 2bit file: signature mismatch"
  end

  # Reads the next 32-bit unsigned integer in the file's byte order.
  def self.read_uint32
    @@tbq.next(4).unpack(@@uint32).first
  end

  # Reads one block table (count, then all starts, then all sizes) and
  # returns it as an Array of zero-based Bio::GenomicInterval objects.
  def self.read_block_intervals(chrom)
    count = read_uint32
    starts = Array.new(count) { read_uint32 }
    sizes = Array.new(count) { read_uint32 }
    starts.zip(sizes).map do |start, size|
      Bio::GenomicInterval.zero_based(chrom, start, start + size)
    end
  end

  private_class_method :uint32_directive_for, :read_uint32,
                       :read_block_intervals
end # class ReferenceSequence
168
+
169
+ end # module Hg18
170
+ end # module Ucsc
171
+ end # module Bio
@@ -2,7 +2,6 @@
2
2
  # = hg18/refgene.rb
3
3
  # Copyright::
4
4
  # Copyright (C) 2011 MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
5
- # Copyright (C) 2008 Jan Aerts <jan.aerts@gmail.com>
6
5
  # License:: The Ruby licence (Ryby's / GPLv2 dual)
7
6
  #
8
7
  # = Table desfription in UCSC Table Browser
@@ -2,7 +2,6 @@
2
2
  # = hg18/snp130.rb
3
3
  # Copyright::
4
4
  # Copyright (C) 2011 MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
5
- # Copyright (C) 2008 Jan Aerts <jan.aerts@gmail.com>
6
5
  # License:: The Ruby licence (Ryby's / GPLv2 dual)
7
6
  #
8
7
  # = Table desfription in UCSC Table Browser
@@ -16,7 +16,8 @@ module Bio
16
16
 
17
17
  # Reference sequence retrieval via the 2bit file
18
18
  #
19
- autoload :Reference, "#{base}/reference"
19
+ autoload :Reference, "#{base}/reference" # OBSOLETE
20
+ autoload :ReferenceSequence, "#{base}/reference_sequence"
20
21
 
21
22
  # group: Mapping and Sequencing Tracks ----------
22
23
  #
@@ -3,7 +3,7 @@
3
3
  #
4
4
  # Copyright::
5
5
  # Copyright (C) 2011 MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
6
- # Copyright (C) 2008 Jan Aerts <jan.aerts@gmail.com>
6
+ # Copyright (C) 2010 Jan Aerts <jan.aerts@gmail.com>
7
7
  # License:: The Ruby licence (Ryby's / GPLv2 dual)
8
8
  #
9
9
  # = DESCRIPTION
@@ -2,7 +2,6 @@
2
2
  # = hg19/cytoband.rb
3
3
  # Copyright::
4
4
  # Copyright (C) 2011 MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
5
- # Copyright (C) 2008 Jan Aerts <jan.aerts@gmail.com>
6
5
  # License:: The Ruby licence (Ryby's / GPLv2 dual)
7
6
  #
8
7
  # = Table desfription in UCSC Table Browser
@@ -2,7 +2,6 @@
2
2
  # = hg19/cytobandideo.rb
3
3
  # Copyright::
4
4
  # Copyright (C) 2011 MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
5
- # Copyright (C) 2008 Jan Aerts <jan.aerts@gmail.com>
6
5
  # License:: The Ruby licence (Ryby's / GPLv2 dual)
7
6
  #
8
7
  # = Table desfription in UCSC Table Browser
@@ -3,7 +3,7 @@
3
3
  #
4
4
  # Copyright:: Cioyrught (C) 2011
5
5
  # MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
6
- # Copyright:: Copyright (C) 2008
6
+ # Copyright:: Copyright (C) 2010
7
7
  # Jan Aerts <jan.aerts@gmail.com>
8
8
  # License:: Ruby licence (Ryby's / GPLv2 dual)
9
9
 
@@ -2,7 +2,6 @@
2
2
  # = hg19/dgv.rb
3
3
  # Copyright::
4
4
  # Copyright (C) 2011 MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
5
- # Copyright (C) 2008 Jan Aerts <jan.aerts@gmail.com>
6
5
  # License:: The Ruby licence (Ryby's / GPLv2 dual)
7
6
  #
8
7
  # = Table desfription in UCSC Table Browser
@@ -2,7 +2,6 @@
2
2
  # = hg19/ensgene.rb
3
3
  # Copyright::
4
4
  # Copyright (C) 2011 MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
5
- # Copyright (C) 2008 Jan Aerts <jan.aerts@gmail.com>
6
5
  # License:: The Ruby licence (Ryby's / GPLv2 dual)
7
6
  #
8
7
  # = Table desfription in UCSC Table Browser
@@ -2,170 +2,11 @@
2
2
  # = reference.rb
3
3
  # handle UCSC's 2bit file (locally stored) to retrieve the reference sequence
4
4
  #
5
+ # In version 0.1.0, this file is OBSOLETE.
6
+ # Use Bio::Ucsc::Hg19::ReferenceSequence instead.
7
+ #
5
8
  # Copyright:: Cioyrught (C) 2011
6
9
  # MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
7
10
  # License:: Ruby license (Ryby's / GPLv2 dual)
8
11
 
9
- # require 'bio'
10
-
11
- module Bio
12
- module Ucsc
13
- module Hg19
14
- TwoBitHeader =
15
- Struct.new(:signature, :version, :sequence_count, :reserved)
16
- TwoBitRecord =
17
- Struct.new(:dna_size,
18
- :n_block_intervals, :mask_block_intervals,
19
- :reserved, :packed_dna_offset)
20
-
21
- class ByteQueue
22
- def initialize(str)
23
- @str = str
24
- @index = 0
25
- end
26
-
27
- attr_accessor :index
28
-
29
- def next(n)
30
- result = @str[@index, n]
31
- @index += n
32
- result
33
- end
34
- end # class ByteQueue
35
-
36
- class Reference
37
- BINCODE = {0b00 => "T", 0b01 => "C", 0b10 => "A", 0b11 => "G"}
38
-
39
- cattr_reader :filename, :header, :offsets
40
-
41
- def self.load(filename)
42
- two_bit = nil
43
- open(filename, 'rb') {|f| two_bit = f.read}
44
- @@tbq = ByteQueue.new(two_bit)
45
- @@filename = filename
46
-
47
- twobit_header = TwoBitHeader.new
48
- twobit_header.signature = @@tbq.next(4).unpack('L').first
49
- twobit_header.version = @@tbq.next(4).unpack('L').first
50
- twobit_header.sequence_count = @@tbq.next(4).unpack('L').first
51
- twobit_header.reserved = @@tbq.next(4).unpack('L').first
52
- @@header = twobit_header
53
-
54
- @@offsets = Hash.new
55
- @@header.sequence_count.times do
56
- name_length = @@tbq.next(1).unpack('C').first
57
- @@offsets[@@tbq.next(name_length).unpack('a*').first] =
58
- @@tbq.next(4).unpack('L').first
59
- end
60
- @@records = Hash.new
61
- end
62
-
63
- def self.records(chrom)
64
- return @@records[chrom] if @@records[chrom]
65
-
66
- @@tbq.index = @@offsets[chrom]
67
- @@records[chrom] = TwoBitRecord.new
68
- @@records[chrom].dna_size = @@tbq.next(4).unpack('L').first
69
-
70
- n_block_count = @@tbq.next(4).unpack('L').first
71
- n_block_starts = Array.new
72
- n_block_count.times do
73
- n_block_starts << @@tbq.next(4).unpack('L').first
74
- end
75
- n_block_sizes = Array.new
76
- n_block_count.times do
77
- n_block_sizes << @@tbq.next(4).unpack('L').first
78
- end
79
- @@records[chrom].n_block_intervals = Array.new
80
- n_block_count.times do |idx|
81
- @@records[chrom].n_block_intervals <<
82
- Bio::GenomicInterval.zero_based(chrom,
83
- n_block_starts[idx],
84
- n_block_starts[idx]+n_block_sizes[idx])
85
- end
86
-
87
- mask_block_count = @@tbq.next(4).unpack('L').first
88
- mask_block_starts = Array.new
89
- mask_block_count.times do
90
- mask_block_starts << @@tbq.next(4).unpack('L').first
91
- end
92
- mask_block_sizes = Array.new
93
- mask_block_count.times do
94
- mask_block_sizes << @@tbq.next(4).unpack('L').first
95
- end
96
- @@records[chrom].mask_block_intervals = Array.new
97
- mask_block_count.times do |idx|
98
- @@records[chrom].mask_block_intervals <<
99
- Bio::GenomicInterval.zero_based(chrom,
100
- mask_block_starts[idx],
101
- mask_block_starts[idx]+mask_block_sizes[idx])
102
- end
103
-
104
- @@records[chrom].reserved = @@tbq.next(4).unpack('L').first
105
- @@records[chrom].packed_dna_offset = @@tbq.index
106
-
107
- @@records[chrom]
108
- end
109
-
110
- def self.find_by_interval(interval)
111
- seq = self.find_by_interval_raw(interval)
112
- @@records[interval.chrom].n_block_intervals.map do |nb|
113
- if interval.overlapped?(nb)
114
- case interval.compare(nb)
115
- when :equal,:contained_by
116
- seq = 'N' * interval.overlap(nb)
117
- when :contains
118
- left_len = nb.chr_start - interval.chr_start + 1
119
- right_len = interval.chr_end - nb.chr_end + 1
120
- seq[0, left_len] = 'N' * left_len
121
- seq[-right_len, right_len] = 'N' * right_len
122
- when :left_overlapped
123
- left_len = nb.chr_end - interval.chr_start + 1
124
- seq[0, left_len] = 'N' * left_len
125
- when :right_overlapped
126
- right_len = interval.chr_end - nb.chr_start + 1
127
- seq[-right_len, right_len] = 'N' * right_len
128
- when :right_adjacent, :right_off
129
- # expecting that N-blocks are sorted
130
- # return Bio::Sequence::NA.new(seq)
131
- seq
132
- end
133
- end
134
- end
135
- #Bio::Sequence::NA.new(seq)
136
- seq
137
- end
138
-
139
- def self.find_by_interval_raw(interval)
140
- byte_count, byte_mod = interval.zero_start.divmod 4
141
- chrom_top = self.records(interval.chrom).packed_dna_offset
142
- div_start, mod_start = interval.zero_start.divmod 4
143
- div_end, mod_end = interval.zero_end.divmod 4
144
- div_len, mod_len = interval.length.divmod 4
145
-
146
- byte_length = div_end - div_start + 1
147
- @@tbq.index = chrom_top + div_start
148
- bytes = @@tbq.next(byte_length).unpack('C*')
149
- seq = Bio::Ucsc::Hg19::Reference.bytes_to_nucleotides(bytes)
150
- seq[mod_start..(-1-(4-mod_end))]
151
- end
152
-
153
- def self.bytes_to_nucleotides(bytes)
154
- results = ""
155
- bytes.each do |byte|
156
- results << Bio::Ucsc::Hg19::Reference.byte_to_nucleotides(byte)
157
- end
158
- results
159
- end
160
-
161
- def self.byte_to_nucleotides(byte)
162
- BINCODE[byte >> 6] +
163
- BINCODE[(byte >> 4) & 0b11] +
164
- BINCODE[(byte >> 2) & 0b11] +
165
- BINCODE[byte & 0b11]
166
- end
167
- end # class Reference
168
-
169
- end # module Hg19
170
- end # module Ucsc
171
- end # module Bio
12
+ # The file now contains only this statement: loading it raises a
+ # RuntimeError whose message names the replacement class.
+ raise "Bio::Ucsc::Hg19::Reference is OBSOLETE. Use Bio::Ucsc::Hg19::ReferenceSequence instead."
@@ -0,0 +1,171 @@
1
+ #
2
+ # = reference_sequence.rb
3
+ # handle UCSC's 2bit file (locally stored) to retrieve the reference sequence
4
+ #
5
+ # Copyright:: Copyright (C) 2011
6
+ # MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
7
+ # License:: Ruby license (Ruby's / GPLv2 dual)
8
+
9
+ # require 'bio'
10
+
11
+ module Bio
12
+ module Ucsc
13
+ module Hg19
14
# File-level header of a 2bit file. ReferenceSequence.load fills the four
# members, in this order, from the first four 32-bit integers of the file.
TwoBitHeader = Struct.new(:signature, :version, :sequence_count, :reserved)

# Per-sequence record built by ReferenceSequence.records:
# dna_size::             number of bases in the sequence
# n_block_intervals::    Bio::GenomicInterval list built from the N-block table
# mask_block_intervals:: Bio::GenomicInterval list built from the mask-block table
# reserved::             32-bit field read after the mask-block table
# packed_dna_offset::    byte position in the file where the packed DNA starts
TwoBitRecord = Struct.new(
  :dna_size, :n_block_intervals, :mask_block_intervals, :reserved, :packed_dna_offset
)
20
+
21
# Sequential reader over an in-memory byte string with a movable cursor.
class ByteQueue
  # Current read position (byte offset into the wrapped string). It may be
  # assigned directly to seek.
  attr_accessor :index

  # str:: the string to read from; the cursor starts at offset 0.
  def initialize(str)
    @str = str
    @index = 0
  end

  # Returns the slice of +n+ bytes that starts at the cursor and moves the
  # cursor forward by +n+. The cursor always advances by +n+, even when the
  # slice comes back shorter (or nil) because the string has run out.
  def next(n)
    start = @index
    chunk = @str[start, n]
    @index = start + n
    chunk
  end
end # class ByteQueue
35
+
36
# Retrieves reference sequences from a locally stored UCSC 2bit file.
#
# All state is class-level: call ReferenceSequence.load once, then query
# with ReferenceSequence.find_by_interval.
class ReferenceSequence
  # Nucleotide letter for each 2-bit code of the packed DNA.
  BINCODE = {0b00 => "T", 0b01 => "C", 0b10 => "A", 0b11 => "G"}

  # Magic number expected in the first four bytes of a 2bit file.
  SIGNATURE = 0x1A412743

  @@filename = nil
  @@header = nil
  @@offsets = nil

  # Explicit readers replacing the former +cattr_reader+ call, which needs
  # ActiveSupport even though this file never requires it. Both the
  # class-level and the instance-level readers are provided.

  # Path passed to the last successful ReferenceSequence.load (nil before).
  def self.filename; @@filename; end

  # TwoBitHeader of the loaded file (nil before ReferenceSequence.load).
  def self.header; @@header; end

  # Hash of sequence name => byte offset of its record (nil before load).
  def self.offsets; @@offsets; end

  def filename; @@filename; end
  def header; @@header; end
  def offsets; @@offsets; end

  # Reads the whole 2bit file into memory and parses its header and
  # sequence index. Per-sequence records are parsed lazily by
  # ReferenceSequence.records.
  #
  # filename:: path of the 2bit file
  #
  # Raises ArgumentError when the file does not start with the 2bit
  # signature in either byte order; previously loaded state is then left
  # untouched. Returns the (empty) record cache.
  def self.load(filename)
    # File.open instead of Kernel#open: a name starting with "|" must never
    # be run as a command.
    two_bit = File.open(filename, 'rb') { |f| f.read }
    queue = ByteQueue.new(two_bit)
    directive = uint32_directive_for(queue.next(4).to_s)

    @@tbq = queue
    @@uint32 = directive
    @@filename = filename

    version = read_uint32
    sequence_count = read_uint32
    reserved = read_uint32
    @@header = TwoBitHeader.new(SIGNATURE, version, sequence_count, reserved)

    # Index entries: 1-byte name length, the name, 32-bit record offset.
    @@offsets = {}
    sequence_count.times do
      name_length = @@tbq.next(1).unpack('C').first
      name = @@tbq.next(name_length)
      @@offsets[name] = read_uint32
    end
    @@records = {}
  end

  # Returns the TwoBitRecord of sequence +chrom+, parsing it on first use
  # and caching it afterwards.
  #
  # Raises ArgumentError when +chrom+ is not in the file's index.
  def self.records(chrom)
    return @@records[chrom] if @@records[chrom]

    offset = @@offsets[chrom]
    if offset.nil?
      raise ArgumentError, "sequence #{chrom.inspect} is not in #{@@filename}"
    end

    @@tbq.index = offset
    record = TwoBitRecord.new
    record.dna_size = read_uint32
    record.n_block_intervals = read_block_intervals(chrom)
    record.mask_block_intervals = read_block_intervals(chrom)
    record.reserved = read_uint32
    record.packed_dna_offset = @@tbq.index

    # Cache only once the record is complete, so a failed parse never
    # leaves a half-built record behind.
    @@records[chrom] = record
  end

  # Returns the bases covered by +interval+ as a String, with every base
  # that falls inside an N-block replaced by "N".
  #
  # interval:: a Bio::GenomicInterval-like object responding to +chrom+,
  #            +zero_start+ and +zero_end+ (zero-based, end-exclusive).
  def self.find_by_interval(interval)
    seq = find_by_interval_raw(interval)
    origin = interval.zero_start

    records(interval.chrom).n_block_intervals.each do |nb|
      # Overlap of the N-block with the request, relative to the first
      # requested base. No assumption is made about block order.
      from = [nb.zero_start, origin].max - origin
      upto = [[nb.zero_end, interval.zero_end].min - origin, seq.length].min
      next if upto <= from

      seq[from, upto - from] = 'N' * (upto - from)
    end
    seq
  end

  # Returns the bases covered by +interval+ exactly as stored in the packed
  # DNA, i.e. without N-block substitution.
  def self.find_by_interval_raw(interval)
    packed_top = records(interval.chrom).packed_dna_offset
    first_byte, skip = interval.zero_start.divmod(4)
    base_count = interval.zero_end - interval.zero_start
    # Byte holding the last requested base. Deriving it from zero_end - 1
    # avoids reading one byte too many when zero_end is a multiple of 4.
    last_byte = (interval.zero_end - 1) / 4

    @@tbq.index = packed_top + first_byte
    bytes = @@tbq.next(last_byte - first_byte + 1).to_s.unpack('C*')
    bytes_to_nucleotides(bytes)[skip, base_count] || ""
  end

  # Decodes an Array of byte values (0..255) into a nucleotide String,
  # four bases per byte.
  def self.bytes_to_nucleotides(bytes)
    bytes.map { |byte| byte_to_nucleotides(byte) }.join
  end

  # Decodes one byte value (0..255) into its four bases, most significant
  # bit pair first.
  def self.byte_to_nucleotides(byte)
    BINCODE[byte >> 6] +
      BINCODE[(byte >> 4) & 0b11] +
      BINCODE[(byte >> 2) & 0b11] +
      BINCODE[byte & 0b11]
  end

  # Picks the String#unpack directive ("V" little-endian, "N" big-endian)
  # under which +raw_signature+ decodes to SIGNATURE.
  def self.uint32_directive_for(raw_signature)
    %w[V N].each do |directive|
      return directive if raw_signature.unpack(directive).first == SIGNATURE
    end
    raise ArgumentError, "not a 2bit file: signature mismatch"
  end

  # Reads the next 32-bit unsigned integer in the file's byte order.
  def self.read_uint32
    @@tbq.next(4).unpack(@@uint32).first
  end

  # Reads one block table (count, then all starts, then all sizes) and
  # returns it as an Array of zero-based Bio::GenomicInterval objects.
  def self.read_block_intervals(chrom)
    count = read_uint32
    starts = Array.new(count) { read_uint32 }
    sizes = Array.new(count) { read_uint32 }
    starts.zip(sizes).map do |start, size|
      Bio::GenomicInterval.zero_based(chrom, start, start + size)
    end
  end

  private_class_method :uint32_directive_for, :read_uint32,
                       :read_block_intervals
end # class ReferenceSequence
168
+
169
+ end # module Hg19
170
+ end # module Ucsc
171
+ end # module Bio