bio-ucsc-api 0.0.1 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +9 -2
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/bio-ucsc-api.gemspec +29 -7
- data/lib/bio-ucsc.rb +2 -2
- data/lib/bio-ucsc/hg18.rb +3 -0
- data/lib/bio-ucsc/hg18/reference.rb +171 -0
- data/lib/bio-ucsc/hg19.rb +46 -9
- data/lib/bio-ucsc/hg19/description.rb +23 -0
- data/lib/bio-ucsc/hg19/gbcdnainfo.rb +24 -0
- data/lib/bio-ucsc/hg19/kgxref.rb +19 -0
- data/lib/bio-ucsc/hg19/reference.rb +171 -0
- data/lib/bio-ucsc/hg19/refseqsummary.rb +23 -0
- data/lib/bio-ucsc/hg19/trnas.rb +20 -0
- data/samples/hg19-2bit-retrieve.rb +39 -0
- data/samples/hg19-sample.rb +22 -1
- data/samples/symbol2summary.rb +47 -0
- data/spec/hg18/reference_spec.rb +144 -0
- data/spec/hg19/description_spec.rb +25 -0
- data/spec/hg19/gbcdnainfo_spec.rb +25 -0
- data/spec/hg19/kgxref_spec.rb +14 -0
- data/spec/hg19/reference_spec.rb +137 -0
- data/spec/hg19/refseqsummary_spec.rb +15 -0
- data/spec/hg19/trnas_spec.rb +23 -0
- metadata +29 -56
data/README.rdoc
CHANGED
@@ -23,16 +23,18 @@ http://github.com/misshie/bioruby-ucsc-api/issues
|
|
23
23
|
== Features
|
24
24
|
|
25
25
|
* Designed as a BioRuby plugin
|
26
|
+
* Using ActiveRecord as an O/R mapping framework
|
26
27
|
* Using the Bin index system to improve query performance (see https://github.com/misshie/UCSCBin )
|
27
28
|
* Automatic conversion of "1-based full-closed intervals" to internal "0-based half-closed intervals" (see also bioruby-genomic-interval)
|
28
29
|
* Supporting non-official MySQL hosts (e.g. local servers)
|
30
|
+
* NEW: Supporting locally-stored '2bit' files, which can be downloaded from the UCSC site, to retrieve reference sequences. Unknown "N" nucleotide blocks are now supported; however, "mask-blocks", which are shown in lower case in UCSC's DNA function, are not supported yet.
|
29
31
|
* Using Rspec for the testing framework
|
30
32
|
* Trying to support whole hg19 and hg18 databases.
|
31
33
|
|
32
|
-
|
33
34
|
== Install
|
34
35
|
|
35
|
-
|
36
|
+
$ gem install bio-ucsc-api
|
37
|
+
(you may need to be root or use "sudo")
|
36
38
|
|
37
39
|
== How to Use
|
38
40
|
|
@@ -48,6 +50,11 @@ http://github.com/misshie/bioruby-ucsc-api/issues
|
|
48
50
|
|
49
51
|
p Bio::Ucsc::Hg19::Snp131.find_by_name("rs56289060")
|
50
52
|
|
53
|
+
# retrieve reference sequence from a locally-stored 2bit file
|
54
|
+
Bio::Ucsc::Hg19::Reference.load("hg19.2bit")
|
55
|
+
itv = Bio::GenomicInterval.parse("chr1:9,500-10,999")
|
56
|
+
p Bio::Ucsc::Hg19::Reference.find_by_interval(itv)
|
57
|
+
|
51
58
|
== Copyright
|
52
59
|
Copyright:: (c) 2011 MISHIMA, Hiroyuki (missy at be.to / hmishima at nagasaki-u.ac.jp)
|
53
60
|
|
data/Rakefile
CHANGED
@@ -28,7 +28,7 @@ Jeweler::Tasks.new do |gem|
|
|
28
28
|
|
29
29
|
gem.add_runtime_dependency 'activerecord', '>= 3.0.0'
|
30
30
|
gem.add_runtime_dependency 'activesupport', '>= 3.0.0'
|
31
|
-
gem.add_runtime_dependency 'bio-genomic-interval', '>= 0.1.
|
31
|
+
gem.add_runtime_dependency 'bio-genomic-interval', '>= 0.1.2'
|
32
32
|
end
|
33
33
|
|
34
34
|
Jeweler::RubygemsDotOrgTasks.new
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.
|
1
|
+
0.0.3
|
data/bio-ucsc-api.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{bio-ucsc-api}
|
8
|
-
s.version = "0.0.
|
8
|
+
s.version = "0.0.3"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["MISHIMA, Hiroyuki", "Francesco Strozzi", "Jan Aerts"]
|
12
|
-
s.date = %q{2011-04-
|
12
|
+
s.date = %q{2011-04-25}
|
13
13
|
s.description = %q{A Bioruby plugin: an API for UCSC Genome Browser (experimental)}
|
14
14
|
s.email = %q{missy@be.to}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -36,6 +36,7 @@ Gem::Specification.new do |s|
|
|
36
36
|
"lib/bio-ucsc/hg18/cnpsharp2.rb",
|
37
37
|
"lib/bio-ucsc/hg18/db_connection.rb",
|
38
38
|
"lib/bio-ucsc/hg18/dgv.rb",
|
39
|
+
"lib/bio-ucsc/hg18/reference.rb",
|
39
40
|
"lib/bio-ucsc/hg18/refgene.rb",
|
40
41
|
"lib/bio-ucsc/hg18/rmsk.rb",
|
41
42
|
"lib/bio-ucsc/hg18/tables.rb",
|
@@ -44,8 +45,10 @@ Gem::Specification.new do |s|
|
|
44
45
|
"lib/bio-ucsc/hg19/ccdsgene.rb",
|
45
46
|
"lib/bio-ucsc/hg19/cytoband.rb",
|
46
47
|
"lib/bio-ucsc/hg19/db_connection.rb",
|
48
|
+
"lib/bio-ucsc/hg19/description.rb",
|
47
49
|
"lib/bio-ucsc/hg19/dgv.rb",
|
48
50
|
"lib/bio-ucsc/hg19/ensgene.rb",
|
51
|
+
"lib/bio-ucsc/hg19/gbcdnainfo.rb",
|
49
52
|
"lib/bio-ucsc/hg19/gwascatalog.rb",
|
50
53
|
"lib/bio-ucsc/hg19/hapmapalleleschimp.rb",
|
51
54
|
"lib/bio-ucsc/hg19/hapmapallelesmacaque.rb",
|
@@ -60,11 +63,14 @@ Gem::Specification.new do |s|
|
|
60
63
|
"lib/bio-ucsc/hg19/hapmapsnpsmkk.rb",
|
61
64
|
"lib/bio-ucsc/hg19/hapmapsnpstsi.rb",
|
62
65
|
"lib/bio-ucsc/hg19/hapmapsnpsyri.rb",
|
66
|
+
"lib/bio-ucsc/hg19/kgxref.rb",
|
63
67
|
"lib/bio-ucsc/hg19/knowngene.rb",
|
64
68
|
"lib/bio-ucsc/hg19/omimgene.rb",
|
65
69
|
"lib/bio-ucsc/hg19/phastconselements46wayprimates.rb",
|
66
70
|
"lib/bio-ucsc/hg19/phylop46wayprimates.rb",
|
71
|
+
"lib/bio-ucsc/hg19/reference.rb",
|
67
72
|
"lib/bio-ucsc/hg19/refgene.rb",
|
73
|
+
"lib/bio-ucsc/hg19/refseqsummary.rb",
|
68
74
|
"lib/bio-ucsc/hg19/rmsk.rb",
|
69
75
|
"lib/bio-ucsc/hg19/snp131.rb",
|
70
76
|
"lib/bio-ucsc/hg19/snp132.rb",
|
@@ -72,9 +78,12 @@ Gem::Specification.new do |s|
|
|
72
78
|
"lib/bio-ucsc/hg19/snp132common.rb",
|
73
79
|
"lib/bio-ucsc/hg19/snp132flagged.rb",
|
74
80
|
"lib/bio-ucsc/hg19/snp132mult.rb",
|
81
|
+
"lib/bio-ucsc/hg19/trnas.rb",
|
75
82
|
"lib/bio-ucsc/hg19/wgrna.rb",
|
76
83
|
"lib/bio-ucsc/ucsc_bin.rb",
|
84
|
+
"samples/hg19-2bit-retrieve.rb",
|
77
85
|
"samples/hg19-sample.rb",
|
86
|
+
"samples/symbol2summary.rb",
|
78
87
|
"spec/hg18/cnpiafrate2_spec.rb",
|
79
88
|
"spec/hg18/cnplocke_spec.rb",
|
80
89
|
"spec/hg18/cnpredon_spec.rb",
|
@@ -82,13 +91,16 @@ Gem::Specification.new do |s|
|
|
82
91
|
"spec/hg18/cnpsharp2_spec.rb",
|
83
92
|
"spec/hg18/db_connection_spec.rb",
|
84
93
|
"spec/hg18/dgv_spec.rb",
|
94
|
+
"spec/hg18/reference_spec.rb",
|
85
95
|
"spec/hg18/refgene_spec.rb",
|
86
96
|
"spec/hg18/rmsk_spec.rb",
|
87
97
|
"spec/hg19/ccdsgene_spec.rb",
|
88
98
|
"spec/hg19/cytoband_spec.rb",
|
89
99
|
"spec/hg19/db_connection_spec.rb",
|
100
|
+
"spec/hg19/description_spec.rb",
|
90
101
|
"spec/hg19/dgv_spec.rb",
|
91
102
|
"spec/hg19/ensgene_spec.rb",
|
103
|
+
"spec/hg19/gbcdnainfo_spec.rb",
|
92
104
|
"spec/hg19/gwascatalog_spec.rb",
|
93
105
|
"spec/hg19/hapmapalleleschimp_spec.rb",
|
94
106
|
"spec/hg19/hapmapallelesmacaque_spec.rb",
|
@@ -103,24 +115,28 @@ Gem::Specification.new do |s|
|
|
103
115
|
"spec/hg19/hapmapsnpsmkk_spec.rb",
|
104
116
|
"spec/hg19/hapmapsnpstsi_spec.rb",
|
105
117
|
"spec/hg19/hapmapsnpsyri_spec.rb",
|
118
|
+
"spec/hg19/kgxref_spec.rb",
|
106
119
|
"spec/hg19/knowngene_spec.rb",
|
107
120
|
"spec/hg19/omimGene_spec.rb",
|
108
121
|
"spec/hg19/phastconselements46wayprimates_spec.rb",
|
109
122
|
"spec/hg19/phyloP46wayPrimates_spec.rb",
|
123
|
+
"spec/hg19/reference_spec.rb",
|
110
124
|
"spec/hg19/refgene_spec.rb",
|
125
|
+
"spec/hg19/refseqsummary_spec.rb",
|
111
126
|
"spec/hg19/rmsk_spec.rb",
|
112
127
|
"spec/hg19/snp132Flagged_spec.rb",
|
113
128
|
"spec/hg19/snp132_spec.rb",
|
114
129
|
"spec/hg19/snp132codingdbsnp_spec.rb",
|
115
130
|
"spec/hg19/snp132common_spec.rb",
|
116
131
|
"spec/hg19/snp132mult_spec.rb",
|
132
|
+
"spec/hg19/trnas_spec.rb",
|
117
133
|
"spec/hg19/wgrna_spec.rb",
|
118
134
|
"spec/spec_helper.rb"
|
119
135
|
]
|
120
136
|
s.homepage = %q{http://github.com/misshie/bioruby-ucsc-api}
|
121
137
|
s.licenses = ["Ruby (Ruby's/GPLv2 dual)"]
|
122
138
|
s.require_paths = ["lib"]
|
123
|
-
s.rubygems_version = %q{1.
|
139
|
+
s.rubygems_version = %q{1.7.2}
|
124
140
|
s.summary = %q{A Bioruby plugin: an API for UCSC Genome Browser (experimental)}
|
125
141
|
s.test_files = [
|
126
142
|
"spec/hg18/cnpiafrate2_spec.rb",
|
@@ -130,13 +146,16 @@ Gem::Specification.new do |s|
|
|
130
146
|
"spec/hg18/cnpsharp2_spec.rb",
|
131
147
|
"spec/hg18/db_connection_spec.rb",
|
132
148
|
"spec/hg18/dgv_spec.rb",
|
149
|
+
"spec/hg18/reference_spec.rb",
|
133
150
|
"spec/hg18/refgene_spec.rb",
|
134
151
|
"spec/hg18/rmsk_spec.rb",
|
135
152
|
"spec/hg19/ccdsgene_spec.rb",
|
136
153
|
"spec/hg19/cytoband_spec.rb",
|
137
154
|
"spec/hg19/db_connection_spec.rb",
|
155
|
+
"spec/hg19/description_spec.rb",
|
138
156
|
"spec/hg19/dgv_spec.rb",
|
139
157
|
"spec/hg19/ensgene_spec.rb",
|
158
|
+
"spec/hg19/gbcdnainfo_spec.rb",
|
140
159
|
"spec/hg19/gwascatalog_spec.rb",
|
141
160
|
"spec/hg19/hapmapalleleschimp_spec.rb",
|
142
161
|
"spec/hg19/hapmapallelesmacaque_spec.rb",
|
@@ -151,23 +170,26 @@ Gem::Specification.new do |s|
|
|
151
170
|
"spec/hg19/hapmapsnpsmkk_spec.rb",
|
152
171
|
"spec/hg19/hapmapsnpstsi_spec.rb",
|
153
172
|
"spec/hg19/hapmapsnpsyri_spec.rb",
|
173
|
+
"spec/hg19/kgxref_spec.rb",
|
154
174
|
"spec/hg19/knowngene_spec.rb",
|
155
175
|
"spec/hg19/omimGene_spec.rb",
|
156
176
|
"spec/hg19/phastconselements46wayprimates_spec.rb",
|
157
177
|
"spec/hg19/phyloP46wayPrimates_spec.rb",
|
178
|
+
"spec/hg19/reference_spec.rb",
|
158
179
|
"spec/hg19/refgene_spec.rb",
|
180
|
+
"spec/hg19/refseqsummary_spec.rb",
|
159
181
|
"spec/hg19/rmsk_spec.rb",
|
160
182
|
"spec/hg19/snp132Flagged_spec.rb",
|
161
183
|
"spec/hg19/snp132_spec.rb",
|
162
184
|
"spec/hg19/snp132codingdbsnp_spec.rb",
|
163
185
|
"spec/hg19/snp132common_spec.rb",
|
164
186
|
"spec/hg19/snp132mult_spec.rb",
|
187
|
+
"spec/hg19/trnas_spec.rb",
|
165
188
|
"spec/hg19/wgrna_spec.rb",
|
166
189
|
"spec/spec_helper.rb"
|
167
190
|
]
|
168
191
|
|
169
192
|
if s.respond_to? :specification_version then
|
170
|
-
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
171
193
|
s.specification_version = 3
|
172
194
|
|
173
195
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
@@ -181,7 +203,7 @@ Gem::Specification.new do |s|
|
|
181
203
|
s.add_development_dependency(%q<bio>, [">= 1.4.1"])
|
182
204
|
s.add_runtime_dependency(%q<activerecord>, [">= 3.0.0"])
|
183
205
|
s.add_runtime_dependency(%q<activesupport>, [">= 3.0.0"])
|
184
|
-
s.add_runtime_dependency(%q<bio-genomic-interval>, [">= 0.1.
|
206
|
+
s.add_runtime_dependency(%q<bio-genomic-interval>, [">= 0.1.2"])
|
185
207
|
else
|
186
208
|
s.add_dependency(%q<activerecord>, [">= 3.0.7"])
|
187
209
|
s.add_dependency(%q<mysql>, [">= 2.8.1"])
|
@@ -193,7 +215,7 @@ Gem::Specification.new do |s|
|
|
193
215
|
s.add_dependency(%q<bio>, [">= 1.4.1"])
|
194
216
|
s.add_dependency(%q<activerecord>, [">= 3.0.0"])
|
195
217
|
s.add_dependency(%q<activesupport>, [">= 3.0.0"])
|
196
|
-
s.add_dependency(%q<bio-genomic-interval>, [">= 0.1.
|
218
|
+
s.add_dependency(%q<bio-genomic-interval>, [">= 0.1.2"])
|
197
219
|
end
|
198
220
|
else
|
199
221
|
s.add_dependency(%q<activerecord>, [">= 3.0.7"])
|
@@ -206,7 +228,7 @@ Gem::Specification.new do |s|
|
|
206
228
|
s.add_dependency(%q<bio>, [">= 1.4.1"])
|
207
229
|
s.add_dependency(%q<activerecord>, [">= 3.0.0"])
|
208
230
|
s.add_dependency(%q<activesupport>, [">= 3.0.0"])
|
209
|
-
s.add_dependency(%q<bio-genomic-interval>, [">= 0.1.
|
231
|
+
s.add_dependency(%q<bio-genomic-interval>, [">= 0.1.2"])
|
210
232
|
end
|
211
233
|
end
|
212
234
|
|
data/lib/bio-ucsc.rb
CHANGED
@@ -11,8 +11,8 @@ require "bio-genomic-interval"
|
|
11
11
|
module Bio
|
12
12
|
module Ucsc
|
13
13
|
base = File.dirname(__FILE__)
|
14
|
-
autoload :Hg18,
|
15
|
-
autoload :Hg19,
|
14
|
+
autoload :Hg18, "#{base}/bio-ucsc/hg18"
|
15
|
+
autoload :Hg19, "#{base}/bio-ucsc/hg19"
|
16
16
|
|
17
17
|
# ToDo
|
18
18
|
# autoload :Mm9, "#{base}/bio-ucsc/mm9"
|
data/lib/bio-ucsc/hg18.rb
CHANGED
@@ -14,6 +14,9 @@ module Bio
|
|
14
14
|
module Ucsc
|
15
15
|
module Hg18
|
16
16
|
base = File.dirname(__FILE__)
|
17
|
+
# Reference sequence retrieval via the 2bit file
|
18
|
+
autoload :Reference, "#{base}/hg18/reference"
|
19
|
+
|
17
20
|
autoload :RefGene, "#{base}/hg18/refgene"
|
18
21
|
autoload :Dgv, "#{base}/hg18/dgv"
|
19
22
|
autoload :CnpIafrate2, "#{base}/hg18/cnpiafrate2"
|
@@ -0,0 +1,171 @@
|
|
1
|
+
#
|
2
|
+
# = reference.rb
|
3
|
+
# handle UCSC's 2bit file (locally stored) to retrieve the reference sequence
|
4
|
+
#
|
5
|
+
# Copyright:: Copyright (C) 2011
|
6
|
+
# MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
|
7
|
+
# License:: Ruby license (Ruby's / GPLv2 dual)
|
8
|
+
|
9
|
+
# require 'bio'
|
10
|
+
|
11
|
+
module Bio
|
12
|
+
module Ucsc
|
13
|
+
module Hg18
|
14
|
+
TwoBitHeader =
|
15
|
+
Struct.new(:signature, :version, :sequence_count, :reserved)
|
16
|
+
TwoBitRecord =
|
17
|
+
Struct.new(:dna_size,
|
18
|
+
:n_block_intervals, :mask_block_intervals,
|
19
|
+
:reserved, :packed_dna_offset)
|
20
|
+
|
21
|
+
class ByteQueue
|
22
|
+
def initialize(str)
|
23
|
+
@str = str
|
24
|
+
@index = 0
|
25
|
+
end
|
26
|
+
|
27
|
+
attr_accessor :index
|
28
|
+
|
29
|
+
def next(n)
|
30
|
+
result = @str[@index, n]
|
31
|
+
@index += n
|
32
|
+
result
|
33
|
+
end
|
34
|
+
end # class ByteQueue
|
35
|
+
|
36
|
+
class Reference
|
37
|
+
BINCODE = {0b00 => "T", 0b01 => "C", 0b10 => "A", 0b11 => "G"}
|
38
|
+
|
39
|
+
cattr_reader :filename, :header, :offsets
|
40
|
+
|
41
|
+
def self.load(filename)
|
42
|
+
two_bit = nil
|
43
|
+
open(filename, 'rb') {|f| two_bit = f.read}
|
44
|
+
@@tbq = ByteQueue.new(two_bit)
|
45
|
+
@@filename = filename
|
46
|
+
|
47
|
+
twobit_header = TwoBitHeader.new
|
48
|
+
twobit_header.signature = @@tbq.next(4).unpack('L').first
|
49
|
+
twobit_header.version = @@tbq.next(4).unpack('L').first
|
50
|
+
twobit_header.sequence_count = @@tbq.next(4).unpack('L').first
|
51
|
+
twobit_header.reserved = @@tbq.next(4).unpack('L').first
|
52
|
+
@@header = twobit_header
|
53
|
+
|
54
|
+
@@offsets = Hash.new
|
55
|
+
@@header.sequence_count.times do
|
56
|
+
name_length = @@tbq.next(1).unpack('C').first
|
57
|
+
@@offsets[@@tbq.next(name_length).unpack('a*').first] =
|
58
|
+
@@tbq.next(4).unpack('L').first
|
59
|
+
end
|
60
|
+
@@records = Hash.new
|
61
|
+
end
|
62
|
+
|
63
|
+
def self.records(chrom)
|
64
|
+
return @@records[chrom] if @@records[chrom]
|
65
|
+
|
66
|
+
@@tbq.index = @@offsets[chrom]
|
67
|
+
@@records[chrom] = TwoBitRecord.new
|
68
|
+
@@records[chrom].dna_size = @@tbq.next(4).unpack('L').first
|
69
|
+
|
70
|
+
n_block_count = @@tbq.next(4).unpack('L').first
|
71
|
+
n_block_starts = Array.new
|
72
|
+
n_block_count.times do
|
73
|
+
n_block_starts << @@tbq.next(4).unpack('L').first
|
74
|
+
end
|
75
|
+
n_block_sizes = Array.new
|
76
|
+
n_block_count.times do
|
77
|
+
n_block_sizes << @@tbq.next(4).unpack('L').first
|
78
|
+
end
|
79
|
+
@@records[chrom].n_block_intervals = Array.new
|
80
|
+
n_block_count.times do |idx|
|
81
|
+
@@records[chrom].n_block_intervals <<
|
82
|
+
Bio::GenomicInterval.zero_based(chrom,
|
83
|
+
n_block_starts[idx],
|
84
|
+
n_block_starts[idx]+n_block_sizes[idx])
|
85
|
+
end
|
86
|
+
|
87
|
+
mask_block_count = @@tbq.next(4).unpack('L').first
|
88
|
+
mask_block_starts = Array.new
|
89
|
+
mask_block_count.times do
|
90
|
+
mask_block_starts << @@tbq.next(4).unpack('L').first
|
91
|
+
end
|
92
|
+
mask_block_sizes = Array.new
|
93
|
+
mask_block_count.times do
|
94
|
+
mask_block_sizes << @@tbq.next(4).unpack('L').first
|
95
|
+
end
|
96
|
+
@@records[chrom].mask_block_intervals = Array.new
|
97
|
+
mask_block_count.times do |idx|
|
98
|
+
@@records[chrom].mask_block_intervals <<
|
99
|
+
Bio::GenomicInterval.zero_based(chrom,
|
100
|
+
mask_block_starts[idx],
|
101
|
+
mask_block_starts[idx]+mask_block_sizes[idx])
|
102
|
+
end
|
103
|
+
|
104
|
+
@@records[chrom].reserved = @@tbq.next(4).unpack('L').first
|
105
|
+
@@records[chrom].packed_dna_offset = @@tbq.index
|
106
|
+
|
107
|
+
@@records[chrom]
|
108
|
+
end
|
109
|
+
|
110
|
+
def self.find_by_interval(interval)
|
111
|
+
seq = self.find_by_interval_raw(interval)
|
112
|
+
@@records[interval.chrom].n_block_intervals.map do |nb|
|
113
|
+
if interval.overlapped?(nb)
|
114
|
+
case interval.compare(nb)
|
115
|
+
when :equal,:contained_by
|
116
|
+
seq = 'N' * interval.overlap(nb)
|
117
|
+
when :contains
|
118
|
+
left_len = nb.chr_start - interval.chr_start + 1
|
119
|
+
right_len = interval.chr_end - nb.chr_end + 1
|
120
|
+
seq[0, left_len] = 'N' * left_len
|
121
|
+
seq[-right_len, right_len] = 'N' * right_len
|
122
|
+
when :left_overlapped
|
123
|
+
left_len = nb.chr_end - interval.chr_start + 1
|
124
|
+
seq[0, left_len] = 'N' * left_len
|
125
|
+
when :right_overlapped
|
126
|
+
right_len = interval.chr_end - nb.chr_start + 1
|
127
|
+
seq[-right_len, right_len] = 'N' * right_len
|
128
|
+
when :right_adjacent, :right_off
|
129
|
+
# expecting that N-blocks are sorted
|
130
|
+
# return Bio::Sequence::NA.new(seq)
|
131
|
+
seq
|
132
|
+
end
|
133
|
+
end
|
134
|
+
end
|
135
|
+
#Bio::Sequence::NA.new(seq)
|
136
|
+
seq
|
137
|
+
end
|
138
|
+
|
139
|
+
def self.find_by_interval_raw(interval)
|
140
|
+
byte_count, byte_mod = interval.zero_start.divmod 4
|
141
|
+
chrom_top = self.records(interval.chrom).packed_dna_offset
|
142
|
+
div_start, mod_start = interval.zero_start.divmod 4
|
143
|
+
div_end, mod_end = interval.zero_end.divmod 4
|
144
|
+
div_len, mod_len = interval.length.divmod 4
|
145
|
+
|
146
|
+
byte_length = div_end - div_start + 1
|
147
|
+
@@tbq.index = chrom_top + div_start
|
148
|
+
bytes = @@tbq.next(byte_length).unpack('C*')
|
149
|
+
seq = Bio::Ucsc::Hg18::Reference.bytes_to_nucleotides(bytes)
|
150
|
+
seq[mod_start..(-1-(4-mod_end))]
|
151
|
+
end
|
152
|
+
|
153
|
+
def self.bytes_to_nucleotides(bytes)
|
154
|
+
results = ""
|
155
|
+
bytes.each do |byte|
|
156
|
+
results << Bio::Ucsc::Hg18::Reference.byte_to_nucleotides(byte)
|
157
|
+
end
|
158
|
+
results
|
159
|
+
end
|
160
|
+
|
161
|
+
def self.byte_to_nucleotides(byte)
|
162
|
+
BINCODE[byte >> 6] +
|
163
|
+
BINCODE[(byte >> 4) & 0b11] +
|
164
|
+
BINCODE[(byte >> 2) & 0b11] +
|
165
|
+
BINCODE[byte & 0b11]
|
166
|
+
end
|
167
|
+
end # class Reference
|
168
|
+
|
169
|
+
end # module Hg18
|
170
|
+
end # module Ucsc
|
171
|
+
end # module Bio
|
data/lib/bio-ucsc/hg19.rb
CHANGED
@@ -13,24 +13,42 @@ module Bio
|
|
13
13
|
module Ucsc
|
14
14
|
module Hg19
|
15
15
|
base = File.dirname(__FILE__)
|
16
|
-
autoload :Dgv, "#{base}/hg19/dgv"
|
17
16
|
|
17
|
+
# Reference sequence retrieval via the 2bit file
|
18
|
+
autoload :Reference, "#{base}/hg19/reference"
|
19
|
+
|
20
|
+
# group: Mapping and Sequencing Tracks
|
21
|
+
## track: Chromosome Band
|
22
|
+
autoload :CytoBand, "#{base}/hg19/cytoband"
|
23
|
+
|
24
|
+
# group: Genes and Gene Prediction Tracks
|
25
|
+
## track: UCSC Genes
|
26
|
+
autoload :KnownGene, "#{base}/hg19/knowngene"
|
27
|
+
|
28
|
+
## track: RefSeq Genes
|
29
|
+
autoload :RefGene, "#{base}/hg19/refgene"
|
30
|
+
|
31
|
+
## track: Ensembl Genes
|
32
|
+
autoload :EnsGene, "#{base}/hg19/ensgene"
|
33
|
+
|
34
|
+
## track: sno/miRNA
|
35
|
+
autoload :WgRna, "#{base}/hg19/wgrna"
|
36
|
+
|
37
|
+
## track: tRNA Genes
|
38
|
+
autoload :TRNAs, "#{base}/hg19/trnas"
|
39
|
+
|
40
|
+
# group: Variation and Repeats
|
41
|
+
## track: SNPs(131)
|
18
42
|
autoload :Snp131, "#{base}/hg19/snp131"
|
19
43
|
|
44
|
+
## tracks: All/Common/Flagged/Mult SNPs(132)
|
20
45
|
autoload :Snp132, "#{base}/hg19/snp132"
|
21
46
|
autoload :Snp132Common, "#{base}/hg19/snp132common"
|
22
47
|
autoload :Snp132Flagged, "#{base}/hg19/snp132flagged"
|
23
48
|
autoload :Snp132Mult, "#{base}/hg19/snp132mult"
|
24
49
|
autoload :Snp132CodingDbSnp, "#{base}/hg19/snp132codingdbsnp"
|
25
50
|
|
26
|
-
|
27
|
-
autoload :RefGene, "#{base}/hg19/refgene"
|
28
|
-
autoload :GwasCatalog, "#{base}/hg19/gwascatalog"
|
29
|
-
autoload :CytoBand, "#{base}/hg19/cytoband"
|
30
|
-
autoload :OmimGene, "#{base}/hg19/omimgene"
|
31
|
-
autoload :WgRna, "#{base}/hg19/wgrna"
|
32
|
-
autoload :EnsGene, "#{base}/hg19/ensgene"
|
33
|
-
|
51
|
+
## track: HapMap SNPs
|
34
52
|
autoload :HapMapSnpsASW, "#{base}/hg19/hapmapsnpsasw"
|
35
53
|
autoload :HapMapSnpsCEU, "#{base}/hg19/hapmapsnpsceu"
|
36
54
|
autoload :HapMapSnpsCHB, "#{base}/hg19/hapmapsnpschb"
|
@@ -45,10 +63,29 @@ module Bio
|
|
45
63
|
autoload :HapMapAllelesChimp, "#{base}/hg19/hapmapalleleschimp"
|
46
64
|
autoload :HapMapAllelesMacaque, "#{base}/hg19/hapmapallelesmacaque"
|
47
65
|
|
66
|
+
## track: RepeatMasker
|
48
67
|
autoload :Rmsk, "#{base}/hg19/rmsk"
|
49
68
|
|
69
|
+
## track: DGV Struct Var
|
70
|
+
autoload :Dgv, "#{base}/hg19/dgv"
|
71
|
+
|
72
|
+
# group: Comparative Genomics
|
73
|
+
## track: Conservation
|
50
74
|
autoload :PhyloP46wayPrimates, "#{base}/hg19/phylop46wayprimates"
|
51
75
|
autoload :PhastConsElements46wayPrimates, "#{base}/hg19/phastconselements46wayprimates"
|
76
|
+
|
77
|
+
# group: Phenotype and Disease Association
|
78
|
+
## track: OMIM Genes
|
79
|
+
autoload :OmimGene, "#{base}/hg19/omimgene"
|
80
|
+
|
81
|
+
## track: GWAS Catalog
|
82
|
+
autoload :GwasCatalog, "#{base}/hg19/gwascatalog"
|
83
|
+
|
84
|
+
# information tables behind tracks
|
85
|
+
autoload :Description, "#{base}/hg19/description"
|
86
|
+
autoload :GbCdnaInfo, "#{base}/hg19/gbcdnainfo"
|
87
|
+
autoload :KgXref, "#{base}/hg19/kgxref"
|
88
|
+
autoload :RefSeqSummary, "#{base}/hg19/refseqsummary"
|
52
89
|
end
|
53
90
|
end
|
54
91
|
end
|