bio-ucsc-api 0.0.1 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +9 -2
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/bio-ucsc-api.gemspec +29 -7
- data/lib/bio-ucsc.rb +2 -2
- data/lib/bio-ucsc/hg18.rb +3 -0
- data/lib/bio-ucsc/hg18/reference.rb +171 -0
- data/lib/bio-ucsc/hg19.rb +46 -9
- data/lib/bio-ucsc/hg19/description.rb +23 -0
- data/lib/bio-ucsc/hg19/gbcdnainfo.rb +24 -0
- data/lib/bio-ucsc/hg19/kgxref.rb +19 -0
- data/lib/bio-ucsc/hg19/reference.rb +171 -0
- data/lib/bio-ucsc/hg19/refseqsummary.rb +23 -0
- data/lib/bio-ucsc/hg19/trnas.rb +20 -0
- data/samples/hg19-2bit-retrieve.rb +39 -0
- data/samples/hg19-sample.rb +22 -1
- data/samples/symbol2summary.rb +47 -0
- data/spec/hg18/reference_spec.rb +144 -0
- data/spec/hg19/description_spec.rb +25 -0
- data/spec/hg19/gbcdnainfo_spec.rb +25 -0
- data/spec/hg19/kgxref_spec.rb +14 -0
- data/spec/hg19/reference_spec.rb +137 -0
- data/spec/hg19/refseqsummary_spec.rb +15 -0
- data/spec/hg19/trnas_spec.rb +23 -0
- metadata +29 -56
data/README.rdoc
CHANGED
@@ -23,16 +23,18 @@ http://github.com/misshie/bioruby-ucsc-api/issues
|
|
23
23
|
== Features
|
24
24
|
|
25
25
|
* Designed as a BioRuby plugin
|
26
|
+
* Using ActiveRecord as an O/R mapping framework
|
26
27
|
* Using the Bin index system to improve query performance (see https://github.com/misshie/UCSCBin )
|
27
28
|
* Automatic conversion of "1-based full-closed intervals" to internal "0-based half-closed intervals" (see also bioruby-genomic-interval)
|
28
29
|
* Supporting non-official MySql hosts (e.g. local servers)
|
30
|
+
* NEW: Supporting locally-stored '2bit' files, which can be downloaded from the UCSC site, to retrieve reference sequences. Unknown "N" nucleotide blocks are supported; however, "mask-blocks", which are shown in lower-case in UCSC's DNA function, are not supported yet.
|
29
31
|
* Using Rspec for the testing framework
|
30
32
|
* Trying to support whole hg19 and hg18 databases.
|
31
33
|
|
32
|
-
|
33
34
|
== Install
|
34
35
|
|
35
|
-
|
36
|
+
$ gem install bio-ucsc-api
|
37
|
+
(you may need to be root or use "sudo")
|
36
38
|
|
37
39
|
== How to Use
|
38
40
|
|
@@ -48,6 +50,11 @@ http://github.com/misshie/bioruby-ucsc-api/issues
|
|
48
50
|
|
49
51
|
p Bio::Ucsc::Hg19::Snp131.find_by_name("rs56289060")
|
50
52
|
|
53
|
+
# retrieve reference sequence from a locally-stored 2bit file
|
54
|
+
Bio::Ucsc::Hg19::Reference.load("hg19.2bit")
|
55
|
+
itv = Bio::GenomicInterval.parse("chr1:9,500-10,999")
|
56
|
+
p Bio::Ucsc::Hg19::Reference.find_by_interval(itv)
|
57
|
+
|
51
58
|
== Copyright
|
52
59
|
Copyright:: (c) 2011 MISHIMA, Hiroyuki (missy at be.to / hmishima at nagasaki-u.ac.jp)
|
53
60
|
|
data/Rakefile
CHANGED
@@ -28,7 +28,7 @@ Jeweler::Tasks.new do |gem|
|
|
28
28
|
|
29
29
|
gem.add_runtime_dependency 'activerecord', '>= 3.0.0'
|
30
30
|
gem.add_runtime_dependency 'activesupport', '>= 3.0.0'
|
31
|
-
gem.add_runtime_dependency 'bio-genomic-interval', '>= 0.1.
|
31
|
+
gem.add_runtime_dependency 'bio-genomic-interval', '>= 0.1.2'
|
32
32
|
end
|
33
33
|
|
34
34
|
Jeweler::RubygemsDotOrgTasks.new
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.
|
1
|
+
0.0.3
|
data/bio-ucsc-api.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{bio-ucsc-api}
|
8
|
-
s.version = "0.0.
|
8
|
+
s.version = "0.0.3"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["MISHIMA, Hiroyuki", "Francesco Strozzi", "Jan Aerts"]
|
12
|
-
s.date = %q{2011-04-
|
12
|
+
s.date = %q{2011-04-25}
|
13
13
|
s.description = %q{A Bioruby plugin: an API for UCSC Genome Browser (experimental)}
|
14
14
|
s.email = %q{missy@be.to}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -36,6 +36,7 @@ Gem::Specification.new do |s|
|
|
36
36
|
"lib/bio-ucsc/hg18/cnpsharp2.rb",
|
37
37
|
"lib/bio-ucsc/hg18/db_connection.rb",
|
38
38
|
"lib/bio-ucsc/hg18/dgv.rb",
|
39
|
+
"lib/bio-ucsc/hg18/reference.rb",
|
39
40
|
"lib/bio-ucsc/hg18/refgene.rb",
|
40
41
|
"lib/bio-ucsc/hg18/rmsk.rb",
|
41
42
|
"lib/bio-ucsc/hg18/tables.rb",
|
@@ -44,8 +45,10 @@ Gem::Specification.new do |s|
|
|
44
45
|
"lib/bio-ucsc/hg19/ccdsgene.rb",
|
45
46
|
"lib/bio-ucsc/hg19/cytoband.rb",
|
46
47
|
"lib/bio-ucsc/hg19/db_connection.rb",
|
48
|
+
"lib/bio-ucsc/hg19/description.rb",
|
47
49
|
"lib/bio-ucsc/hg19/dgv.rb",
|
48
50
|
"lib/bio-ucsc/hg19/ensgene.rb",
|
51
|
+
"lib/bio-ucsc/hg19/gbcdnainfo.rb",
|
49
52
|
"lib/bio-ucsc/hg19/gwascatalog.rb",
|
50
53
|
"lib/bio-ucsc/hg19/hapmapalleleschimp.rb",
|
51
54
|
"lib/bio-ucsc/hg19/hapmapallelesmacaque.rb",
|
@@ -60,11 +63,14 @@ Gem::Specification.new do |s|
|
|
60
63
|
"lib/bio-ucsc/hg19/hapmapsnpsmkk.rb",
|
61
64
|
"lib/bio-ucsc/hg19/hapmapsnpstsi.rb",
|
62
65
|
"lib/bio-ucsc/hg19/hapmapsnpsyri.rb",
|
66
|
+
"lib/bio-ucsc/hg19/kgxref.rb",
|
63
67
|
"lib/bio-ucsc/hg19/knowngene.rb",
|
64
68
|
"lib/bio-ucsc/hg19/omimgene.rb",
|
65
69
|
"lib/bio-ucsc/hg19/phastconselements46wayprimates.rb",
|
66
70
|
"lib/bio-ucsc/hg19/phylop46wayprimates.rb",
|
71
|
+
"lib/bio-ucsc/hg19/reference.rb",
|
67
72
|
"lib/bio-ucsc/hg19/refgene.rb",
|
73
|
+
"lib/bio-ucsc/hg19/refseqsummary.rb",
|
68
74
|
"lib/bio-ucsc/hg19/rmsk.rb",
|
69
75
|
"lib/bio-ucsc/hg19/snp131.rb",
|
70
76
|
"lib/bio-ucsc/hg19/snp132.rb",
|
@@ -72,9 +78,12 @@ Gem::Specification.new do |s|
|
|
72
78
|
"lib/bio-ucsc/hg19/snp132common.rb",
|
73
79
|
"lib/bio-ucsc/hg19/snp132flagged.rb",
|
74
80
|
"lib/bio-ucsc/hg19/snp132mult.rb",
|
81
|
+
"lib/bio-ucsc/hg19/trnas.rb",
|
75
82
|
"lib/bio-ucsc/hg19/wgrna.rb",
|
76
83
|
"lib/bio-ucsc/ucsc_bin.rb",
|
84
|
+
"samples/hg19-2bit-retrieve.rb",
|
77
85
|
"samples/hg19-sample.rb",
|
86
|
+
"samples/symbol2summary.rb",
|
78
87
|
"spec/hg18/cnpiafrate2_spec.rb",
|
79
88
|
"spec/hg18/cnplocke_spec.rb",
|
80
89
|
"spec/hg18/cnpredon_spec.rb",
|
@@ -82,13 +91,16 @@ Gem::Specification.new do |s|
|
|
82
91
|
"spec/hg18/cnpsharp2_spec.rb",
|
83
92
|
"spec/hg18/db_connection_spec.rb",
|
84
93
|
"spec/hg18/dgv_spec.rb",
|
94
|
+
"spec/hg18/reference_spec.rb",
|
85
95
|
"spec/hg18/refgene_spec.rb",
|
86
96
|
"spec/hg18/rmsk_spec.rb",
|
87
97
|
"spec/hg19/ccdsgene_spec.rb",
|
88
98
|
"spec/hg19/cytoband_spec.rb",
|
89
99
|
"spec/hg19/db_connection_spec.rb",
|
100
|
+
"spec/hg19/description_spec.rb",
|
90
101
|
"spec/hg19/dgv_spec.rb",
|
91
102
|
"spec/hg19/ensgene_spec.rb",
|
103
|
+
"spec/hg19/gbcdnainfo_spec.rb",
|
92
104
|
"spec/hg19/gwascatalog_spec.rb",
|
93
105
|
"spec/hg19/hapmapalleleschimp_spec.rb",
|
94
106
|
"spec/hg19/hapmapallelesmacaque_spec.rb",
|
@@ -103,24 +115,28 @@ Gem::Specification.new do |s|
|
|
103
115
|
"spec/hg19/hapmapsnpsmkk_spec.rb",
|
104
116
|
"spec/hg19/hapmapsnpstsi_spec.rb",
|
105
117
|
"spec/hg19/hapmapsnpsyri_spec.rb",
|
118
|
+
"spec/hg19/kgxref_spec.rb",
|
106
119
|
"spec/hg19/knowngene_spec.rb",
|
107
120
|
"spec/hg19/omimGene_spec.rb",
|
108
121
|
"spec/hg19/phastconselements46wayprimates_spec.rb",
|
109
122
|
"spec/hg19/phyloP46wayPrimates_spec.rb",
|
123
|
+
"spec/hg19/reference_spec.rb",
|
110
124
|
"spec/hg19/refgene_spec.rb",
|
125
|
+
"spec/hg19/refseqsummary_spec.rb",
|
111
126
|
"spec/hg19/rmsk_spec.rb",
|
112
127
|
"spec/hg19/snp132Flagged_spec.rb",
|
113
128
|
"spec/hg19/snp132_spec.rb",
|
114
129
|
"spec/hg19/snp132codingdbsnp_spec.rb",
|
115
130
|
"spec/hg19/snp132common_spec.rb",
|
116
131
|
"spec/hg19/snp132mult_spec.rb",
|
132
|
+
"spec/hg19/trnas_spec.rb",
|
117
133
|
"spec/hg19/wgrna_spec.rb",
|
118
134
|
"spec/spec_helper.rb"
|
119
135
|
]
|
120
136
|
s.homepage = %q{http://github.com/misshie/bioruby-ucsc-api}
|
121
137
|
s.licenses = ["Ruby (Ruby's/GPLv2 dual)"]
|
122
138
|
s.require_paths = ["lib"]
|
123
|
-
s.rubygems_version = %q{1.
|
139
|
+
s.rubygems_version = %q{1.7.2}
|
124
140
|
s.summary = %q{A Bioruby plugin: an API for UCSC Genome Browser (experimental)}
|
125
141
|
s.test_files = [
|
126
142
|
"spec/hg18/cnpiafrate2_spec.rb",
|
@@ -130,13 +146,16 @@ Gem::Specification.new do |s|
|
|
130
146
|
"spec/hg18/cnpsharp2_spec.rb",
|
131
147
|
"spec/hg18/db_connection_spec.rb",
|
132
148
|
"spec/hg18/dgv_spec.rb",
|
149
|
+
"spec/hg18/reference_spec.rb",
|
133
150
|
"spec/hg18/refgene_spec.rb",
|
134
151
|
"spec/hg18/rmsk_spec.rb",
|
135
152
|
"spec/hg19/ccdsgene_spec.rb",
|
136
153
|
"spec/hg19/cytoband_spec.rb",
|
137
154
|
"spec/hg19/db_connection_spec.rb",
|
155
|
+
"spec/hg19/description_spec.rb",
|
138
156
|
"spec/hg19/dgv_spec.rb",
|
139
157
|
"spec/hg19/ensgene_spec.rb",
|
158
|
+
"spec/hg19/gbcdnainfo_spec.rb",
|
140
159
|
"spec/hg19/gwascatalog_spec.rb",
|
141
160
|
"spec/hg19/hapmapalleleschimp_spec.rb",
|
142
161
|
"spec/hg19/hapmapallelesmacaque_spec.rb",
|
@@ -151,23 +170,26 @@ Gem::Specification.new do |s|
|
|
151
170
|
"spec/hg19/hapmapsnpsmkk_spec.rb",
|
152
171
|
"spec/hg19/hapmapsnpstsi_spec.rb",
|
153
172
|
"spec/hg19/hapmapsnpsyri_spec.rb",
|
173
|
+
"spec/hg19/kgxref_spec.rb",
|
154
174
|
"spec/hg19/knowngene_spec.rb",
|
155
175
|
"spec/hg19/omimGene_spec.rb",
|
156
176
|
"spec/hg19/phastconselements46wayprimates_spec.rb",
|
157
177
|
"spec/hg19/phyloP46wayPrimates_spec.rb",
|
178
|
+
"spec/hg19/reference_spec.rb",
|
158
179
|
"spec/hg19/refgene_spec.rb",
|
180
|
+
"spec/hg19/refseqsummary_spec.rb",
|
159
181
|
"spec/hg19/rmsk_spec.rb",
|
160
182
|
"spec/hg19/snp132Flagged_spec.rb",
|
161
183
|
"spec/hg19/snp132_spec.rb",
|
162
184
|
"spec/hg19/snp132codingdbsnp_spec.rb",
|
163
185
|
"spec/hg19/snp132common_spec.rb",
|
164
186
|
"spec/hg19/snp132mult_spec.rb",
|
187
|
+
"spec/hg19/trnas_spec.rb",
|
165
188
|
"spec/hg19/wgrna_spec.rb",
|
166
189
|
"spec/spec_helper.rb"
|
167
190
|
]
|
168
191
|
|
169
192
|
if s.respond_to? :specification_version then
|
170
|
-
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
171
193
|
s.specification_version = 3
|
172
194
|
|
173
195
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
@@ -181,7 +203,7 @@ Gem::Specification.new do |s|
|
|
181
203
|
s.add_development_dependency(%q<bio>, [">= 1.4.1"])
|
182
204
|
s.add_runtime_dependency(%q<activerecord>, [">= 3.0.0"])
|
183
205
|
s.add_runtime_dependency(%q<activesupport>, [">= 3.0.0"])
|
184
|
-
s.add_runtime_dependency(%q<bio-genomic-interval>, [">= 0.1.
|
206
|
+
s.add_runtime_dependency(%q<bio-genomic-interval>, [">= 0.1.2"])
|
185
207
|
else
|
186
208
|
s.add_dependency(%q<activerecord>, [">= 3.0.7"])
|
187
209
|
s.add_dependency(%q<mysql>, [">= 2.8.1"])
|
@@ -193,7 +215,7 @@ Gem::Specification.new do |s|
|
|
193
215
|
s.add_dependency(%q<bio>, [">= 1.4.1"])
|
194
216
|
s.add_dependency(%q<activerecord>, [">= 3.0.0"])
|
195
217
|
s.add_dependency(%q<activesupport>, [">= 3.0.0"])
|
196
|
-
s.add_dependency(%q<bio-genomic-interval>, [">= 0.1.
|
218
|
+
s.add_dependency(%q<bio-genomic-interval>, [">= 0.1.2"])
|
197
219
|
end
|
198
220
|
else
|
199
221
|
s.add_dependency(%q<activerecord>, [">= 3.0.7"])
|
@@ -206,7 +228,7 @@ Gem::Specification.new do |s|
|
|
206
228
|
s.add_dependency(%q<bio>, [">= 1.4.1"])
|
207
229
|
s.add_dependency(%q<activerecord>, [">= 3.0.0"])
|
208
230
|
s.add_dependency(%q<activesupport>, [">= 3.0.0"])
|
209
|
-
s.add_dependency(%q<bio-genomic-interval>, [">= 0.1.
|
231
|
+
s.add_dependency(%q<bio-genomic-interval>, [">= 0.1.2"])
|
210
232
|
end
|
211
233
|
end
|
212
234
|
|
data/lib/bio-ucsc.rb
CHANGED
@@ -11,8 +11,8 @@ require "bio-genomic-interval"
|
|
11
11
|
module Bio
|
12
12
|
module Ucsc
|
13
13
|
base = File.dirname(__FILE__)
|
14
|
-
autoload :Hg18,
|
15
|
-
autoload :Hg19,
|
14
|
+
autoload :Hg18, "#{base}/bio-ucsc/hg18"
|
15
|
+
autoload :Hg19, "#{base}/bio-ucsc/hg19"
|
16
16
|
|
17
17
|
# ToDo
|
18
18
|
# autoload :Mm9, "#{base}/bio-ucsc/mm9"
|
data/lib/bio-ucsc/hg18.rb
CHANGED
@@ -14,6 +14,9 @@ module Bio
|
|
14
14
|
module Ucsc
|
15
15
|
module Hg18
|
16
16
|
base = File.dirname(__FILE__)
|
17
|
+
# Reference sequence retrieval via the 2bit file
|
18
|
+
autoload :Reference, "#{base}/hg18/reference"
|
19
|
+
|
17
20
|
autoload :RefGene, "#{base}/hg18/refgene"
|
18
21
|
autoload :Dgv, "#{base}/hg18/dgv"
|
19
22
|
autoload :CnpIafrate2, "#{base}/hg18/cnpiafrate2"
|
@@ -0,0 +1,171 @@
|
|
1
|
+
#
|
2
|
+
# = reference.rb
|
3
|
+
# handle UCSC's 2bit file (locally stored) to retrieve the reference sequence
|
4
|
+
#
|
5
|
+
# Copyright:: Copyright (C) 2011
|
6
|
+
# MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
|
7
|
+
# License:: Ruby license (Ruby's / GPLv2 dual)
|
8
|
+
|
9
|
+
# require 'bio'
|
10
|
+
|
11
|
+
module Bio
|
12
|
+
module Ucsc
|
13
|
+
module Hg18
|
14
|
+
TwoBitHeader =
|
15
|
+
Struct.new(:signature, :version, :sequence_count, :reserved)
|
16
|
+
TwoBitRecord =
|
17
|
+
Struct.new(:dna_size,
|
18
|
+
:n_block_intervals, :mask_block_intervals,
|
19
|
+
:reserved, :packed_dna_offset)
|
20
|
+
|
21
|
+
class ByteQueue
|
22
|
+
def initialize(str)
|
23
|
+
@str = str
|
24
|
+
@index = 0
|
25
|
+
end
|
26
|
+
|
27
|
+
attr_accessor :index
|
28
|
+
|
29
|
+
def next(n)
|
30
|
+
result = @str[@index, n]
|
31
|
+
@index += n
|
32
|
+
result
|
33
|
+
end
|
34
|
+
end # class ByteQueue
|
35
|
+
|
36
|
+
class Reference
|
37
|
+
BINCODE = {0b00 => "T", 0b01 => "C", 0b10 => "A", 0b11 => "G"}
|
38
|
+
|
39
|
+
cattr_reader :filename, :header, :offsets
|
40
|
+
|
41
|
+
def self.load(filename)
|
42
|
+
two_bit = nil
|
43
|
+
open(filename, 'rb') {|f| two_bit = f.read}
|
44
|
+
@@tbq = ByteQueue.new(two_bit)
|
45
|
+
@@filename = filename
|
46
|
+
|
47
|
+
twobit_header = TwoBitHeader.new
|
48
|
+
twobit_header.signature = @@tbq.next(4).unpack('L').first
|
49
|
+
twobit_header.version = @@tbq.next(4).unpack('L').first
|
50
|
+
twobit_header.sequence_count = @@tbq.next(4).unpack('L').first
|
51
|
+
twobit_header.reserved = @@tbq.next(4).unpack('L').first
|
52
|
+
@@header = twobit_header
|
53
|
+
|
54
|
+
@@offsets = Hash.new
|
55
|
+
@@header.sequence_count.times do
|
56
|
+
name_length = @@tbq.next(1).unpack('C').first
|
57
|
+
@@offsets[@@tbq.next(name_length).unpack('a*').first] =
|
58
|
+
@@tbq.next(4).unpack('L').first
|
59
|
+
end
|
60
|
+
@@records = Hash.new
|
61
|
+
end
|
62
|
+
|
63
|
+
def self.records(chrom)
|
64
|
+
return @@records[chrom] if @@records[chrom]
|
65
|
+
|
66
|
+
@@tbq.index = @@offsets[chrom]
|
67
|
+
@@records[chrom] = TwoBitRecord.new
|
68
|
+
@@records[chrom].dna_size = @@tbq.next(4).unpack('L').first
|
69
|
+
|
70
|
+
n_block_count = @@tbq.next(4).unpack('L').first
|
71
|
+
n_block_starts = Array.new
|
72
|
+
n_block_count.times do
|
73
|
+
n_block_starts << @@tbq.next(4).unpack('L').first
|
74
|
+
end
|
75
|
+
n_block_sizes = Array.new
|
76
|
+
n_block_count.times do
|
77
|
+
n_block_sizes << @@tbq.next(4).unpack('L').first
|
78
|
+
end
|
79
|
+
@@records[chrom].n_block_intervals = Array.new
|
80
|
+
n_block_count.times do |idx|
|
81
|
+
@@records[chrom].n_block_intervals <<
|
82
|
+
Bio::GenomicInterval.zero_based(chrom,
|
83
|
+
n_block_starts[idx],
|
84
|
+
n_block_starts[idx]+n_block_sizes[idx])
|
85
|
+
end
|
86
|
+
|
87
|
+
mask_block_count = @@tbq.next(4).unpack('L').first
|
88
|
+
mask_block_starts = Array.new
|
89
|
+
mask_block_count.times do
|
90
|
+
mask_block_starts << @@tbq.next(4).unpack('L').first
|
91
|
+
end
|
92
|
+
mask_block_sizes = Array.new
|
93
|
+
mask_block_count.times do
|
94
|
+
mask_block_sizes << @@tbq.next(4).unpack('L').first
|
95
|
+
end
|
96
|
+
@@records[chrom].mask_block_intervals = Array.new
|
97
|
+
mask_block_count.times do |idx|
|
98
|
+
@@records[chrom].mask_block_intervals <<
|
99
|
+
Bio::GenomicInterval.zero_based(chrom,
|
100
|
+
mask_block_starts[idx],
|
101
|
+
mask_block_starts[idx]+mask_block_sizes[idx])
|
102
|
+
end
|
103
|
+
|
104
|
+
@@records[chrom].reserved = @@tbq.next(4).unpack('L').first
|
105
|
+
@@records[chrom].packed_dna_offset = @@tbq.index
|
106
|
+
|
107
|
+
@@records[chrom]
|
108
|
+
end
|
109
|
+
|
110
|
+
def self.find_by_interval(interval)
|
111
|
+
seq = self.find_by_interval_raw(interval)
|
112
|
+
@@records[interval.chrom].n_block_intervals.map do |nb|
|
113
|
+
if interval.overlapped?(nb)
|
114
|
+
case interval.compare(nb)
|
115
|
+
when :equal,:contained_by
|
116
|
+
seq = 'N' * interval.overlap(nb)
|
117
|
+
when :contains
|
118
|
+
left_len = nb.chr_start - interval.chr_start + 1
|
119
|
+
right_len = interval.chr_end - nb.chr_end + 1
|
120
|
+
seq[0, left_len] = 'N' * left_len
|
121
|
+
seq[-right_len, right_len] = 'N' * right_len
|
122
|
+
when :left_overlapped
|
123
|
+
left_len = nb.chr_end - interval.chr_start + 1
|
124
|
+
seq[0, left_len] = 'N' * left_len
|
125
|
+
when :right_overlapped
|
126
|
+
right_len = interval.chr_end - nb.chr_start + 1
|
127
|
+
seq[-right_len, right_len] = 'N' * right_len
|
128
|
+
when :right_adjacent, :right_off
|
129
|
+
# expecting that N-blocks are sorted
|
130
|
+
# return Bio::Sequence::NA.new(seq)
|
131
|
+
seq
|
132
|
+
end
|
133
|
+
end
|
134
|
+
end
|
135
|
+
#Bio::Sequence::NA.new(seq)
|
136
|
+
seq
|
137
|
+
end
|
138
|
+
|
139
|
+
def self.find_by_interval_raw(interval)
|
140
|
+
byte_count, byte_mod = interval.zero_start.divmod 4
|
141
|
+
chrom_top = self.records(interval.chrom).packed_dna_offset
|
142
|
+
div_start, mod_start = interval.zero_start.divmod 4
|
143
|
+
div_end, mod_end = interval.zero_end.divmod 4
|
144
|
+
div_len, mod_len = interval.length.divmod 4
|
145
|
+
|
146
|
+
byte_length = div_end - div_start + 1
|
147
|
+
@@tbq.index = chrom_top + div_start
|
148
|
+
bytes = @@tbq.next(byte_length).unpack('C*')
|
149
|
+
seq = Bio::Ucsc::Hg18::Reference.bytes_to_nucleotides(bytes)
|
150
|
+
seq[mod_start..(-1-(4-mod_end))]
|
151
|
+
end
|
152
|
+
|
153
|
+
def self.bytes_to_nucleotides(bytes)
|
154
|
+
results = ""
|
155
|
+
bytes.each do |byte|
|
156
|
+
results << Bio::Ucsc::Hg18::Reference.byte_to_nucleotides(byte)
|
157
|
+
end
|
158
|
+
results
|
159
|
+
end
|
160
|
+
|
161
|
+
def self.byte_to_nucleotides(byte)
|
162
|
+
BINCODE[byte >> 6] +
|
163
|
+
BINCODE[(byte >> 4) & 0b11] +
|
164
|
+
BINCODE[(byte >> 2) & 0b11] +
|
165
|
+
BINCODE[byte & 0b11]
|
166
|
+
end
|
167
|
+
end # class Reference
|
168
|
+
|
169
|
+
end # module Hg18
|
170
|
+
end # module Ucsc
|
171
|
+
end # module Bio
|
data/lib/bio-ucsc/hg19.rb
CHANGED
@@ -13,24 +13,42 @@ module Bio
|
|
13
13
|
module Ucsc
|
14
14
|
module Hg19
|
15
15
|
base = File.dirname(__FILE__)
|
16
|
-
autoload :Dgv, "#{base}/hg19/dgv"
|
17
16
|
|
17
|
+
# Reference sequence retrieval via the 2bit file
|
18
|
+
autoload :Reference, "#{base}/hg19/reference"
|
19
|
+
|
20
|
+
# group: Mapping and Sequencing Tracks
|
21
|
+
## track: Chromosome Band
|
22
|
+
autoload :CytoBand, "#{base}/hg19/cytoband"
|
23
|
+
|
24
|
+
# group: Genes and Gene Prediction Tracks
|
25
|
+
## track: UCSC Genes
|
26
|
+
autoload :KnownGene, "#{base}/hg19/knowngene"
|
27
|
+
|
28
|
+
## track: RefSeq Genes
|
29
|
+
autoload :RefGene, "#{base}/hg19/refgene"
|
30
|
+
|
31
|
+
## track: Ensembl Genes
|
32
|
+
autoload :EnsGene, "#{base}/hg19/ensgene"
|
33
|
+
|
34
|
+
## track: sno/miRNA
|
35
|
+
autoload :WgRna, "#{base}/hg19/wgrna"
|
36
|
+
|
37
|
+
## track: tRNA Genes
|
38
|
+
autoload :TRNAs, "#{base}/hg19/trnas"
|
39
|
+
|
40
|
+
# group: Variation and Repeats
|
41
|
+
## track: SNPs(131)
|
18
42
|
autoload :Snp131, "#{base}/hg19/snp131"
|
19
43
|
|
44
|
+
## tracks: All/Common/Flagged/Mult SNPs(132)
|
20
45
|
autoload :Snp132, "#{base}/hg19/snp132"
|
21
46
|
autoload :Snp132Common, "#{base}/hg19/snp132common"
|
22
47
|
autoload :Snp132Flagged, "#{base}/hg19/snp132flagged"
|
23
48
|
autoload :Snp132Mult, "#{base}/hg19/snp132mult"
|
24
49
|
autoload :Snp132CodingDbSnp, "#{base}/hg19/snp132codingdbsnp"
|
25
50
|
|
26
|
-
|
27
|
-
autoload :RefGene, "#{base}/hg19/refgene"
|
28
|
-
autoload :GwasCatalog, "#{base}/hg19/gwascatalog"
|
29
|
-
autoload :CytoBand, "#{base}/hg19/cytoband"
|
30
|
-
autoload :OmimGene, "#{base}/hg19/omimgene"
|
31
|
-
autoload :WgRna, "#{base}/hg19/wgrna"
|
32
|
-
autoload :EnsGene, "#{base}/hg19/ensgene"
|
33
|
-
|
51
|
+
## track: HapMap SNPs
|
34
52
|
autoload :HapMapSnpsASW, "#{base}/hg19/hapmapsnpsasw"
|
35
53
|
autoload :HapMapSnpsCEU, "#{base}/hg19/hapmapsnpsceu"
|
36
54
|
autoload :HapMapSnpsCHB, "#{base}/hg19/hapmapsnpschb"
|
@@ -45,10 +63,29 @@ module Bio
|
|
45
63
|
autoload :HapMapAllelesChimp, "#{base}/hg19/hapmapalleleschimp"
|
46
64
|
autoload :HapMapAllelesMacaque, "#{base}/hg19/hapmapallelesmacaque"
|
47
65
|
|
66
|
+
## track: RepeatMasker
|
48
67
|
autoload :Rmsk, "#{base}/hg19/rmsk"
|
49
68
|
|
69
|
+
## track: DGV Struct Var
|
70
|
+
autoload :Dgv, "#{base}/hg19/dgv"
|
71
|
+
|
72
|
+
# group: Comparative Genomics
|
73
|
+
## track: Conservation
|
50
74
|
autoload :PhyloP46wayPrimates, "#{base}/hg19/phylop46wayprimates"
|
51
75
|
autoload :PhastConsElements46wayPrimates, "#{base}/hg19/phastconselements46wayprimates"
|
76
|
+
|
77
|
+
# group: Phenotype and Disease Association
|
78
|
+
## track: OMIM Genes
|
79
|
+
autoload :OmimGene, "#{base}/hg19/omimgene"
|
80
|
+
|
81
|
+
## track: GWAS Catalog
|
82
|
+
autoload :GwasCatalog, "#{base}/hg19/gwascatalog"
|
83
|
+
|
84
|
+
# information tables behind tracks
|
85
|
+
autoload :Description, "#{base}/hg19/description"
|
86
|
+
autoload :GbCdnaInfo, "#{base}/hg19/gbcdnainfo"
|
87
|
+
autoload :KgXref, "#{base}/hg19/kgxref"
|
88
|
+
autoload :RefSeqSummary, "#{base}/hg19/refseqsummary"
|
52
89
|
end
|
53
90
|
end
|
54
91
|
end
|