bio-twobit 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +0 -3
- data/README.md +17 -3
- data/Rakefile +1 -1
- data/bio-twobit.gemspec +1 -3
- data/ext/bio/twobit/2bit.c +5 -4
- data/ext/bio/twobit/twobit.c +409 -412
- data/lib/bio/twobit/version.rb +1 -1
- data/lib/bio/twobit.rb +14 -0
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1f0ce7759c6099bf6f2675eb467722473f06040a583cad13f2400c1d50f4f3b7
|
4
|
+
data.tar.gz: '09bdb93292cc70fa665a73890d8500b56084639cc3e18d6a1602bdf62d943620'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ec2fb805c335562a64eca155c5806f3d004b8c4d3a2f6c2363eeb0c529749bb3f220bea4cf626d8d1618d6bdd061b247ae5a50a3f6d94314fbf6d0752bed281e
|
7
|
+
data.tar.gz: 96784953366c86e9e8e3d215a809faf0f23e6f012d3ff158ad47e80651fa2bea8cd0f87a3ca630579f7bb8e1aa41c2a30e449ba8af612ab8921d62e41521d293
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -18,8 +18,7 @@ gem install bio-twobit
|
|
18
18
|
Download BSgenome.Hsapiens.UCSC.hg38
|
19
19
|
|
20
20
|
```sh
|
21
|
-
wget
|
22
|
-
tar xvf BSgenome.Hsapiens.UCSC.hg38_1.4.4.tar.gz
|
21
|
+
wget http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.2bit
|
23
22
|
```
|
24
23
|
|
25
24
|
Quick Start
|
@@ -27,7 +26,10 @@ Quick Start
|
|
27
26
|
```ruby
|
28
27
|
require 'bio/twobit'
|
29
28
|
|
30
|
-
hg38 = Bio::TwoBit.open("
|
29
|
+
hg38 = Bio::TwoBit.open("hg38.2bit")
|
30
|
+
|
31
|
+
hg38.path
|
32
|
+
# "hg38.2bit"
|
31
33
|
|
32
34
|
hg38.info
|
33
35
|
# {"file_size"=>818064875,
|
@@ -75,6 +77,10 @@ Bio::TwoBit.open("test/fixtures/foo.2bit") do |t|
|
|
75
77
|
end
|
76
78
|
```
|
77
79
|
|
80
|
+
```ruby
|
81
|
+
tb.closed? # true / false
|
82
|
+
```
|
83
|
+
|
78
84
|
If you would like to include information about soft-masked bases, you need to manually specify `masked: true`
|
79
85
|
|
80
86
|
```ruby
|
@@ -89,10 +95,18 @@ tb.soft_masked_blocks("chr1")
|
|
89
95
|
# => [[62, 70]]
|
90
96
|
```
|
91
97
|
|
98
|
+
```ruby
|
99
|
+
tb.masked? # true / false
|
100
|
+
```
|
101
|
+
|
92
102
|
## Development
|
93
103
|
|
94
104
|
Bug reports and pull requests are welcome on GitHub at https://github.com/ruby-on-bioc/bio-twobit.
|
95
105
|
|
106
|
+
Do you need commit rights to my repository?
|
107
|
+
Do you want to get admin rights and take over the project?
|
108
|
+
If so, please feel free to contact me at @kojix2.
|
109
|
+
|
96
110
|
## License
|
97
111
|
|
98
112
|
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
CHANGED
data/bio-twobit.gemspec
CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
|
|
10
10
|
|
11
11
|
spec.summary = "A ruby library for accessing 2bit files"
|
12
12
|
spec.description = "This is a Ruby binding for lib2bit(https://github.com/dpryan79/lib2bit), " \
|
13
|
-
|
13
|
+
"which provides high-speed access to genomic data in 2bit file format."
|
14
14
|
spec.homepage = "https://github.com/ruby-on-bioc/bio-twobit"
|
15
15
|
spec.license = "MIT"
|
16
16
|
spec.required_ruby_version = ">= 2.6.0"
|
@@ -20,8 +20,6 @@ Gem::Specification.new do |spec|
|
|
20
20
|
(f == __FILE__) || f.match(%r{\A(?:(?:test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
|
21
21
|
end
|
22
22
|
end
|
23
|
-
spec.bindir = "exe"
|
24
|
-
spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
|
25
23
|
spec.require_paths = ["lib"]
|
26
24
|
spec.extensions = ["ext/bio/twobit/extconf.rb"]
|
27
25
|
end
|
data/ext/bio/twobit/2bit.c
CHANGED
@@ -278,6 +278,7 @@ uint8_t getByteMaskFromOffset(int offset) {
|
|
278
278
|
void *twobitBasesWorker(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end, int fraction) {
|
279
279
|
void *out;
|
280
280
|
uint32_t tmp[4] = {0, 0, 0, 0}, len = end - start + (start % 4), i = 0, j = 0;
|
281
|
+
uint32_t seqLen = end - start;
|
281
282
|
uint32_t blockStart, blockEnd, maskIdx = (uint32_t) -1, maskStart, maskEnd, foo;
|
282
283
|
uint8_t *bytes = NULL, mask = 0, offset;
|
283
284
|
|
@@ -375,10 +376,10 @@ void *twobitBasesWorker(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end,
|
|
375
376
|
//out is in TCAG order, since that's how 2bit is stored.
|
376
377
|
//However, for whatever reason I went with ACTG in the first release...
|
377
378
|
if(fraction) {
|
378
|
-
((double*) out)[0] = ((double) tmp[2])/((double)
|
379
|
-
((double*) out)[1] = ((double) tmp[1])/((double)
|
380
|
-
((double*) out)[2] = ((double) tmp[0])/((double)
|
381
|
-
((double*) out)[3] = ((double) tmp[3])/((double)
|
379
|
+
((double*) out)[0] = ((double) tmp[2])/((double) seqLen);
|
380
|
+
((double*) out)[1] = ((double) tmp[1])/((double) seqLen);
|
381
|
+
((double*) out)[2] = ((double) tmp[0])/((double) seqLen);
|
382
|
+
((double*) out)[3] = ((double) tmp[3])/((double) seqLen);
|
382
383
|
} else {
|
383
384
|
((uint32_t*) out)[0] = tmp[2];
|
384
385
|
((uint32_t*) out)[1] = tmp[1];
|
data/ext/bio/twobit/twobit.c
CHANGED
@@ -32,7 +32,7 @@
|
|
32
32
|
#define NUM2INT64 NUM2LONG
|
33
33
|
#define UINT64_2NUM ULONG2NUM
|
34
34
|
#define INT64_2NUM LONG2NUM
|
35
|
-
#elif
|
35
|
+
#elif SIZEOF_LONG_LONG == SIZEOF_INT64
|
36
36
|
#define NUM2UINT64 NUM2ULL
|
37
37
|
#define NUM2INT64 NUM2LL
|
38
38
|
#define UINT64_2NUM ULL2NUM
|
@@ -48,494 +48,491 @@ static void TwoBit_free(void *ptr);
|
|
48
48
|
static size_t TwoBit_memsize(const void *ptr);
|
49
49
|
|
50
50
|
static const rb_data_type_t TwoBit_type = {
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
0,
|
58
|
-
0,
|
59
|
-
RUBY_TYPED_FREE_IMMEDIATELY,
|
51
|
+
.wrap_struct_name = "TwoBit",
|
52
|
+
.function = {
|
53
|
+
.dfree = TwoBit_free,
|
54
|
+
.dsize = TwoBit_memsize,
|
55
|
+
},
|
56
|
+
.flags = RUBY_TYPED_FREE_IMMEDIATELY,
|
60
57
|
};
|
61
58
|
|
62
59
|
static void
|
63
60
|
TwoBit_free(void *ptr)
|
64
61
|
{
|
65
|
-
|
66
|
-
|
62
|
+
// twobitClose checks for null
|
63
|
+
twobitClose(ptr);
|
67
64
|
}
|
68
65
|
|
69
66
|
static size_t
|
70
67
|
TwoBit_memsize(const void *ptr)
|
71
68
|
{
|
72
|
-
|
69
|
+
const TwoBit *data = ptr;
|
73
70
|
|
74
|
-
|
71
|
+
return data ? sizeof(*data) : 0;
|
75
72
|
}
|
76
73
|
|
77
74
|
static TwoBit *getTwoBit(VALUE self)
|
78
75
|
{
|
79
|
-
|
80
|
-
|
76
|
+
TwoBit *ptr = NULL;
|
77
|
+
TypedData_Get_Struct(self, TwoBit, &TwoBit_type, ptr);
|
81
78
|
|
82
|
-
|
79
|
+
return ptr;
|
83
80
|
}
|
84
81
|
|
85
82
|
static VALUE
|
86
83
|
twobit_allocate(VALUE klass)
|
87
84
|
{
|
88
|
-
|
85
|
+
TwoBit *tb = NULL;
|
89
86
|
|
90
|
-
|
87
|
+
return TypedData_Wrap_Struct(klass, &TwoBit_type, tb);
|
91
88
|
}
|
92
89
|
|
93
90
|
static VALUE
|
94
91
|
twobit_init(VALUE klass, VALUE fpath, VALUE storeMasked)
|
95
92
|
{
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
93
|
+
char *path = NULL;
|
94
|
+
int mask = 0;
|
95
|
+
TwoBit *tb = NULL;
|
96
|
+
|
97
|
+
path = StringValueCStr(fpath);
|
98
|
+
mask = NUM2INT(storeMasked);
|
99
|
+
|
100
|
+
tb = twobitOpen(path, mask);
|
101
|
+
if (!tb)
|
102
|
+
{
|
103
|
+
twobitClose(tb);
|
104
|
+
rb_raise(rb_eRuntimeError, "Could not open file %s", path);
|
105
|
+
return Qnil;
|
106
|
+
}
|
107
|
+
DATA_PTR(klass) = tb;
|
108
|
+
|
109
|
+
return klass;
|
113
110
|
}
|
114
111
|
|
115
112
|
static VALUE
|
116
113
|
twobit_close(VALUE self)
|
117
114
|
{
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
115
|
+
TwoBit *tb = getTwoBit(self);
|
116
|
+
if (tb)
|
117
|
+
{
|
118
|
+
twobitClose(tb);
|
119
|
+
DATA_PTR(self) = NULL;
|
120
|
+
}
|
121
|
+
|
122
|
+
return Qnil;
|
126
123
|
}
|
127
124
|
|
128
125
|
static VALUE
|
129
126
|
twobit_closed_question_mark(VALUE self)
|
130
127
|
{
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
128
|
+
TwoBit *tb = getTwoBit(self);
|
129
|
+
if (tb)
|
130
|
+
{
|
131
|
+
return Qfalse;
|
132
|
+
}
|
133
|
+
else
|
134
|
+
{
|
135
|
+
return Qtrue;
|
136
|
+
}
|
140
137
|
}
|
141
138
|
|
142
139
|
static VALUE
|
143
140
|
twobit_info(VALUE self)
|
144
141
|
{
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
142
|
+
TwoBit *tb = getTwoBit(self);
|
143
|
+
|
144
|
+
if (!tb)
|
145
|
+
{
|
146
|
+
rb_raise(rb_eRuntimeError, "The 2bit file handle is not open!");
|
147
|
+
return Qnil;
|
148
|
+
}
|
149
|
+
|
150
|
+
uint32_t i, j, foo;
|
151
|
+
VALUE val;
|
152
|
+
VALUE info = rb_hash_new();
|
153
|
+
|
154
|
+
// file size
|
155
|
+
val = UINT64_2NUM(tb->sz);
|
156
|
+
if (!val)
|
157
|
+
goto error;
|
158
|
+
rb_hash_aset(info, rb_str_new2("file_size"), val);
|
159
|
+
|
160
|
+
// nContigs
|
161
|
+
val = UINT32_2NUM(tb->hdr->nChroms);
|
162
|
+
if (!val)
|
163
|
+
goto error;
|
164
|
+
rb_hash_aset(info, rb_str_new2("nChroms"), val);
|
165
|
+
|
166
|
+
// sequence length
|
167
|
+
foo = 0;
|
168
|
+
for (i = 0; i < tb->hdr->nChroms; i++)
|
169
|
+
{
|
170
|
+
foo += tb->idx->size[i];
|
171
|
+
}
|
172
|
+
val = UINT32_2NUM(foo);
|
173
|
+
if (!val)
|
174
|
+
goto error;
|
175
|
+
rb_hash_aset(info, rb_str_new2("sequence_length"), val);
|
176
|
+
|
177
|
+
// hard-masked length
|
178
|
+
foo = 0;
|
179
|
+
for (i = 0; i < tb->hdr->nChroms; i++)
|
180
|
+
{
|
181
|
+
for (j = 0; j < tb->idx->nBlockCount[i]; j++)
|
182
|
+
{
|
183
|
+
foo += tb->idx->nBlockSizes[i][j];
|
184
|
+
}
|
185
|
+
}
|
186
|
+
val = UINT32_2NUM(foo);
|
187
|
+
if (!val)
|
188
|
+
goto error;
|
189
|
+
rb_hash_aset(info, rb_str_new2("hard_masked_length"), val);
|
190
|
+
|
191
|
+
// soft-masked length
|
192
|
+
if (tb->idx->maskBlockStart)
|
193
|
+
{
|
194
|
+
foo = 0;
|
195
|
+
for (i = 0; i < tb->hdr->nChroms; i++)
|
196
|
+
{
|
197
|
+
for (j = 0; j < tb->idx->maskBlockCount[i]; j++)
|
198
|
+
{
|
199
|
+
foo += tb->idx->maskBlockSizes[i][j];
|
200
|
+
}
|
201
|
+
}
|
202
|
+
val = UINT32_2NUM(foo);
|
203
|
+
if (!val)
|
204
|
+
goto error;
|
205
|
+
rb_hash_aset(info, rb_str_new2("soft_masked_length"), val);
|
206
|
+
}
|
207
|
+
|
208
|
+
return info;
|
212
209
|
|
213
210
|
error:
|
214
|
-
|
215
|
-
|
211
|
+
rb_raise(rb_eRuntimeError, "Received an error while gathering information on the 2bit file!");
|
212
|
+
return Qnil;
|
216
213
|
}
|
217
214
|
|
218
215
|
static VALUE
|
219
216
|
twobit_chroms(VALUE self)
|
220
217
|
{
|
221
|
-
|
218
|
+
TwoBit *tb = getTwoBit(self);
|
222
219
|
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
220
|
+
if (!tb)
|
221
|
+
{
|
222
|
+
rb_raise(rb_eRuntimeError, "The 2bit file handle is not open!");
|
223
|
+
return Qnil;
|
224
|
+
}
|
228
225
|
|
229
|
-
|
230
|
-
|
231
|
-
|
226
|
+
uint32_t i;
|
227
|
+
VALUE val;
|
228
|
+
VALUE chroms = rb_hash_new();
|
232
229
|
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
230
|
+
for (i = 0; i < tb->hdr->nChroms; i++)
|
231
|
+
{
|
232
|
+
val = UINT32_2NUM(tb->idx->size[i]);
|
233
|
+
if (!val)
|
234
|
+
goto error;
|
235
|
+
rb_hash_aset(chroms, rb_str_new2(tb->cl->chrom[i]), val);
|
236
|
+
}
|
240
237
|
|
241
|
-
|
238
|
+
return chroms;
|
242
239
|
|
243
240
|
error:
|
244
|
-
|
245
|
-
|
241
|
+
rb_raise(rb_eRuntimeError, "Received an error while adding an item to the output hash!");
|
242
|
+
return Qnil;
|
246
243
|
}
|
247
244
|
|
248
245
|
static VALUE
|
249
246
|
twobit_sequence(VALUE self, VALUE chrom, VALUE rbstart, VALUE rbend)
|
250
247
|
{
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
248
|
+
char *ch, *str;
|
249
|
+
unsigned long startl = 0, endl = 0;
|
250
|
+
uint32_t start, end, len;
|
251
|
+
TwoBit *tb;
|
252
|
+
|
253
|
+
ch = StringValueCStr(chrom);
|
254
|
+
startl = NUM2UINT32(rbstart);
|
255
|
+
endl = NUM2UINT32(rbend);
|
256
|
+
tb = getTwoBit(self);
|
257
|
+
|
258
|
+
if (!tb)
|
259
|
+
{
|
260
|
+
rb_raise(rb_eRuntimeError, "The 2bit file handle is not open!");
|
261
|
+
return Qnil;
|
262
|
+
}
|
263
|
+
|
264
|
+
len = twobitChromLen(tb, ch);
|
265
|
+
if (len == 0)
|
266
|
+
{
|
267
|
+
rb_raise(rb_eRuntimeError, "The chromosome %s does not exist in the 2bit file!", ch);
|
268
|
+
return Qnil;
|
269
|
+
}
|
270
|
+
if (endl > len)
|
271
|
+
endl = len;
|
272
|
+
end = (uint32_t)endl;
|
273
|
+
if (startl >= endl && startl > 0)
|
274
|
+
{
|
275
|
+
rb_raise(rb_eRuntimeError, "The start position %lu is greater than the end position %lu!", startl, endl);
|
276
|
+
return Qnil;
|
277
|
+
}
|
278
|
+
start = (uint32_t)startl;
|
279
|
+
|
280
|
+
str = twobitSequence(tb, ch, start, end);
|
281
|
+
|
282
|
+
return rb_str_new2(str);
|
286
283
|
}
|
287
284
|
|
288
285
|
static VALUE
|
289
286
|
twobit_bases(VALUE self, VALUE chrom, VALUE start, VALUE end, VALUE fraction)
|
290
287
|
{
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
288
|
+
char *ch;
|
289
|
+
uint32_t st, en, fr;
|
290
|
+
TwoBit *tb;
|
291
|
+
void *o = NULL;
|
292
|
+
VALUE val, hash;
|
293
|
+
|
294
|
+
tb = getTwoBit(self);
|
295
|
+
if (!tb)
|
296
|
+
{
|
297
|
+
rb_raise(rb_eRuntimeError, "The 2bit file handle is not open!");
|
298
|
+
return Qnil;
|
299
|
+
}
|
300
|
+
|
301
|
+
ch = StringValueCStr(chrom);
|
302
|
+
st = NUM2UINT32(start);
|
303
|
+
en = NUM2UINT32(end);
|
304
|
+
fr = NUM2INT(fraction);
|
305
|
+
|
306
|
+
o = twobitBases(tb, ch, st, en, fr);
|
307
|
+
if (!o)
|
308
|
+
{
|
309
|
+
rb_raise(rb_eRuntimeError, "Received an error while determining the per-base metrics.");
|
310
|
+
return Qnil;
|
311
|
+
}
|
312
|
+
|
313
|
+
hash = rb_hash_new();
|
314
|
+
|
315
|
+
if (fr)
|
316
|
+
{
|
317
|
+
val = DBL2NUM(((double *)o)[0]);
|
318
|
+
}
|
319
|
+
else
|
320
|
+
{
|
321
|
+
val = UINT32_2NUM(((uint32_t *)o)[0]);
|
322
|
+
}
|
323
|
+
rb_hash_aset(hash, rb_str_new2("A"), val);
|
324
|
+
|
325
|
+
if (fr)
|
326
|
+
{
|
327
|
+
val = DBL2NUM(((double *)o)[1]);
|
328
|
+
}
|
329
|
+
else
|
330
|
+
{
|
331
|
+
val = UINT32_2NUM(((uint32_t *)o)[1]);
|
332
|
+
}
|
333
|
+
rb_hash_aset(hash, rb_str_new2("C"), val);
|
334
|
+
|
335
|
+
if (fr)
|
336
|
+
{
|
337
|
+
val = DBL2NUM(((double *)o)[2]);
|
338
|
+
}
|
339
|
+
else
|
340
|
+
{
|
341
|
+
val = UINT32_2NUM(((uint32_t *)o)[2]);
|
342
|
+
}
|
343
|
+
rb_hash_aset(hash, rb_str_new2("T"), val);
|
344
|
+
|
345
|
+
if (fr)
|
346
|
+
{
|
347
|
+
val = DBL2NUM(((double *)o)[3]);
|
348
|
+
}
|
349
|
+
else
|
350
|
+
{
|
351
|
+
val = UINT32_2NUM(((uint32_t *)o)[3]);
|
352
|
+
}
|
353
|
+
rb_hash_aset(hash, rb_str_new2("G"), val);
|
354
|
+
|
355
|
+
free(o);
|
356
|
+
|
357
|
+
return hash;
|
361
358
|
}
|
362
359
|
|
363
360
|
static VALUE
|
364
361
|
twobit_hard_masked_blocks(VALUE self, VALUE chrom, VALUE rbstart, VALUE rbend)
|
365
362
|
{
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
363
|
+
char *ch;
|
364
|
+
TwoBit *tb;
|
365
|
+
long tid = -1;
|
366
|
+
unsigned long startl = 0, endl = 0, totalBlocks = 0;
|
367
|
+
uint32_t i, len, start, end, blockStart, blockEnd;
|
368
|
+
VALUE val, ary;
|
369
|
+
|
370
|
+
tb = getTwoBit(self);
|
371
|
+
ch = StringValueCStr(chrom);
|
372
|
+
startl = NUM2UINT32(rbstart);
|
373
|
+
endl = NUM2UINT32(rbend);
|
374
|
+
|
375
|
+
if (!tb)
|
376
|
+
{
|
377
|
+
rb_raise(rb_eRuntimeError, "The 2bit file handle is not open!");
|
378
|
+
return Qnil;
|
379
|
+
}
|
380
|
+
|
381
|
+
// Get the chromosome ID
|
382
|
+
for (i = 0; i < tb->hdr->nChroms; i++)
|
383
|
+
{
|
384
|
+
if (strcmp(tb->cl->chrom[i], ch) == 0)
|
385
|
+
{
|
386
|
+
tid = i;
|
387
|
+
break;
|
388
|
+
}
|
389
|
+
}
|
390
|
+
|
391
|
+
len = twobitChromLen(tb, ch);
|
392
|
+
if (len == 0)
|
393
|
+
{
|
394
|
+
rb_raise(rb_eRuntimeError, "The chromosome %s doesn't exist in the 2bit file!", ch);
|
395
|
+
return Qnil;
|
396
|
+
}
|
397
|
+
if (endl == 0)
|
398
|
+
endl = len;
|
399
|
+
if (endl > len)
|
400
|
+
endl = len;
|
401
|
+
end = (uint32_t)endl;
|
402
|
+
if (startl > endl && startl > 0)
|
403
|
+
{
|
404
|
+
rb_raise(rb_eRuntimeError, "The start value must be less then the end value (and the end of the chromosome!");
|
405
|
+
return Qnil;
|
406
|
+
}
|
407
|
+
start = (uint32_t)startl;
|
408
|
+
|
409
|
+
// Count the total number of overlapping N-masked blocks
|
410
|
+
for (i = 0; i < tb->idx->nBlockCount[tid]; i++)
|
411
|
+
{
|
412
|
+
blockStart = tb->idx->nBlockStart[tid][i];
|
413
|
+
blockEnd = blockStart + tb->idx->nBlockSizes[tid][i];
|
414
|
+
if (blockStart < end && blockEnd > start)
|
415
|
+
{
|
416
|
+
totalBlocks++;
|
417
|
+
}
|
418
|
+
}
|
419
|
+
|
420
|
+
// Form the output
|
421
|
+
ary = rb_ary_new2(totalBlocks);
|
422
|
+
if (totalBlocks == 0)
|
423
|
+
return ary;
|
424
|
+
for (i = 0; i < tb->idx->nBlockCount[tid]; i++)
|
425
|
+
{
|
426
|
+
blockStart = tb->idx->nBlockStart[tid][i];
|
427
|
+
blockEnd = blockStart + tb->idx->nBlockSizes[tid][i];
|
428
|
+
if (blockStart < end && blockEnd > start)
|
429
|
+
{
|
430
|
+
val = rb_ary_new3(2, UINT32_2NUM(blockStart), UINT32_2NUM(blockEnd));
|
431
|
+
rb_ary_push(ary, val);
|
432
|
+
}
|
433
|
+
}
|
434
|
+
|
435
|
+
return ary;
|
439
436
|
}
|
440
437
|
|
441
438
|
static VALUE
|
442
439
|
twobit_soft_masked_blocks(VALUE self, VALUE chrom, VALUE rbstart, VALUE rbend)
|
443
440
|
{
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
441
|
+
char *ch;
|
442
|
+
TwoBit *tb;
|
443
|
+
long tid = -1;
|
444
|
+
unsigned long startl = 0, endl = 0, totalBlocks = 0;
|
445
|
+
uint32_t i, len, start, end, blockStart, blockEnd;
|
446
|
+
VALUE val, ary;
|
447
|
+
|
448
|
+
tb = getTwoBit(self);
|
449
|
+
ch = StringValueCStr(chrom);
|
450
|
+
startl = NUM2UINT32(rbstart);
|
451
|
+
endl = NUM2UINT32(rbend);
|
452
|
+
|
453
|
+
if (!tb)
|
454
|
+
{
|
455
|
+
rb_raise(rb_eRuntimeError, "The 2bit file handle is not open!");
|
456
|
+
return Qnil;
|
457
|
+
}
|
458
|
+
|
459
|
+
// Get the chromosome ID
|
460
|
+
for (i = 0; i < tb->hdr->nChroms; i++)
|
461
|
+
{
|
462
|
+
if (strcmp(tb->cl->chrom[i], ch) == 0)
|
463
|
+
{
|
464
|
+
tid = i;
|
465
|
+
break;
|
466
|
+
}
|
467
|
+
}
|
468
|
+
|
469
|
+
len = twobitChromLen(tb, ch);
|
470
|
+
if (len == 0)
|
471
|
+
{
|
472
|
+
rb_raise(rb_eRuntimeError, "The chromosome %s doesn't exist in the 2bit file!", ch);
|
473
|
+
return Qnil;
|
474
|
+
}
|
475
|
+
if (endl == 0)
|
476
|
+
endl = len;
|
477
|
+
if (endl > len)
|
478
|
+
endl = len;
|
479
|
+
end = (uint32_t)endl;
|
480
|
+
if (startl >= endl && startl > 0)
|
481
|
+
{
|
482
|
+
rb_raise(rb_eRuntimeError, "The start value must be less then the end value (and the end of the chromosome!");
|
483
|
+
return Qnil;
|
484
|
+
}
|
485
|
+
start = (uint32_t)startl;
|
486
|
+
|
487
|
+
if (!tb->idx->maskBlockStart)
|
488
|
+
{
|
489
|
+
rb_raise(rb_eRuntimeError, "The file was not opened with storeMasked=True! Consequently, there are no stored soft-masked regions.");
|
490
|
+
return Qnil;
|
491
|
+
}
|
492
|
+
|
493
|
+
// Count the total number of overlapping soft-masked blocks
|
494
|
+
for (i = 0; i < tb->idx->maskBlockCount[tid]; i++)
|
495
|
+
{
|
496
|
+
blockStart = tb->idx->maskBlockStart[tid][i];
|
497
|
+
blockEnd = blockStart + tb->idx->maskBlockSizes[tid][i];
|
498
|
+
if (blockStart < end && blockEnd > start)
|
499
|
+
{
|
500
|
+
totalBlocks++;
|
501
|
+
}
|
502
|
+
}
|
503
|
+
|
504
|
+
// Form the output
|
505
|
+
ary = rb_ary_new2(totalBlocks);
|
506
|
+
if (totalBlocks == 0)
|
507
|
+
return ary;
|
508
|
+
for (i = 0; i < tb->idx->maskBlockCount[tid]; i++)
|
509
|
+
{
|
510
|
+
blockStart = tb->idx->maskBlockStart[tid][i];
|
511
|
+
blockEnd = blockStart + tb->idx->maskBlockSizes[tid][i];
|
512
|
+
if (blockStart < end && blockEnd > start)
|
513
|
+
{
|
514
|
+
val = rb_ary_new3(2, UINT32_2NUM(blockStart), UINT32_2NUM(blockEnd));
|
515
|
+
rb_ary_push(ary, val);
|
516
|
+
}
|
517
|
+
}
|
518
|
+
|
519
|
+
return ary;
|
523
520
|
}
|
524
521
|
|
525
522
|
void Init_twobit(void)
|
526
523
|
{
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
524
|
+
mBio = rb_define_module("Bio");
|
525
|
+
mTwoBit = rb_define_class_under(mBio, "TwoBit", rb_cObject);
|
526
|
+
|
527
|
+
rb_define_alloc_func(mTwoBit, twobit_allocate);
|
528
|
+
|
529
|
+
rb_define_private_method(mTwoBit, "initialize_raw", twobit_init, 2);
|
530
|
+
rb_define_method(mTwoBit, "close", twobit_close, 0);
|
531
|
+
rb_define_method(mTwoBit, "closed?", twobit_closed_question_mark, 0);
|
532
|
+
rb_define_method(mTwoBit, "info", twobit_info, 0);
|
533
|
+
rb_define_method(mTwoBit, "chroms", twobit_chroms, 0);
|
534
|
+
rb_define_private_method(mTwoBit, "sequence_raw", twobit_sequence, 3);
|
535
|
+
rb_define_private_method(mTwoBit, "bases_raw", twobit_bases, 4);
|
536
|
+
rb_define_private_method(mTwoBit, "hard_masked_blocks_raw", twobit_hard_masked_blocks, 3);
|
537
|
+
rb_define_private_method(mTwoBit, "soft_masked_blocks_raw", twobit_soft_masked_blocks, 3);
|
541
538
|
}
|
data/lib/bio/twobit/version.rb
CHANGED
data/lib/bio/twobit.rb
CHANGED
@@ -40,19 +40,33 @@ module Bio
|
|
40
40
|
@masked
|
41
41
|
end
|
42
42
|
|
43
|
+
# Since "end" is a reserved word in Ruby, use "stop" instead.
|
44
|
+
|
43
45
|
def sequence(chrom, start = 0, stop = 0)
|
46
|
+
raise ArgumentError, "negative start position" if start.negative?
|
47
|
+
raise ArgumentError, "negative stop position" if stop.negative?
|
48
|
+
|
44
49
|
sequence_raw(chrom, start, stop)
|
45
50
|
end
|
46
51
|
|
47
52
|
def bases(chrom, start = 0, stop = 0, fraction: true)
|
53
|
+
raise ArgumentError, "negative start position" if start.negative?
|
54
|
+
raise ArgumentError, "negative stop position" if stop.negative?
|
55
|
+
|
48
56
|
bases_raw(chrom, start, stop, fraction ? 1 : 0)
|
49
57
|
end
|
50
58
|
|
51
59
|
def hard_masked_blocks(chrom, start = 0, stop = 0)
|
60
|
+
raise ArgumentError, "negative start position" if start.negative?
|
61
|
+
raise ArgumentError, "negative stop position" if stop.negative?
|
62
|
+
|
52
63
|
hard_masked_blocks_raw(chrom, start, stop)
|
53
64
|
end
|
54
65
|
|
55
66
|
def soft_masked_blocks(chrom, start = 0, stop = 0)
|
67
|
+
raise ArgumentError, "negative start position" if start.negative?
|
68
|
+
raise ArgumentError, "negative stop position" if stop.negative?
|
69
|
+
|
56
70
|
soft_masked_blocks_raw(chrom, start, stop)
|
57
71
|
end
|
58
72
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-twobit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- kojix2
|
8
8
|
autorequire:
|
9
|
-
bindir:
|
9
|
+
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-11-03 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: This is a Ruby binding for lib2bit(https://github.com/dpryan79/lib2bit),
|
14
14
|
which provides high-speed access to genomic data in 2bit file format.
|
@@ -52,7 +52,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
52
52
|
- !ruby/object:Gem::Version
|
53
53
|
version: '0'
|
54
54
|
requirements: []
|
55
|
-
rubygems_version: 3.3.
|
55
|
+
rubygems_version: 3.3.7
|
56
56
|
signing_key:
|
57
57
|
specification_version: 4
|
58
58
|
summary: A ruby library for accessing 2bit files
|