bio-twobit 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,134 @@
1
+ #include <inttypes.h>
2
+ #include <stdio.h>
3
+
4
+ /*! \mainpage lib2bit
5
+ *
6
+ * \section Introduction
7
+ *
8
+ * lib2bit is a C-based library for accessing [2bit files](https://genome.ucsc.edu/FAQ/FAQformat.html#format7). At the moment, only reading 2bit files is supported (there are no plans to change this, though if someone wants to submit a pull request...). Though it's unlikely to matter,
9
+ *
10
+ * The motivation for this project is due to needing fast access to 2bit files in [deepTools](https://github.com/fidelram/deepTools). Originally, we were using bx-python for this, which had the benefit of being easy to install and pretty quick. However, that wasn't compatible with python3, so we switched to [twobitreader](https://github.com/benjschiller/twobitreader). While doing everything we needed and working under both python2 and python3, it turns out that it has terrible performance (up to 1000x slow down in `computeGCBias`). Since we'd like to have our cake and eat it too, I wrote a C library for convenient 2bit access and then [a python wrapper](https://github.com/dpryan79/py2bit) around it to work in python2 and 3.
11
+ *
12
+ * \section Installation
13
+ *
14
+ * 2bit files are very simple and there are no dependencies. Simply typing `make` should suffice for compilation. To install into a specific path (the default is `/usr/local`):
15
+ *
16
+ * make install prefix=/some/where/else
17
+ *
18
+ * `lib2bit.so` and `lib2bit.a` will then be in `/some/where/else/lib` and `2bit.h` in `/some/where/else/include`.
19
+ *
20
+ * \section Example
21
+ *
22
+ * See the `test/` directory for an example of using the library.
23
+ */
24
+
25
+ /*! \file 2bit.h
26
+ *
27
+ * These are all functions and structures exported in lib2bit. There are a few things that could be more efficiently implemented, but at the moment everything is "fast enough".
28
+ */
29
+
30
+ #ifdef __cplusplus
31
+ extern "C" {
32
+ #endif
33
+
34
+ /*!
35
+ * @brief This structure holds the fixed-sized file header (16 bytes, of which 4 are blank). The version should always be 0. In theory, the endianness of the magic number can change (indicating that everything in the file should be swapped). As I've never actually seen this occur in the wild I've not bothered implementing it, though it'd be simple enough to do so.
36
+ */
37
+ typedef struct {
38
+ uint32_t magic; /**<Holds the magic number, should be 0x1A412743 */
39
+ uint32_t version; /**<File version, should be 0 */
40
+ uint32_t nChroms; /**<Number of chromosomes/contigs */
41
+ } TwoBitHeader;
42
+
43
+ /*!
44
+ * @brief This structure holds the chromosome names and the offset to the on-disk beginning of their sequences
45
+ */
46
+ typedef struct {
47
+ char **chrom; /**<A list of null terminated chromosomes */
48
+ uint32_t *offset; /**<The file offset for the beginning of each chromosome */
49
+ } TwoBitCL;
50
+
51
+ /*!
52
+ * @brief This structure holds the number, location and size of the hard (N) and soft (lower case) masked blocks.
53
+ *
54
+ * Note that this isn't a great data structure for random access, particularly for the soft-masked blocks. In practice, soft-masking is typically ignored and file access is less random and more blocky. Nonetheless, if performance is not acceptable then this is the structure to change.
55
+ */
56
+ typedef struct {
57
+ uint32_t *size; /**<The size of a given chromosome/contig */
58
+ uint32_t *nBlockCount; /**<The number of blocks of Ns in a given chromosome/contig */
59
+ uint32_t **nBlockStart; /**<For each chromosome/contig, the list (size nBlockCount) of start positions of the block of Ns */
60
+ uint32_t **nBlockSizes; /**<The size of each block specified above */
61
+ uint32_t *maskBlockCount; /**<The number of blocks of masked sequence in a given chromosome/contig */
62
+ uint32_t **maskBlockStart; /**<For each chromosome/contig, the list (size maskBlockCount) of start positions of the masked sequence blocks */
63
+ uint32_t **maskBlockSizes; /**<The size of each block specified above */
64
+ uint64_t *offset; /**<The offset to the packed 2-bit sequence */
65
+ } TwoBitMaskedIdx;
66
+
67
+ /*!
68
+ * @brief This is the main structure for holding a 2bit file
69
+ *
70
+ * Note that currently the 2bit file is mmap()ed prior to reading and that this isn't optional.
71
+ */
72
+ typedef struct {
73
+ FILE *fp; /**<The file pointer for the opened file */
74
+ uint64_t sz; /**<File size in bytes (needed for munmap) */
75
+ uint64_t offset; /**<If the file is memory mapped, then this is the current file offset (otherwise ignored) */
76
+ void *data; /**<The memory mapped file, if it exists. */
77
+ TwoBitHeader *hdr; /**<File header */
78
+ TwoBitCL *cl; /**<Chromosome list with names and file offsets */
79
+ TwoBitMaskedIdx *idx; /**<Index of masked blocks */
80
+ } TwoBit;
81
+
82
+ /*!
83
+ * @brief Opens a local 2bit file
84
+ *
85
+ * @param fname The name of the 2bit file.
86
+ * @param storeMasked Whether soft-masking information should be stored. If this is 1 then soft-masking information will be stored and the `twobitSequence()` function will return lower case letters in soft-masked regions. Note that this has a considerable performance and memory impact.
87
+ * @return A pointer to a TwoBit object.
88
+ * @note The file is memory mapped.
89
+ */
90
+ TwoBit* twobitOpen(char *fname, int storeMasked);
91
+
92
+ /*!
93
+ * @brief Closes a 2bit file and frees memory.
94
+ */
95
+ void twobitClose(TwoBit *tb);
96
+
97
+ /*!
98
+ * @brief Returns the length of a given chromosome.
99
+ *
100
+ * @param tb A pointer to a TwoBit object.
101
+ * @param chrom The chromosome name.
102
+ * @return The chromosome length as a uint32_t. Note that if the chromosome/contig isn't present in the file that 0 is returned.
103
+ */
104
+ uint32_t twobitChromLen(TwoBit *tb, char *chrom);
105
+
106
+ /*!
107
+ * @brief Returns the sequence of a chromosome/contig or range of it.
108
+ *
109
+ * @param tb A pointer to a TwoBit object.
110
+ * @param chrom The chromosome name.
111
+ * @param start The starting position in 0-based coordinates.
112
+ * @param end The end position in 1-based coordinates.
113
+ * @return The sequence or NULL on error. If both start and end are 0 then the sequence for the entire chromosome/contig is returned.
114
+ * @note The result MUST be `free()`d. Care is taken to return reasonable sequences when illegal regions are requested. If the end value is beyond the possible end of the chromosome then it is modified accordingly.
115
+ */
116
+ char *twobitSequence(TwoBit *tb, char *chrom, uint32_t start, uint32_t end);
117
+
118
+ /*!
119
+ * @brief Return the number/fraction of A, C, T, and G in a chromosome/region
120
+ *
121
+ * @param tb A pointer to a TwoBit object.
122
+ * @param chrom The chromosome name.
123
+ * @param start The starting position in 0-based coordinates.
124
+ * @param end The end position in 1-based coordinates.
125
+ * @param fraction Whether to return the values as fractions (1) or integers (0).
126
+ * @return If fraction is not 0, then 4 `double`s with the fraction of bases as A, C, T and G, respectively. If fraction is 0, integer counts are returned as 4 `uint32_t`s in the aforementioned order.
127
+ * @note On error NULL is returned. The result MUST be `free()`d.
128
+ */
129
+
130
+ void *twobitBases(TwoBit *tb, char *chrom, uint32_t start, uint32_t end, int fraction);
131
+
132
+ #ifdef __cplusplus
133
+ }
134
+ #endif
@@ -0,0 +1,24 @@
1
+ dpryan79/lib2bit
2
+
3
+ The MIT License (MIT)
4
+
5
+ Copyright (c) 2015 Devon Ryan
6
+
7
+ Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ of this software and associated documentation files (the "Software"), to deal
9
+ in the Software without restriction, including without limitation the rights
10
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ copies of the Software, and to permit persons to whom the Software is
12
+ furnished to do so, subject to the following conditions:
13
+
14
+ The above copyright notice and this permission notice shall be included in all
15
+ copies or substantial portions of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ SOFTWARE.
24
+
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "mkmf"
4
+
5
+ create_makefile("bio/twobit/twobit")
@@ -0,0 +1,524 @@
1
+ #include "twobit.h"
2
+
3
+ #define SIZEOF_INT32 4
4
+ #define SIZEOF_INT64 8
5
+
6
+ #if SIZEOF_SHORT == SIZEOF_INT32
7
+ #define NUM2UINT32 NUM2USHORT
8
+ #define NUM2INT32 NUM2SHORT
9
+ #define UINT32_2NUM USHORT2NUM
10
+ #define INT32_2NUM SHORT2NUM
11
+ #elif SIZEOF_INT == SIZEOF_INT32
12
+ #define NUM2UINT32 NUM2UINT
13
+ #define NUM2INT32 NUM2INT
14
+ #define UINT32_2NUM UINT2NUM
15
+ #define INT32_2NUM INT2NUM
16
+ #elif SIZEOF_LONG == SIZEOF_INT32
17
+ #define NUM2UINT32 NUM2ULONG
18
+ #define NUM2INT32 NUM2LONG
19
+ #define UINT32_2NUM ULONG2NUM
20
+ #define INT32_2NUM LONG2NUM
21
+ #else
22
+ #error "Neither int, long, nor short is the same size as int32_t"
23
+ #endif
24
+
25
+ #if SIZEOF_INT == SIZEOF_INT64
26
+ #define NUM2UINT64 NUM2UINT
27
+ #define NUM2INT64 NUM2INT
28
+ #define UINT64_2NUM UINT2NUM
29
+ #define INT64_2NUM INT2NUM
30
+ #elif SIZEOF_LONG == SIZEOF_INT64
31
+ #define NUM2UINT64 NUM2ULONG
32
+ #define NUM2INT64 NUM2LONG
33
+ #define UINT64_2NUM ULONG2NUM
34
+ #define INT64_2NUM LONG2NUM
35
+ #elif SIZEOF_LONGLONG == SIZEOF_INT64
36
+ #define NUM2UINT64 NUM2ULL
37
+ #define NUM2INT64 NUM2LL
38
+ #define UINT64_2NUM ULL2NUM
39
+ #define INT64_2NUM LL2NUM
40
+ #else
41
+ #error "Neither int, long, nor short is the same size as int64_t"
42
+ #endif
43
+
44
+ VALUE mBio;
45
+ VALUE mTwoBit;
46
+
47
+ static void TwoBit_free(void *ptr);
48
+ static size_t TwoBit_memsize(const void *ptr);
49
+
50
+ static const rb_data_type_t TwoBit_type = {
51
+ "TwoBit",
52
+ {
53
+ 0,
54
+ TwoBit_free,
55
+ TwoBit_memsize,
56
+ },
57
+ 0,
58
+ 0,
59
+ RUBY_TYPED_FREE_IMMEDIATELY,
60
+ };
61
+
62
+ static void
63
+ TwoBit_free(void *ptr)
64
+ {
65
+ // twobitClose checks for null
66
+ twobitClose(ptr);
67
+ }
68
+
69
+ static size_t
70
+ TwoBit_memsize(const void *ptr)
71
+ {
72
+ const TwoBit *data = ptr;
73
+
74
+ return data ? sizeof(*data) : 0;
75
+ }
76
+
77
+ static TwoBit *getTwoBit(VALUE self)
78
+ {
79
+ TwoBit *ptr = NULL;
80
+ TypedData_Get_Struct(self, TwoBit, &TwoBit_type, ptr);
81
+
82
+ return ptr;
83
+ }
84
+
85
+ static VALUE
86
+ twobit_allocate(VALUE klass)
87
+ {
88
+ TwoBit *tb = NULL;
89
+
90
+ return TypedData_Wrap_Struct(klass, &TwoBit_type, tb);
91
+ }
92
+
93
+ static VALUE
94
+ twobit_init(VALUE klass, VALUE fpath, VALUE storeMasked)
95
+ {
96
+ char *path = NULL;
97
+ int mask = 0;
98
+ TwoBit *tb = NULL;
99
+
100
+ path = StringValueCStr(fpath);
101
+ mask = NUM2INT(storeMasked);
102
+
103
+ tb = twobitOpen(path, mask);
104
+ if (!tb)
105
+ {
106
+ twobitClose(tb);
107
+ rb_raise(rb_eRuntimeError, "Could not open file %s", path);
108
+ return Qnil;
109
+ }
110
+ DATA_PTR(klass) = tb;
111
+
112
+ return klass;
113
+ }
114
+
115
+ static VALUE
116
+ twobit_close(VALUE self)
117
+ {
118
+ TwoBit *tb = getTwoBit(self);
119
+ if (tb)
120
+ {
121
+ twobitClose(tb);
122
+ DATA_PTR(self) = NULL;
123
+ }
124
+
125
+ return Qnil;
126
+ }
127
+
128
+ static VALUE
129
+ twobit_info(VALUE self)
130
+ {
131
+ TwoBit *tb = getTwoBit(self);
132
+
133
+ if (!tb)
134
+ {
135
+ rb_raise(rb_eRuntimeError, "The 2bit file handle is not open!");
136
+ return Qnil;
137
+ }
138
+
139
+ uint32_t i, j, foo;
140
+ VALUE val;
141
+ VALUE info = rb_hash_new();
142
+
143
+ //file size
144
+ val = UINT64_2NUM(tb->sz);
145
+ if (!val)
146
+ goto error;
147
+ rb_hash_aset(info, rb_str_new2("file_size"), val);
148
+
149
+ //nContigs
150
+ val = UINT32_2NUM(tb->hdr->nChroms);
151
+ if (!val)
152
+ goto error;
153
+ rb_hash_aset(info, rb_str_new2("nChroms"), val);
154
+
155
+ //sequence length
156
+ foo = 0;
157
+ for (i = 0; i < tb->hdr->nChroms; i++)
158
+ {
159
+ foo += tb->idx->size[i];
160
+ }
161
+ val = UINT32_2NUM(foo);
162
+ if (!val)
163
+ goto error;
164
+ rb_hash_aset(info, rb_str_new2("sequence_length"), val);
165
+
166
+ //hard-masked length
167
+ foo = 0;
168
+ for (i = 0; i < tb->hdr->nChroms; i++)
169
+ {
170
+ for (j = 0; j < tb->idx->nBlockCount[i]; j++)
171
+ {
172
+ foo += tb->idx->nBlockSizes[i][j];
173
+ }
174
+ }
175
+ val = UINT32_2NUM(foo);
176
+ if (!val)
177
+ goto error;
178
+ rb_hash_aset(info, rb_str_new2("hard_masked_length"), val);
179
+
180
+ //soft-masked length
181
+ if (tb->idx->maskBlockStart)
182
+ {
183
+ foo = 0;
184
+ for (i = 0; i < tb->hdr->nChroms; i++)
185
+ {
186
+ for (j = 0; j < tb->idx->maskBlockCount[i]; j++)
187
+ {
188
+ foo += tb->idx->maskBlockSizes[i][j];
189
+ }
190
+ }
191
+ val = UINT32_2NUM(foo);
192
+ if (!val)
193
+ goto error;
194
+ rb_hash_aset(info, rb_str_new2("soft_masked_length"), val);
195
+ }
196
+
197
+ return info;
198
+
199
+ error:
200
+ rb_raise(rb_eRuntimeError, "Received an error while gathering information on the 2bit file!");
201
+ return Qnil;
202
+ }
203
+
204
+ static VALUE
205
+ twobit_chroms(VALUE self)
206
+ {
207
+ TwoBit *tb = getTwoBit(self);
208
+
209
+ if (!tb)
210
+ {
211
+ rb_raise(rb_eRuntimeError, "The 2bit file handle is not open!");
212
+ return Qnil;
213
+ }
214
+
215
+ uint32_t i;
216
+ VALUE val;
217
+ VALUE chroms = rb_hash_new();
218
+
219
+ for (i = 0; i < tb->hdr->nChroms; i++)
220
+ {
221
+ val = UINT32_2NUM(tb->idx->size[i]);
222
+ if (!val)
223
+ goto error;
224
+ rb_hash_aset(chroms, rb_str_new2(tb->cl->chrom[i]), val);
225
+ }
226
+
227
+ return chroms;
228
+
229
+ error:
230
+ rb_raise(rb_eRuntimeError, "Received an error while adding an item to the output hash!");
231
+ return Qnil;
232
+ }
233
+
234
+ static VALUE
235
+ twobit_sequence(VALUE self, VALUE chrom, VALUE rbstart, VALUE rbend)
236
+ {
237
+ char *ch, *str;
238
+ unsigned long startl = 0, endl = 0;
239
+ uint32_t start, end, len;
240
+ TwoBit *tb;
241
+
242
+ ch = StringValueCStr(chrom);
243
+ startl = NUM2UINT32(rbstart);
244
+ endl = NUM2UINT32(rbend);
245
+ tb = getTwoBit(self);
246
+
247
+ if (!tb)
248
+ {
249
+ rb_raise(rb_eRuntimeError, "The 2bit file handle is not open!");
250
+ return Qnil;
251
+ }
252
+
253
+ len = twobitChromLen(tb, ch);
254
+ if (len == 0)
255
+ {
256
+ rb_raise(rb_eRuntimeError, "The chromosome %s does not exist in the 2bit file!", ch);
257
+ return Qnil;
258
+ }
259
+ if (endl > len)
260
+ endl = len;
261
+ end = (uint32_t)endl;
262
+ if (startl >= endl && startl > 0)
263
+ {
264
+ rb_raise(rb_eRuntimeError, "The start position %lu is greater than the end position %lu!", startl, endl);
265
+ return Qnil;
266
+ }
267
+ start = (uint32_t)startl;
268
+
269
+ str = twobitSequence(tb, ch, start, end);
270
+
271
+ return rb_str_new2(str);
272
+ }
273
+
274
+ static VALUE
275
+ twobit_bases(VALUE self, VALUE chrom, VALUE start, VALUE end, VALUE fraction)
276
+ {
277
+ char *ch;
278
+ uint32_t st, en, fr;
279
+ TwoBit *tb;
280
+ void *o = NULL;
281
+ VALUE val, hash;
282
+
283
+ tb = getTwoBit(self);
284
+ if (!tb)
285
+ {
286
+ rb_raise(rb_eRuntimeError, "The 2bit file handle is not open!");
287
+ return Qnil;
288
+ }
289
+
290
+ ch = StringValueCStr(chrom);
291
+ st = NUM2UINT32(start);
292
+ en = NUM2UINT32(end);
293
+ fr = NUM2INT(fraction);
294
+
295
+ o = twobitBases(tb, ch, st, en, fr);
296
+ if (!o)
297
+ {
298
+ rb_raise(rb_eRuntimeError, "Received an error while determining the per-base metrics.");
299
+ return Qnil;
300
+ }
301
+
302
+ hash = rb_hash_new();
303
+
304
+ if (fr)
305
+ {
306
+ val = DBL2NUM(((double *)o)[0]);
307
+ }
308
+ else
309
+ {
310
+ val = UINT32_2NUM(((uint32_t *)o)[0]);
311
+ }
312
+ rb_hash_aset(hash, rb_str_new2("A"), val);
313
+
314
+ if (fr)
315
+ {
316
+ val = DBL2NUM(((double *)o)[1]);
317
+ }
318
+ else
319
+ {
320
+ val = UINT32_2NUM(((uint32_t *)o)[1]);
321
+ }
322
+ rb_hash_aset(hash, rb_str_new2("C"), val);
323
+
324
+ if (fr)
325
+ {
326
+ val = DBL2NUM(((double *)o)[2]);
327
+ }
328
+ else
329
+ {
330
+ val = UINT32_2NUM(((uint32_t *)o)[2]);
331
+ }
332
+ rb_hash_aset(hash, rb_str_new2("T"), val);
333
+
334
+ if (fr)
335
+ {
336
+ val = DBL2NUM(((double *)o)[3]);
337
+ }
338
+ else
339
+ {
340
+ val = UINT32_2NUM(((uint32_t *)o)[3]);
341
+ }
342
+ rb_hash_aset(hash, rb_str_new2("G"), val);
343
+
344
+ free(o);
345
+
346
+ return hash;
347
+ }
348
+
349
+ static VALUE
350
+ twobit_hard_masked_blocks(VALUE self, VALUE chrom, VALUE rbstart, VALUE rbend)
351
+ {
352
+ char *ch;
353
+ TwoBit *tb;
354
+ unsigned long startl = 0, endl = 0, totalBlocks = 0, tid;
355
+ uint32_t i, len, start, end, blockStart, blockEnd;
356
+ VALUE val, ary;
357
+
358
+ tb = getTwoBit(self);
359
+ ch = StringValueCStr(chrom);
360
+ startl = NUM2UINT32(rbstart);
361
+ endl = NUM2UINT32(rbend);
362
+
363
+ if (!tb)
364
+ {
365
+ rb_raise(rb_eRuntimeError, "The 2bit file handle is not open!");
366
+ return Qnil;
367
+ }
368
+
369
+ //Get the chromosome ID
370
+ for (i = 0; i < tb->hdr->nChroms; i++)
371
+ {
372
+ if (strcmp(tb->cl->chrom[i], ch) == 0)
373
+ {
374
+ tid = i;
375
+ break;
376
+ }
377
+ }
378
+
379
+ len = twobitChromLen(tb, ch);
380
+ if (len == 0)
381
+ {
382
+ rb_raise(rb_eRuntimeError, "The chromosome %s doesn't exist in the 2bit file!", ch);
383
+ return Qnil;
384
+ }
385
+ if (endl == 0)
386
+ endl = len;
387
+ if (endl > len)
388
+ endl = len;
389
+ end = (uint32_t)endl;
390
+ if (startl > endl && startl > 0)
391
+ {
392
+ rb_raise(rb_eRuntimeError, "The start value must be less then the end value (and the end of the chromosome!");
393
+ return Qnil;
394
+ }
395
+ start = (uint32_t)startl;
396
+
397
+ //Count the total number of overlapping N-masked blocks
398
+ for (i = 0; i < tb->idx->nBlockCount[tid]; i++)
399
+ {
400
+ blockStart = tb->idx->nBlockStart[tid][i];
401
+ blockEnd = blockStart + tb->idx->nBlockSizes[tid][i];
402
+ if (blockStart < end && blockEnd > start)
403
+ {
404
+ totalBlocks++;
405
+ }
406
+ }
407
+
408
+ //Form the output
409
+ ary = rb_ary_new2(totalBlocks);
410
+ if (totalBlocks == 0)
411
+ return ary;
412
+ for (i = 0; i < tb->idx->nBlockCount[tid]; i++)
413
+ {
414
+ blockStart = tb->idx->nBlockStart[tid][i];
415
+ blockEnd = blockStart + tb->idx->nBlockSizes[tid][i];
416
+ if (blockStart < end && blockEnd > start)
417
+ {
418
+ val = rb_ary_new3(2, UINT32_2NUM(blockStart), UINT32_2NUM(blockEnd));
419
+ rb_ary_push(ary, val);
420
+ }
421
+ }
422
+
423
+ return ary;
424
+ }
425
+
426
+ static VALUE
427
+ twobit_soft_masked_blocks(VALUE self, VALUE chrom, VALUE rbstart, VALUE rbend)
428
+ {
429
+ char *ch;
430
+ TwoBit *tb;
431
+ unsigned long startl = 0, endl = 0, totalBlocks = 0, tid;
432
+ uint32_t i, len, start, end, blockStart, blockEnd;
433
+ VALUE val, ary;
434
+
435
+ tb = getTwoBit(self);
436
+ ch = StringValueCStr(chrom);
437
+ startl = NUM2UINT32(rbstart);
438
+ endl = NUM2UINT32(rbend);
439
+
440
+ if (!tb)
441
+ {
442
+ rb_raise(rb_eRuntimeError, "The 2bit file handle is not open!");
443
+ return Qnil;
444
+ }
445
+
446
+ //Get the chromosome ID
447
+ for (i = 0; i < tb->hdr->nChroms; i++)
448
+ {
449
+ if (strcmp(tb->cl->chrom[i], ch) == 0)
450
+ {
451
+ tid = i;
452
+ break;
453
+ }
454
+ }
455
+
456
+ len = twobitChromLen(tb, ch);
457
+ if (len == 0)
458
+ {
459
+ rb_raise(rb_eRuntimeError, "The chromosome %s doesn't exist in the 2bit file!", ch);
460
+ return Qnil;
461
+ }
462
+ if (endl == 0)
463
+ endl = len;
464
+ if (endl > len)
465
+ endl = len;
466
+ end = (uint32_t)endl;
467
+ if (startl >= endl && startl > 0)
468
+ {
469
+ rb_raise(rb_eRuntimeError, "The start value must be less then the end value (and the end of the chromosome!");
470
+ return Qnil;
471
+ }
472
+ start = (uint32_t)startl;
473
+
474
+ if (!tb->idx->maskBlockStart)
475
+ {
476
+ rb_raise(rb_eRuntimeError, "The file was not opened with storeMasked=True! Consequently, there are no stored soft-masked regions.");
477
+ return Qnil;
478
+ }
479
+
480
+ //Count the total number of overlapping N-masked blocks
481
+ for (i = 0; i < tb->idx->maskBlockCount[tid]; i++)
482
+ {
483
+ blockStart = tb->idx->maskBlockStart[tid][i];
484
+ blockEnd = blockStart + tb->idx->maskBlockSizes[tid][i];
485
+ if (blockStart < end && blockEnd > start)
486
+ {
487
+ totalBlocks++;
488
+ }
489
+ }
490
+
491
+ //Form the output
492
+ ary = rb_ary_new2(totalBlocks);
493
+ if (totalBlocks == 0)
494
+ return ary;
495
+ for (i = 0; i < tb->idx->maskBlockCount[tid]; i++)
496
+ {
497
+ blockStart = tb->idx->maskBlockStart[tid][i];
498
+ blockEnd = blockStart + tb->idx->maskBlockSizes[tid][i];
499
+ if (blockStart < end && blockEnd > start)
500
+ {
501
+ val = rb_ary_new3(2, UINT32_2NUM(blockStart), UINT32_2NUM(blockEnd));
502
+ rb_ary_push(ary, val);
503
+ }
504
+ }
505
+
506
+ return ary;
507
+ }
508
+
509
+ void Init_twobit(void)
510
+ {
511
+ mBio = rb_define_module("Bio");
512
+ mTwoBit = rb_define_class_under(mBio, "TwoBit", rb_cObject);
513
+
514
+ rb_define_alloc_func(mTwoBit, twobit_allocate);
515
+
516
+ rb_define_private_method(mTwoBit, "initialize_raw", twobit_init, 2);
517
+ rb_define_method(mTwoBit, "close", twobit_close, 0);
518
+ rb_define_method(mTwoBit, "info", twobit_info, 0);
519
+ rb_define_method(mTwoBit, "chroms", twobit_chroms, 0);
520
+ rb_define_private_method(mTwoBit, "sequence_raw", twobit_sequence, 3);
521
+ rb_define_private_method(mTwoBit, "bases_raw", twobit_bases, 4);
522
+ rb_define_private_method(mTwoBit, "hard_masked_blocks_raw", twobit_hard_masked_blocks, 3);
523
+ rb_define_private_method(mTwoBit, "soft_masked_blocks_raw", twobit_soft_masked_blocks, 3);
524
+ }
@@ -0,0 +1,7 @@
1
+ #ifndef TWOBIT_H
2
+ #define TWOBIT_H 1
3
+
4
+ #include "ruby.h"
5
+ #include "2bit.h"
6
+
7
+ #endif /* TWOBIT_H */
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Bio
4
+ class TwoBit
5
+ VERSION = "0.1.1"
6
+ end
7
+ end