bio-twobit 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rubocop.yml +13 -0
- data/Gemfile +14 -0
- data/LICENSE.txt +21 -0
- data/README.md +98 -0
- data/Rakefile +25 -0
- data/bio-twobit.gemspec +29 -0
- data/ext/bio/twobit/2bit.c +734 -0
- data/ext/bio/twobit/2bit.h +134 -0
- data/ext/bio/twobit/LICENSE +24 -0
- data/ext/bio/twobit/extconf.rb +5 -0
- data/ext/bio/twobit/twobit.c +524 -0
- data/ext/bio/twobit/twobit.h +7 -0
- data/lib/bio/twobit/version.rb +7 -0
- data/lib/bio/twobit.rb +40 -0
- metadata +59 -0
@@ -0,0 +1,734 @@
|
|
1
|
+
#include <sys/types.h>
|
2
|
+
#include <sys/stat.h>
|
3
|
+
#include <sys/mman.h>
|
4
|
+
#include <unistd.h>
|
5
|
+
#include <inttypes.h>
|
6
|
+
#include <stdio.h>
|
7
|
+
#include <stdlib.h>
|
8
|
+
#include <string.h>
|
9
|
+
#include <ctype.h>
|
10
|
+
#include "2bit.h"
|
11
|
+
|
12
|
+
uint64_t twobitTell(TwoBit *tb);
|
13
|
+
|
14
|
+
/*
|
15
|
+
Read nmemb elements, each of size sz from the current file offset
|
16
|
+
into data. Return the number of elements read. On error, the return
|
17
|
+
value is either 0 or less than nmemb
|
18
|
+
*/
|
19
|
+
size_t twobitRead(void *data, size_t sz, size_t nmemb, TwoBit *tb) {
|
20
|
+
if(tb->data) {
|
21
|
+
if(memcpy(data, tb->data + tb->offset, nmemb * sz) == NULL) return 0;
|
22
|
+
tb->offset += nmemb * sz;
|
23
|
+
return nmemb;
|
24
|
+
} else {
|
25
|
+
return fread(data, sz, nmemb, tb->fp);
|
26
|
+
}
|
27
|
+
}
|
28
|
+
|
29
|
+
/*
|
30
|
+
Seek to a specific position, which is essentially trivial for memmaped stuff
|
31
|
+
|
32
|
+
Returns: 0 on success, -1 on error
|
33
|
+
*/
|
34
|
+
int twobitSeek(TwoBit *tb, uint64_t offset) {
|
35
|
+
if(offset >= tb->sz) return -1;
|
36
|
+
if(tb->data) {
|
37
|
+
tb->offset = offset;
|
38
|
+
return 0;
|
39
|
+
} else {
|
40
|
+
return fseek(tb->fp, (long) offset, SEEK_SET);
|
41
|
+
}
|
42
|
+
}
|
43
|
+
|
44
|
+
/*
|
45
|
+
Like ftell, but generalized to handle memmaped files
|
46
|
+
|
47
|
+
Returns the offset
|
48
|
+
*/
|
49
|
+
uint64_t twobitTell(TwoBit *tb) {
|
50
|
+
if(tb->data) return tb->offset;
|
51
|
+
return (uint64_t) ftell(tb->fp);
|
52
|
+
}
|
53
|
+
|
54
|
+
/*
    Given a byte holding 4 packed bases, return the character for the
    offset'th base. Offset 0 is stored in the two most significant bits and
    offset 3 in the two least significant bits; the 2-bit codes 0, 1, 2, 3
    map to T, C, A, G.
*/
char byte2base(uint8_t byte, int offset) {
    static const char alphabet[4] = {'T', 'C', 'A', 'G'};
    int shift = 2 * (3 - offset); //Distance of the wanted 2-bit field from bit 0

    return alphabet[(byte >> shift) & 3];
}
|
64
|
+
|
65
|
+
/*
    Unpack sz bases from the packed array byte into seq.

    seq:    Output buffer with room for at least sz characters. No NUL
            terminator is written.
    byte:   Packed input, 4 bases per byte, most significant bit pair first.
    sz:     Number of bases to produce.
    offset: Index (0-3) of the first wanted base inside byte[0].

    Exactly sz characters are written, even when the request ends inside the
    first (partial) byte.
*/
void bytes2bases(char *seq, uint8_t *byte, uint32_t sz, int offset) {
    static const char bases[4] = {'T', 'C', 'A', 'G'};
    uint32_t pos = 0, remainder = 0, i = 0;
    uint8_t foo;

    if(sz == 0) return;
    foo = byte[0];

    // Deal with the first partial byte, never producing more than sz bases
    if(offset != 0) {
        while(offset < 4 && pos < sz) {
            seq[pos++] = bases[(foo >> (2 * (3 - offset))) & 3];
            offset++;
        }
        if(pos >= sz) return;
        i++;
    }

    // Deal with everything else, with the possible exception of the last fractional byte
    remainder = (sz - pos) % 4;
    while(pos < sz - remainder) {
        foo = byte[i++];
        //The lowest bit pair is the last of the four bases, so fill backwards
        seq[pos + 3] = bases[foo & 3];
        foo >>= 2;
        seq[pos + 2] = bases[foo & 3];
        foo >>= 2;
        seq[pos + 1] = bases[foo & 3];
        foo >>= 2;
        seq[pos] = bases[foo & 3];
        pos += 4;
    }

    // Deal with the last partial byte
    if(remainder > 0) {
        foo = byte[i];
        for(offset = 0; (uint32_t) offset < remainder; offset++) {
            seq[pos++] = bases[(foo >> (2 * (3 - offset))) & 3];
        }
    }
}
|
100
|
+
|
101
|
+
/*
|
102
|
+
Replace Ts (or whatever else is being used) with N as appropriate
|
103
|
+
*/
|
104
|
+
void NMask(char *seq, TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end) {
|
105
|
+
uint32_t i, width, pos = 0;
|
106
|
+
uint32_t blockStart, blockEnd;
|
107
|
+
|
108
|
+
for(i=0; i<tb->idx->nBlockCount[tid]; i++) {
|
109
|
+
blockStart = tb->idx->nBlockStart[tid][i];
|
110
|
+
blockEnd = blockStart + tb->idx->nBlockSizes[tid][i];
|
111
|
+
if(blockEnd <= start) continue;
|
112
|
+
if(blockStart >= end) break;
|
113
|
+
if(blockStart < start) {
|
114
|
+
blockEnd = (blockEnd < end) ? blockEnd : end;
|
115
|
+
pos = 0;
|
116
|
+
width = blockEnd - start;
|
117
|
+
} else {
|
118
|
+
blockEnd = (blockEnd < end) ? blockEnd : end;
|
119
|
+
pos = blockStart - start;
|
120
|
+
width = blockEnd - blockStart;
|
121
|
+
}
|
122
|
+
width += pos;
|
123
|
+
for(; pos < width; pos++) seq[pos] = 'N';
|
124
|
+
}
|
125
|
+
}
|
126
|
+
|
127
|
+
/*
|
128
|
+
Replace uppercase with lower-case letters, if required
|
129
|
+
*/
|
130
|
+
void softMask(char *seq, TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end) {
|
131
|
+
uint32_t i, width, pos = 0;
|
132
|
+
uint32_t blockStart, blockEnd;
|
133
|
+
|
134
|
+
if(!tb->idx->maskBlockStart) return;
|
135
|
+
|
136
|
+
for(i=0; i<tb->idx->maskBlockCount[tid]; i++) {
|
137
|
+
blockStart = tb->idx->maskBlockStart[tid][i];
|
138
|
+
blockEnd = blockStart + tb->idx->maskBlockSizes[tid][i];
|
139
|
+
if(blockEnd <= start) continue;
|
140
|
+
if(blockStart >= end) break;
|
141
|
+
if(blockStart < start) {
|
142
|
+
blockEnd = (blockEnd < end) ? blockEnd : end;
|
143
|
+
pos = 0;
|
144
|
+
width = blockEnd - start;
|
145
|
+
} else {
|
146
|
+
blockEnd = (blockEnd < end) ? blockEnd : end;
|
147
|
+
pos = blockStart - start;
|
148
|
+
width = blockEnd - blockStart;
|
149
|
+
}
|
150
|
+
width += pos;
|
151
|
+
for(; pos < width; pos++) {
|
152
|
+
if(seq[pos] != 'N') seq[pos] = tolower(seq[pos]);
|
153
|
+
}
|
154
|
+
}
|
155
|
+
}
|
156
|
+
|
157
|
+
/*
|
158
|
+
This is the worker function for twobitSequence, which mostly does error checking
|
159
|
+
*/
|
160
|
+
char *constructSequence(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end) {
|
161
|
+
uint32_t sz = end - start + 1;
|
162
|
+
uint32_t blockStart, blockEnd;
|
163
|
+
char *seq = malloc(sz * sizeof(char));
|
164
|
+
uint8_t *bytes = NULL;
|
165
|
+
int offset;
|
166
|
+
if(!seq) return NULL;
|
167
|
+
|
168
|
+
//There are 4 bases/byte
|
169
|
+
blockStart = start/4;
|
170
|
+
offset = start % 4;
|
171
|
+
blockEnd = end/4 + ((end % 4) ? 1 : 0);
|
172
|
+
bytes = malloc(blockEnd - blockStart);
|
173
|
+
if(!bytes) goto error;
|
174
|
+
|
175
|
+
if(twobitSeek(tb, tb->idx->offset[tid] + blockStart) != 0) goto error;
|
176
|
+
if(twobitRead(bytes, blockEnd - blockStart, 1, tb) != 1) goto error;
|
177
|
+
bytes2bases(seq, bytes, sz - 1, offset);
|
178
|
+
free(bytes);
|
179
|
+
|
180
|
+
//Null terminate the output
|
181
|
+
seq[sz - 1] = '\0';
|
182
|
+
|
183
|
+
//N-mask everything
|
184
|
+
NMask(seq, tb, tid, start, end);
|
185
|
+
|
186
|
+
//Soft-mask if requested
|
187
|
+
softMask(seq, tb, tid, start, end);
|
188
|
+
|
189
|
+
return seq;
|
190
|
+
|
191
|
+
error:
|
192
|
+
if(seq) free(seq);
|
193
|
+
if(bytes) free(bytes);
|
194
|
+
return NULL;
|
195
|
+
}
|
196
|
+
|
197
|
+
/*
|
198
|
+
Given a chromosome, name, and optional range, return the corresponding sequence.
|
199
|
+
|
200
|
+
The start and end or 0-based half-open, so end-start is the number of bases.
|
201
|
+
If both start and end are 0, then the whole chromosome is used.
|
202
|
+
|
203
|
+
On error (e.g., a missing chromosome), NULL is returned.
|
204
|
+
*/
|
205
|
+
char *twobitSequence(TwoBit *tb, char *chrom, uint32_t start, uint32_t end) {
|
206
|
+
uint32_t i, tid=0;
|
207
|
+
|
208
|
+
//Get the chromosome ID
|
209
|
+
for(i=0; i<tb->hdr->nChroms; i++) {
|
210
|
+
if(strcmp(tb->cl->chrom[i], chrom) == 0) {
|
211
|
+
tid = i;
|
212
|
+
break;
|
213
|
+
}
|
214
|
+
}
|
215
|
+
if(tid == 0 && strcmp(tb->cl->chrom[i], chrom) != 0) return NULL;
|
216
|
+
|
217
|
+
//Get the start/end if not specified
|
218
|
+
if(start == end && end == 0) {
|
219
|
+
end = tb->idx->size[tid];
|
220
|
+
}
|
221
|
+
|
222
|
+
//Sanity check the bounds
|
223
|
+
if(end > tb->idx->size[tid]) return NULL;
|
224
|
+
if(start >= end) return NULL;
|
225
|
+
|
226
|
+
return constructSequence(tb, tid, start, end);
|
227
|
+
}
|
228
|
+
|
229
|
+
/*
|
230
|
+
Given a tid and a position, set the various mask variables to an appropriate block of Ns.
|
231
|
+
|
232
|
+
* If maskIdx is not -1, these are set to the first overlapping block (or maskIdx is set to the number of N blocks).
|
233
|
+
* If maskIdx is not -1 then it's incremented and maskStart/maskEnd set appropriately.
|
234
|
+
|
235
|
+
If the returned interval doesn't overlap the start/end range, then both values will be -1.
|
236
|
+
*/
|
237
|
+
void getMask(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end, uint32_t *maskIdx, uint32_t *maskStart, uint32_t *maskEnd) {
|
238
|
+
if(*maskIdx == (uint32_t) -1) {
|
239
|
+
for((*maskIdx)=0; (*maskIdx)<tb->idx->nBlockCount[tid]; (*maskIdx)++) {
|
240
|
+
*maskStart = tb->idx->nBlockStart[tid][*maskIdx];
|
241
|
+
*maskEnd = (*maskStart) + tb->idx->nBlockSizes[tid][*maskIdx];
|
242
|
+
if(*maskEnd < start) continue;
|
243
|
+
if(*maskEnd >= start) break;
|
244
|
+
}
|
245
|
+
} else if(*maskIdx >= tb->idx->nBlockCount[tid]) {
|
246
|
+
*maskStart = (uint32_t) -1;
|
247
|
+
*maskEnd = (uint32_t) -1;
|
248
|
+
} else {
|
249
|
+
*maskIdx += 1;
|
250
|
+
if(*maskIdx >= tb->idx->nBlockCount[tid]) {
|
251
|
+
*maskStart = (uint32_t) -1;
|
252
|
+
*maskEnd = (uint32_t) -1;
|
253
|
+
} else {
|
254
|
+
*maskStart = tb->idx->nBlockStart[tid][*maskIdx];
|
255
|
+
*maskEnd = (*maskStart) + tb->idx->nBlockSizes[tid][*maskIdx];
|
256
|
+
}
|
257
|
+
}
|
258
|
+
|
259
|
+
//maskStart = maskEnd = -1 if no overlap
|
260
|
+
if(*maskIdx >= tb->idx->nBlockCount[tid] || *maskStart >= end) {
|
261
|
+
*maskStart = (uint32_t) -1;
|
262
|
+
*maskEnd = (uint32_t) -1;
|
263
|
+
}
|
264
|
+
}
|
265
|
+
|
266
|
+
/*
    Return the 4-bit mask selecting the bases of a byte from position offset
    onwards: 15 for offset 0, 7 for offset 1, 3 for offset 2 and 1 for any
    other value.
*/
uint8_t getByteMaskFromOffset(int offset) {
    if(offset == 0) return (uint8_t) 15;
    if(offset == 1) return (uint8_t) 7;
    if(offset == 2) return (uint8_t) 3;
    return 1;
}
|
277
|
+
|
278
|
+
/*
    Worker for twobitBases: count the bases in [start, end) of sequence tid,
    leaving out positions that lie inside N blocks.

    Returns a malloc'ed array of 4 values in A, C, T, G order, which the caller
    must free. If fraction is non-zero the values are doubles holding each count
    divided by the number of requested bases (end - start); otherwise they are
    uint32_t counts. NULL is returned if an allocation, the seek or the read fails.
*/
void *twobitBasesWorker(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end, int fraction) {
    void *out;
    //len is measured from the start of the first byte, so it includes up to 3 bases before start
    uint32_t tmp[4] = {0, 0, 0, 0}, len = end - start + (start % 4), i = 0, j = 0;
    //Number of bases actually requested; this is the denominator for fractions
    uint32_t seqLen = end - start;
    uint32_t blockStart, blockEnd, maskIdx = (uint32_t) -1, maskStart, maskEnd, foo;
    uint8_t *bytes = NULL, mask = 0, offset;

    if(fraction) {
        out = malloc(4 * sizeof(double));
    } else {
        out = malloc(4 * sizeof(uint32_t));
    }
    if(!out) return NULL;

    //There are 4 bases/byte
    blockStart = start/4;
    offset = start % 4;
    blockEnd = end/4 + ((end % 4) ? 1 : 0);
    bytes = malloc(blockEnd - blockStart);
    if(!bytes) goto error;

    //Set the initial mask, reset start/offset so we always deal with full bytes
    mask = getByteMaskFromOffset(offset);
    start = 4 * blockStart;
    offset = 0;

    if(twobitSeek(tb, tb->idx->offset[tid] + blockStart) != 0) goto error;
    if(twobitRead(bytes, blockEnd - blockStart, 1, tb) != 1) goto error;

    //Get the index/start/end of the next N-mask block
    getMask(tb, tid, start, end, &maskIdx, &maskStart, &maskEnd);

    while(i < len) {
        // Check if we need to jump
        if(maskIdx != (uint32_t) -1 && start + i + 4 >= maskStart) {
            if(start + i >= maskStart || start + i + 4 - offset > maskStart) {
                //Jump iff the whole byte is inside an N block
                if(start + i >= maskStart && start + i + 4 - offset < maskEnd) {
                    //iff we're fully in an N block then jump
                    i = maskEnd - start;
                    getMask(tb, tid, i, end, &maskIdx, &maskStart, &maskEnd);
                    offset = (start + i) % 4;
                    j = i / 4;
                    mask = getByteMaskFromOffset(offset);
                    i = 4 * j; //Now that the mask has been set, reset i to byte offsets
                    offset = 0;
                    continue;
                }

                //Set the mask, if appropriate
                foo = 4*j + 4*blockStart; // The smallest position in the byte
                if(mask & 1 && (foo + 3 >= maskStart && foo + 3 < maskEnd)) mask -= 1;
                if(mask & 2 && (foo + 2 >= maskStart && foo + 2 < maskEnd)) mask -= 2;
                if(mask & 4 && (foo + 1 >= maskStart && foo + 1 < maskEnd)) mask -= 4;
                if(mask & 8 && (foo >= maskStart && foo < maskEnd)) mask -= 8;
                if(foo + 4 > maskEnd) {
                    getMask(tb, tid, i, end, &maskIdx, &maskStart, &maskEnd);
                    continue;
                }
            }
        }

        //Ensure that anything after the end is masked
        if(i+4>=len) {
            if((mask & 1) && i+3>=len) mask -=1;
            if((mask & 2) && i+2>=len) mask -=2;
            if((mask & 4) && i+1>=len) mask -=4;
            if((mask & 8) && i>=len) mask -=8;
        }

        foo = bytes[j++];
        //Offset 3
        if(mask & 1) {
            tmp[foo & 3]++;
        }
        foo >>= 2;
        mask >>= 1;
        //Offset 2
        if(mask & 1) {
            tmp[foo & 3]++;
        }
        foo >>= 2;
        mask >>= 1;
        //Offset 1
        if(mask & 1) {
            tmp[foo & 3]++;
        }
        foo >>= 2;
        mask >>= 1;
        //Offset 0
        if(mask & 1) {
            tmp[foo & 3]++;
        }
        i += 4;
        mask = 15;
    }
    free(bytes);

    //tmp is in TCAG order, since that's how 2bit is stored.
    //The output is reported in ACTG order, as in the first release.
    if(fraction) {
        ((double*) out)[0] = ((double) tmp[2])/((double) seqLen);
        ((double*) out)[1] = ((double) tmp[1])/((double) seqLen);
        ((double*) out)[2] = ((double) tmp[0])/((double) seqLen);
        ((double*) out)[3] = ((double) tmp[3])/((double) seqLen);
    } else {
        ((uint32_t*) out)[0] = tmp[2];
        ((uint32_t*) out)[1] = tmp[1];
        ((uint32_t*) out)[2] = tmp[0];
        ((uint32_t*) out)[3] = tmp[3];
    }

    return out;

error:
    free(out);
    free(bytes);
    return NULL;
}
|
396
|
+
|
397
|
+
/*
    Given a chromosome name and an optional range, return the per-base counts
    (or fractions, if fraction is non-zero) produced by twobitBasesWorker.

    The start and end are 0-based half-open. If both are 0, the whole
    chromosome is used.

    NULL is returned for a missing chromosome, for end beyond the chromosome
    length, for start >= end, or when the worker fails.
*/
void *twobitBases(TwoBit *tb, char *chrom, uint32_t start, uint32_t end, int fraction) {
    uint32_t tid;

    //Get the chromosome ID
    for(tid=0; tid<tb->hdr->nChroms; tid++) {
        if(strcmp(tb->cl->chrom[tid], chrom) == 0) break;
    }
    //Running off the end of the list means the name is unknown
    if(tid >= tb->hdr->nChroms) return NULL;

    //Get the start/end if not specified
    if(start == end && end == 0) {
        end = tb->idx->size[tid];
    }

    //Sanity check the bounds
    if(end > tb->idx->size[tid]) return NULL;
    if(start >= end) return NULL;

    return twobitBasesWorker(tb, tid, start, end, fraction);
}
|
421
|
+
|
422
|
+
/*
|
423
|
+
Given a chromosome, chrom, return it's length. 0 is used if the chromosome isn't present.
|
424
|
+
*/
|
425
|
+
uint32_t twobitChromLen(TwoBit *tb, char *chrom) {
|
426
|
+
uint32_t i;
|
427
|
+
for(i=0; i<tb->hdr->nChroms; i++) {
|
428
|
+
if(strcmp(tb->cl->chrom[i], chrom) == 0) return tb->idx->size[i];
|
429
|
+
}
|
430
|
+
return 0;
|
431
|
+
}
|
432
|
+
|
433
|
+
/*
|
434
|
+
Fill in tb->idx.
|
435
|
+
|
436
|
+
Note that the masked stuff will only be stored if storeMasked == 1, since it uses gobs of memory otherwise.
|
437
|
+
On error, tb->idx is left as NULL.
|
438
|
+
*/
|
439
|
+
void twobitIndexRead(TwoBit *tb, int storeMasked) {
|
440
|
+
uint32_t i, data[2];
|
441
|
+
TwoBitMaskedIdx *idx = calloc(1, sizeof(TwoBitMaskedIdx));
|
442
|
+
|
443
|
+
//Allocation and error checking
|
444
|
+
if(!idx) return;
|
445
|
+
idx->size = malloc(tb->hdr->nChroms * sizeof(uint32_t));
|
446
|
+
idx->nBlockCount = calloc(tb->hdr->nChroms, sizeof(uint32_t));
|
447
|
+
idx->nBlockStart = calloc(tb->hdr->nChroms, sizeof(uint32_t*));
|
448
|
+
idx->nBlockSizes = calloc(tb->hdr->nChroms, sizeof(uint32_t*));
|
449
|
+
if(!idx->size) goto error;
|
450
|
+
if(!idx->nBlockCount) goto error;
|
451
|
+
if(!idx->nBlockStart) goto error;
|
452
|
+
if(!idx->nBlockSizes) goto error;
|
453
|
+
idx->maskBlockCount = calloc(tb->hdr->nChroms, sizeof(uint32_t));
|
454
|
+
if(!idx->maskBlockCount) goto error;
|
455
|
+
if(storeMasked) {
|
456
|
+
idx->maskBlockStart = calloc(tb->hdr->nChroms, sizeof(uint32_t*));
|
457
|
+
idx->maskBlockSizes = calloc(tb->hdr->nChroms, sizeof(uint32_t*));
|
458
|
+
if(!idx->maskBlockStart) goto error;
|
459
|
+
if(!idx->maskBlockSizes) goto error;
|
460
|
+
}
|
461
|
+
idx->offset = malloc(tb->hdr->nChroms * sizeof(uint64_t));
|
462
|
+
if(!idx->offset) goto error;
|
463
|
+
|
464
|
+
//Read in each chromosome/contig
|
465
|
+
for(i=0; i<tb->hdr->nChroms; i++) {
|
466
|
+
if(twobitSeek(tb, tb->cl->offset[i]) != 0) goto error;
|
467
|
+
if(twobitRead(data, sizeof(uint32_t), 2, tb) != 2) goto error;
|
468
|
+
idx->size[i] = data[0];
|
469
|
+
idx->nBlockCount[i] = data[1];
|
470
|
+
|
471
|
+
//Allocate the nBlock starts/sizes and fill them in
|
472
|
+
idx->nBlockStart[i] = malloc(idx->nBlockCount[i] * sizeof(uint32_t));
|
473
|
+
idx->nBlockSizes[i] = malloc(idx->nBlockCount[i] * sizeof(uint32_t));
|
474
|
+
if(!idx->nBlockStart[i]) goto error;
|
475
|
+
if(!idx->nBlockSizes[i]) goto error;
|
476
|
+
if(twobitRead(idx->nBlockStart[i], sizeof(uint32_t), idx->nBlockCount[i], tb) != idx->nBlockCount[i]) goto error;
|
477
|
+
if(twobitRead(idx->nBlockSizes[i], sizeof(uint32_t), idx->nBlockCount[i], tb) != idx->nBlockCount[i]) goto error;
|
478
|
+
|
479
|
+
//Get the masked block information
|
480
|
+
if(twobitRead(idx->maskBlockCount + i, sizeof(uint32_t), 1, tb) != 1) goto error;
|
481
|
+
|
482
|
+
//Allocate the maskBlock starts/sizes and fill them in
|
483
|
+
if(storeMasked) {
|
484
|
+
idx->maskBlockStart[i] = malloc(idx->maskBlockCount[i] * sizeof(uint32_t));
|
485
|
+
idx->maskBlockSizes[i] = malloc(idx->maskBlockCount[i] * sizeof(uint32_t));
|
486
|
+
if(!idx->maskBlockStart[i]) goto error;
|
487
|
+
if(!idx->maskBlockSizes[i]) goto error;
|
488
|
+
if(twobitRead(idx->maskBlockStart[i], sizeof(uint32_t), idx->maskBlockCount[i], tb) != idx->maskBlockCount[i]) goto error;
|
489
|
+
if(twobitRead(idx->maskBlockSizes[i], sizeof(uint32_t), idx->maskBlockCount[i], tb) != idx->maskBlockCount[i]) goto error;
|
490
|
+
} else {
|
491
|
+
if(twobitSeek(tb, twobitTell(tb) + 8 * idx->maskBlockCount[i]) != 0) goto error;
|
492
|
+
}
|
493
|
+
|
494
|
+
//Reserved
|
495
|
+
if(twobitRead(data, sizeof(uint32_t), 1, tb) != 1) goto error;
|
496
|
+
|
497
|
+
idx->offset[i] = twobitTell(tb);
|
498
|
+
}
|
499
|
+
|
500
|
+
tb->idx = idx;
|
501
|
+
return;
|
502
|
+
|
503
|
+
error:
|
504
|
+
if(idx) {
|
505
|
+
if(idx->size) free(idx->size);
|
506
|
+
|
507
|
+
if(idx->nBlockCount) free(idx->nBlockCount);
|
508
|
+
if(idx->nBlockStart) {
|
509
|
+
for(i=0; i<tb->hdr->nChroms; i++) {
|
510
|
+
if(idx->nBlockStart[i]) free(idx->nBlockStart[i]);
|
511
|
+
}
|
512
|
+
free(idx->nBlockStart[i]);
|
513
|
+
}
|
514
|
+
if(idx->nBlockSizes) {
|
515
|
+
for(i=0; i<tb->hdr->nChroms; i++) {
|
516
|
+
if(idx->nBlockSizes[i]) free(idx->nBlockSizes[i]);
|
517
|
+
}
|
518
|
+
free(idx->nBlockSizes[i]);
|
519
|
+
}
|
520
|
+
|
521
|
+
if(idx->maskBlockCount) free(idx->maskBlockCount);
|
522
|
+
if(idx->maskBlockStart) {
|
523
|
+
for(i=0; i<tb->hdr->nChroms; i++) {
|
524
|
+
if(idx->maskBlockStart[i]) free(idx->maskBlockStart[i]);
|
525
|
+
}
|
526
|
+
free(idx->maskBlockStart[i]);
|
527
|
+
}
|
528
|
+
if(idx->maskBlockSizes) {
|
529
|
+
for(i=0; i<tb->hdr->nChroms; i++) {
|
530
|
+
if(idx->maskBlockSizes[i]) free(idx->maskBlockSizes[i]);
|
531
|
+
}
|
532
|
+
free(idx->maskBlockSizes[i]);
|
533
|
+
}
|
534
|
+
|
535
|
+
if(idx->offset) free(idx->offset);
|
536
|
+
|
537
|
+
free(idx);
|
538
|
+
}
|
539
|
+
}
|
540
|
+
|
541
|
+
/*
    Release everything hanging off tb->idx, including the per-chromosome block
    arrays, and then the index structure itself. Does nothing if tb->idx is
    NULL. tb->hdr->nChroms is consulted, so the header must still be present.
*/
void twobitIndexDestroy(TwoBit *tb) {
    TwoBitMaskedIdx *idx = tb->idx;
    uint32_t c;

    if(!idx) return;

    free(idx->size);

    //N blocks
    free(idx->nBlockCount);
    if(idx->nBlockStart) {
        for(c = 0; c < tb->hdr->nChroms; c++) free(idx->nBlockStart[c]);
        free(idx->nBlockStart);
    }
    if(idx->nBlockSizes) {
        for(c = 0; c < tb->hdr->nChroms; c++) free(idx->nBlockSizes[c]);
        free(idx->nBlockSizes);
    }

    //Soft-mask blocks
    free(idx->maskBlockCount);
    if(idx->maskBlockStart) {
        for(c = 0; c < tb->hdr->nChroms; c++) free(idx->maskBlockStart[c]);
        free(idx->maskBlockStart);
    }
    if(idx->maskBlockSizes) {
        for(c = 0; c < tb->hdr->nChroms; c++) free(idx->maskBlockSizes[c]);
        free(idx->maskBlockSizes);
    }

    free(idx->offset);

    free(idx);
}
|
580
|
+
|
581
|
+
/*
    Read the chromosome list from the current position and store it in tb->cl.

    For each of the tb->hdr->nChroms entries this reads a one-byte name length,
    the name itself (stored NUL-terminated in memory) and a uint32_t offset.

    On error, everything allocated here is released and tb->cl is left untouched.
*/
void twobitChromListRead(TwoBit *tb) {
    uint32_t n, nChroms = tb->hdr->nChroms;
    uint8_t nameLen;
    TwoBitCL *cl = calloc(1, sizeof(TwoBitCL));

    if(!cl) return;

    cl->chrom = calloc(nChroms, sizeof(char*));
    cl->offset = malloc(sizeof(uint32_t) * nChroms);
    if(!cl->chrom || !cl->offset) goto error;

    for(n = 0; n < nChroms; n++) {
        //Get the string size (not null terminated!)
        if(twobitRead(&nameLen, 1, 1, tb) != 1) goto error;

        //Read in the string; the extra zeroed byte terminates it
        cl->chrom[n] = calloc(1 + nameLen, sizeof(char));
        if(!cl->chrom[n]) goto error;
        if(twobitRead(cl->chrom[n], 1, nameLen, tb) != nameLen) goto error;

        //Read in the offset
        if(twobitRead(cl->offset + n, sizeof(uint32_t), 1, tb) != 1) goto error;
    }

    tb->cl = cl;
    return;

error:
    free(cl->offset);
    if(cl->chrom) {
        //Unused slots are NULL thanks to calloc
        for(n = 0; n < nChroms; n++) free(cl->chrom[n]);
        free(cl->chrom);
    }
    free(cl);
}
|
625
|
+
|
626
|
+
/*
    Release the chromosome list in tb->cl (names, offsets and the list itself).
    Does nothing if tb->cl is NULL. tb->hdr->nChroms is consulted, so the
    header must still be present.
*/
void twobitChromListDestroy(TwoBit *tb) {
    uint32_t n;

    if(!tb->cl) return;

    free(tb->cl->offset);
    if(tb->cl->chrom) {
        for(n = 0; n < tb->hdr->nChroms; n++) free(tb->cl->chrom[n]);
        free(tb->cl->chrom);
    }
    free(tb->cl);
}
|
640
|
+
|
641
|
+
/*
    Read the fixed 16-byte header from the current position and store it in
    tb->hdr.

    The header is rejected, with a message on stderr, if the magic number is
    not 0x1A412743, the version is not 0, or the sequence count is 0. On any
    failure tb->hdr is left untouched.
*/
void twobitHdrRead(TwoBit *tb) {
    //The header is four 32-bit words
    uint32_t words[4];
    TwoBitHeader *hdr = calloc(1, sizeof(TwoBitHeader));

    if(!hdr) return;

    if(twobitRead(words, 4, 4, tb) != 4) {
        free(hdr);
        return;
    }

    hdr->magic = words[0];
    hdr->version = words[1];
    hdr->nChroms = words[2];

    if(hdr->magic != 0x1A412743) {
        fprintf(stderr, "[twobitHdrRead] Received an invalid file magic number (0x%"PRIx32")!\n", hdr->magic);
    } else if(hdr->version != 0) {
        fprintf(stderr, "[twobitHdrRead] The file version is %"PRIu32" while only version 0 is defined!\n", hdr->version);
    } else if(hdr->nChroms == 0) {
        fprintf(stderr, "[twobitHdrRead] There are apparently no chromosomes/contigs in this file!\n");
    } else {
        //Everything checks out, hand the header over
        tb->hdr = hdr;
        return;
    }

    free(hdr);
}
|
677
|
+
|
678
|
+
/*
    Release the header stored in tb->hdr, if any.
*/
void twobitHdrDestroy(TwoBit *tb) {
    //free() ignores NULL, so no explicit check is needed
    free(tb->hdr);
}
|
681
|
+
|
682
|
+
/*
    Close a TwoBit handle: close the stream, unmap the file if it was mapped,
    destroy the chromosome list, index and header, and free the handle itself.
    Passing NULL is a no-op.
*/
void twobitClose(TwoBit *tb) {
    if(!tb) return;

    if(tb->fp) fclose(tb->fp);
    if(tb->data) munmap(tb->data, tb->sz);

    twobitChromListDestroy(tb);
    twobitIndexDestroy(tb);
    //N.B., the header must go last: the two calls above read tb->hdr->nChroms
    twobitHdrDestroy(tb);

    free(tb);
}
|
693
|
+
|
694
|
+
TwoBit* twobitOpen(char *fname, int storeMasked) {
|
695
|
+
int fd;
|
696
|
+
struct stat fs;
|
697
|
+
TwoBit *tb = calloc(1, sizeof(TwoBit));
|
698
|
+
if(!tb) return NULL;
|
699
|
+
|
700
|
+
tb->fp = fopen(fname, "rb");
|
701
|
+
if(!tb->fp) goto error;
|
702
|
+
|
703
|
+
//Try to memory map the whole thing, since these aren't terribly large
|
704
|
+
//Since we might be multithreading this in python, use shared memory
|
705
|
+
fd = fileno(tb->fp);
|
706
|
+
if(fstat(fd, &fs) == 0) {
|
707
|
+
tb->sz = (uint64_t) fs.st_size;
|
708
|
+
tb->data = mmap(NULL, fs.st_size, PROT_READ, MAP_SHARED, fd, 0);
|
709
|
+
if(tb->data) {
|
710
|
+
if(madvise(tb->data, fs.st_size, MADV_RANDOM) != 0) {
|
711
|
+
munmap(tb->data, fs.st_size);
|
712
|
+
tb->data = NULL;
|
713
|
+
}
|
714
|
+
}
|
715
|
+
}
|
716
|
+
|
717
|
+
//Attempt to read in the fixed header
|
718
|
+
twobitHdrRead(tb);
|
719
|
+
if(!tb->hdr) goto error;
|
720
|
+
|
721
|
+
//Read in the chromosome list
|
722
|
+
twobitChromListRead(tb);
|
723
|
+
if(!tb->cl) goto error;
|
724
|
+
|
725
|
+
//Read in the mask index
|
726
|
+
twobitIndexRead(tb, storeMasked);
|
727
|
+
if(!tb->idx) goto error;
|
728
|
+
|
729
|
+
return tb;
|
730
|
+
|
731
|
+
error:
|
732
|
+
twobitClose(tb);
|
733
|
+
return NULL;
|
734
|
+
}
|