bio-twobit 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,734 @@
1
+ #include <sys/types.h>
2
+ #include <sys/stat.h>
3
+ #include <sys/mman.h>
4
+ #include <unistd.h>
5
+ #include <inttypes.h>
6
+ #include <stdio.h>
7
+ #include <stdlib.h>
8
+ #include <string.h>
9
+ #include <ctype.h>
10
+ #include "2bit.h"
11
+
12
+ uint64_t twobitTell(TwoBit *tb);
13
+
14
+ /*
15
+ Read nmemb elements, each of size sz from the current file offset
16
+ into data. Return the number of elements read. On error, the return
17
+ value is either 0 or less than nmemb
18
+ */
19
+ size_t twobitRead(void *data, size_t sz, size_t nmemb, TwoBit *tb) {
20
+ if(tb->data) {
21
+ if(memcpy(data, tb->data + tb->offset, nmemb * sz) == NULL) return 0;
22
+ tb->offset += nmemb * sz;
23
+ return nmemb;
24
+ } else {
25
+ return fread(data, sz, nmemb, tb->fp);
26
+ }
27
+ }
28
+
29
+ /*
30
+ Seek to a specific position, which is essentially trivial for memmaped stuff
31
+
32
+ Returns: 0 on success, -1 on error
33
+ */
34
+ int twobitSeek(TwoBit *tb, uint64_t offset) {
35
+ if(offset >= tb->sz) return -1;
36
+ if(tb->data) {
37
+ tb->offset = offset;
38
+ return 0;
39
+ } else {
40
+ return fseek(tb->fp, (long) offset, SEEK_SET);
41
+ }
42
+ }
43
+
44
+ /*
45
+ Like ftell, but generalized to handle memmaped files
46
+
47
+ Returns the offset
48
+ */
49
+ uint64_t twobitTell(TwoBit *tb) {
50
+ if(tb->data) return tb->offset;
51
+ return (uint64_t) ftell(tb->fp);
52
+ }
53
+
54
/*
    Decode a single base from a packed byte.

    A byte holds 4 bases of 2 bits each, the first base in the two most
    significant bits. The 2-bit codes 0, 1, 2, 3 map to T, C, A, G.
    offset selects the base and is expected to be in the range 0-3.
*/
char byte2base(uint8_t byte, int offset) {
    static const char lookup[] = "TCAG";
    int shift = 2 * (3 - offset); //Number of bits to the right of the wanted base

    return lookup[(byte >> shift) & 3];
}
64
+
65
/*
    Decode sz bases from the packed array byte into seq.

    seq    : output buffer with room for at least sz characters (no NUL
             terminator is written here).
    byte   : packed input, 4 bases per byte, first base in the two most
             significant bits; codes 0, 1, 2, 3 decode to T, C, A, G.
    sz     : number of bases to produce.
    offset : position (0-3) of the first wanted base inside byte[0].

    Exactly sz characters are written. In particular, a request that is
    shorter than the remainder of the first byte does not write past
    seq[sz - 1], and nothing is read or written when sz is 0.
*/
void bytes2bases(char *seq, uint8_t *byte, uint32_t sz, int offset) {
    static const char bases[4] = {'T', 'C', 'A', 'G'};
    uint32_t pos = 0, remainder = 0, i = 0, k;
    uint8_t foo;

    if(sz == 0) return;

    // Deal with the first partial byte, stopping early if the request ends inside it
    if(offset != 0) {
        foo = byte[i++];
        for(; offset < 4 && pos < sz; offset++) {
            seq[pos++] = bases[(foo >> (2 * (3 - offset))) & 3];
        }
        if(pos >= sz) return;
    }

    // Deal with everything else, with the possible exception of the last fractional byte
    remainder = (sz - pos) % 4;
    while(pos < sz - remainder) {
        foo = byte[i++];
        //The last base of the byte sits in the lowest two bits, so fill back to front
        seq[pos + 3] = bases[foo & 3];
        foo >>= 2;
        seq[pos + 2] = bases[foo & 3];
        foo >>= 2;
        seq[pos + 1] = bases[foo & 3];
        foo >>= 2;
        seq[pos] = bases[foo & 3];
        pos += 4;
    }

    // Deal with the last partial byte
    if(remainder > 0) {
        foo = byte[i];
        for(k = 0; k < remainder; k++) {
            seq[pos++] = bases[(foo >> (2 * (3 - k))) & 3];
        }
    }
}
100
+
101
+ /*
102
+ Replace Ts (or whatever else is being used) with N as appropriate
103
+ */
104
+ void NMask(char *seq, TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end) {
105
+ uint32_t i, width, pos = 0;
106
+ uint32_t blockStart, blockEnd;
107
+
108
+ for(i=0; i<tb->idx->nBlockCount[tid]; i++) {
109
+ blockStart = tb->idx->nBlockStart[tid][i];
110
+ blockEnd = blockStart + tb->idx->nBlockSizes[tid][i];
111
+ if(blockEnd <= start) continue;
112
+ if(blockStart >= end) break;
113
+ if(blockStart < start) {
114
+ blockEnd = (blockEnd < end) ? blockEnd : end;
115
+ pos = 0;
116
+ width = blockEnd - start;
117
+ } else {
118
+ blockEnd = (blockEnd < end) ? blockEnd : end;
119
+ pos = blockStart - start;
120
+ width = blockEnd - blockStart;
121
+ }
122
+ width += pos;
123
+ for(; pos < width; pos++) seq[pos] = 'N';
124
+ }
125
+ }
126
+
127
+ /*
128
+ Replace uppercase with lower-case letters, if required
129
+ */
130
+ void softMask(char *seq, TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end) {
131
+ uint32_t i, width, pos = 0;
132
+ uint32_t blockStart, blockEnd;
133
+
134
+ if(!tb->idx->maskBlockStart) return;
135
+
136
+ for(i=0; i<tb->idx->maskBlockCount[tid]; i++) {
137
+ blockStart = tb->idx->maskBlockStart[tid][i];
138
+ blockEnd = blockStart + tb->idx->maskBlockSizes[tid][i];
139
+ if(blockEnd <= start) continue;
140
+ if(blockStart >= end) break;
141
+ if(blockStart < start) {
142
+ blockEnd = (blockEnd < end) ? blockEnd : end;
143
+ pos = 0;
144
+ width = blockEnd - start;
145
+ } else {
146
+ blockEnd = (blockEnd < end) ? blockEnd : end;
147
+ pos = blockStart - start;
148
+ width = blockEnd - blockStart;
149
+ }
150
+ width += pos;
151
+ for(; pos < width; pos++) {
152
+ if(seq[pos] != 'N') seq[pos] = tolower(seq[pos]);
153
+ }
154
+ }
155
+ }
156
+
157
+ /*
158
+ This is the worker function for twobitSequence, which mostly does error checking
159
+ */
160
+ char *constructSequence(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end) {
161
+ uint32_t sz = end - start + 1;
162
+ uint32_t blockStart, blockEnd;
163
+ char *seq = malloc(sz * sizeof(char));
164
+ uint8_t *bytes = NULL;
165
+ int offset;
166
+ if(!seq) return NULL;
167
+
168
+ //There are 4 bases/byte
169
+ blockStart = start/4;
170
+ offset = start % 4;
171
+ blockEnd = end/4 + ((end % 4) ? 1 : 0);
172
+ bytes = malloc(blockEnd - blockStart);
173
+ if(!bytes) goto error;
174
+
175
+ if(twobitSeek(tb, tb->idx->offset[tid] + blockStart) != 0) goto error;
176
+ if(twobitRead(bytes, blockEnd - blockStart, 1, tb) != 1) goto error;
177
+ bytes2bases(seq, bytes, sz - 1, offset);
178
+ free(bytes);
179
+
180
+ //Null terminate the output
181
+ seq[sz - 1] = '\0';
182
+
183
+ //N-mask everything
184
+ NMask(seq, tb, tid, start, end);
185
+
186
+ //Soft-mask if requested
187
+ softMask(seq, tb, tid, start, end);
188
+
189
+ return seq;
190
+
191
+ error:
192
+ if(seq) free(seq);
193
+ if(bytes) free(bytes);
194
+ return NULL;
195
+ }
196
+
197
+ /*
198
+ Given a chromosome, name, and optional range, return the corresponding sequence.
199
+
200
+ The start and end or 0-based half-open, so end-start is the number of bases.
201
+ If both start and end are 0, then the whole chromosome is used.
202
+
203
+ On error (e.g., a missing chromosome), NULL is returned.
204
+ */
205
+ char *twobitSequence(TwoBit *tb, char *chrom, uint32_t start, uint32_t end) {
206
+ uint32_t i, tid=0;
207
+
208
+ //Get the chromosome ID
209
+ for(i=0; i<tb->hdr->nChroms; i++) {
210
+ if(strcmp(tb->cl->chrom[i], chrom) == 0) {
211
+ tid = i;
212
+ break;
213
+ }
214
+ }
215
+ if(tid == 0 && strcmp(tb->cl->chrom[i], chrom) != 0) return NULL;
216
+
217
+ //Get the start/end if not specified
218
+ if(start == end && end == 0) {
219
+ end = tb->idx->size[tid];
220
+ }
221
+
222
+ //Sanity check the bounds
223
+ if(end > tb->idx->size[tid]) return NULL;
224
+ if(start >= end) return NULL;
225
+
226
+ return constructSequence(tb, tid, start, end);
227
+ }
228
+
229
+ /*
230
+ Given a tid and a position, set the various mask variables to an appropriate block of Ns.
231
+
232
+ * If maskIdx is not -1, these are set to the first overlapping block (or maskIdx is set to the number of N blocks).
233
+ * If maskIdx is not -1 then it's incremented and maskStart/maskEnd set appropriately.
234
+
235
+ If the returned interval doesn't overlap the start/end range, then both values will be -1.
236
+ */
237
+ void getMask(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end, uint32_t *maskIdx, uint32_t *maskStart, uint32_t *maskEnd) {
238
+ if(*maskIdx == (uint32_t) -1) {
239
+ for((*maskIdx)=0; (*maskIdx)<tb->idx->nBlockCount[tid]; (*maskIdx)++) {
240
+ *maskStart = tb->idx->nBlockStart[tid][*maskIdx];
241
+ *maskEnd = (*maskStart) + tb->idx->nBlockSizes[tid][*maskIdx];
242
+ if(*maskEnd < start) continue;
243
+ if(*maskEnd >= start) break;
244
+ }
245
+ } else if(*maskIdx >= tb->idx->nBlockCount[tid]) {
246
+ *maskStart = (uint32_t) -1;
247
+ *maskEnd = (uint32_t) -1;
248
+ } else {
249
+ *maskIdx += 1;
250
+ if(*maskIdx >= tb->idx->nBlockCount[tid]) {
251
+ *maskStart = (uint32_t) -1;
252
+ *maskEnd = (uint32_t) -1;
253
+ } else {
254
+ *maskStart = tb->idx->nBlockStart[tid][*maskIdx];
255
+ *maskEnd = (*maskStart) + tb->idx->nBlockSizes[tid][*maskIdx];
256
+ }
257
+ }
258
+
259
+ //maskStart = maskEnd = -1 if no overlap
260
+ if(*maskIdx >= tb->idx->nBlockCount[tid] || *maskStart >= end) {
261
+ *maskStart = (uint32_t) -1;
262
+ *maskEnd = (uint32_t) -1;
263
+ }
264
+ }
265
+
266
/*
    Return the 4-bit mask with the low (4 - offset) bits set for offsets
    0, 1 and 2 (15, 7 and 3 respectively). Any other offset yields 1.
*/
uint8_t getByteMaskFromOffset(int offset) {
    if(offset < 0 || offset > 2) return 1;
    return (uint8_t) (15 >> offset);
}
277
+
278
+ void *twobitBasesWorker(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end, int fraction) {
279
+ void *out;
280
+ uint32_t tmp[4] = {0, 0, 0, 0}, len = end - start + (start % 4), i = 0, j = 0;
281
+ uint32_t blockStart, blockEnd, maskIdx = (uint32_t) -1, maskStart, maskEnd, foo;
282
+ uint8_t *bytes = NULL, mask = 0, offset;
283
+
284
+ if(fraction) {
285
+ out = malloc(4 * sizeof(double));
286
+ } else {
287
+ out = malloc(4 * sizeof(uint32_t));
288
+ }
289
+ if(!out) return NULL;
290
+
291
+ //There are 4 bases/byte
292
+ blockStart = start/4;
293
+ offset = start % 4;
294
+ blockEnd = end/4 + ((end % 4) ? 1 : 0);
295
+ bytes = malloc(blockEnd - blockStart);
296
+ if(!bytes) goto error;
297
+
298
+ //Set the initial mask, reset start/offset so we always deal with full bytes
299
+ mask = getByteMaskFromOffset(offset);
300
+ start = 4 * blockStart;
301
+ offset = 0;
302
+
303
+ if(twobitSeek(tb, tb->idx->offset[tid] + blockStart) != 0) goto error;
304
+ if(twobitRead(bytes, blockEnd - blockStart, 1, tb) != 1) goto error;
305
+
306
+ //Get the index/start/end of the next N-mask block
307
+ getMask(tb, tid, start, end, &maskIdx, &maskStart, &maskEnd);
308
+
309
+ while(i < len) {
310
+ // Check if we need to jump
311
+ if(maskIdx != -1 && start + i + 4 >= maskStart) {
312
+ if(start + i >= maskStart || start + i + 4 - offset > maskStart) {
313
+ //Jump iff the whole byte is inside an N block
314
+ if(start + i >= maskStart && start + i + 4 - offset < maskEnd) {
315
+ //iff we're fully in an N block then jump
316
+ i = maskEnd - start;
317
+ getMask(tb, tid, i, end, &maskIdx, &maskStart, &maskEnd);
318
+ offset = (start + i) % 4;
319
+ j = i / 4;
320
+ mask = getByteMaskFromOffset(offset);
321
+ i = 4 * j; //Now that the mask has been set, reset i to byte offsets
322
+ offset = 0;
323
+ continue;
324
+ }
325
+
326
+ //Set the mask, if appropriate
327
+ foo = 4*j + 4*blockStart; // The smallest position in the byte
328
+ if(mask & 1 && (foo + 3 >= maskStart && foo + 3 < maskEnd)) mask -= 1;
329
+ if(mask & 2 && (foo + 2 >= maskStart && foo + 2 < maskEnd)) mask -= 2;
330
+ if(mask & 4 && (foo + 1 >= maskStart && foo + 1 < maskEnd)) mask -= 4;
331
+ if(mask & 8 && (foo >= maskStart && foo < maskEnd)) mask -= 8;
332
+ if(foo + 4 > maskEnd) {
333
+ getMask(tb, tid, i, end, &maskIdx, &maskStart, &maskEnd);
334
+ continue;
335
+ }
336
+ }
337
+ }
338
+
339
+ //Ensure that anything after then end is masked
340
+ if(i+4>=len) {
341
+ if((mask & 1) && i+3>=len) mask -=1;
342
+ if((mask & 2) && i+2>=len) mask -=2;
343
+ if((mask & 4) && i+1>=len) mask -=4;
344
+ if((mask & 8) && i>=len) mask -=8;
345
+ }
346
+
347
+ foo = bytes[j++];
348
+ //Offset 3
349
+ if(mask & 1) {
350
+ tmp[foo & 3]++;
351
+ }
352
+ foo >>= 2;
353
+ mask >>= 1;
354
+ //Offset 2
355
+ if(mask & 1) {
356
+ tmp[foo & 3]++;
357
+ }
358
+ foo >>= 2;
359
+ mask >>= 1;
360
+ //Offset 1
361
+ if(mask & 1) {
362
+ tmp[foo & 3]++;
363
+ }
364
+ foo >>= 2;
365
+ mask >>= 1;
366
+ //Offset 0
367
+ if(mask & 1) {
368
+ tmp[foo & 3]++; // offset 0
369
+ }
370
+ i += 4;
371
+ mask = 15;
372
+ }
373
+ free(bytes);
374
+
375
+ //out is in TCAG order, since that's how 2bit is stored.
376
+ //However, for whatever reason I went with ACTG in the first release...
377
+ if(fraction) {
378
+ ((double*) out)[0] = ((double) tmp[2])/((double) len);
379
+ ((double*) out)[1] = ((double) tmp[1])/((double) len);
380
+ ((double*) out)[2] = ((double) tmp[0])/((double) len);
381
+ ((double*) out)[3] = ((double) tmp[3])/((double) len);
382
+ } else {
383
+ ((uint32_t*) out)[0] = tmp[2];
384
+ ((uint32_t*) out)[1] = tmp[1];
385
+ ((uint32_t*) out)[2] = tmp[0];
386
+ ((uint32_t*) out)[3] = tmp[3];
387
+ }
388
+
389
+ return out;
390
+
391
+ error:
392
+ if(out) free(out);
393
+ if(bytes) free(bytes);
394
+ return NULL;
395
+ }
396
+
397
+ void *twobitBases(TwoBit *tb, char *chrom, uint32_t start, uint32_t end, int fraction) {
398
+ uint32_t tid = 0, i;
399
+
400
+ //Get the chromosome ID
401
+ for(i=0; i<tb->hdr->nChroms; i++) {
402
+ if(strcmp(tb->cl->chrom[i], chrom) == 0) {
403
+ tid = i;
404
+ break;
405
+ }
406
+ }
407
+
408
+ if(tid == 0 && strcmp(tb->cl->chrom[i], chrom) != 0) return NULL;
409
+
410
+ //Get the start/end if not specified
411
+ if(start == end && end == 0) {
412
+ end = tb->idx->size[tid];
413
+ }
414
+
415
+ //Sanity check the bounds
416
+ if(end > tb->idx->size[tid]) return NULL;
417
+ if(start >= end) return NULL;
418
+
419
+ return twobitBasesWorker(tb, tid, start, end, fraction);
420
+ }
421
+
422
+ /*
423
+ Given a chromosome, chrom, return it's length. 0 is used if the chromosome isn't present.
424
+ */
425
+ uint32_t twobitChromLen(TwoBit *tb, char *chrom) {
426
+ uint32_t i;
427
+ for(i=0; i<tb->hdr->nChroms; i++) {
428
+ if(strcmp(tb->cl->chrom[i], chrom) == 0) return tb->idx->size[i];
429
+ }
430
+ return 0;
431
+ }
432
+
433
+ /*
434
+ Fill in tb->idx.
435
+
436
+ Note that the masked stuff will only be stored if storeMasked == 1, since it uses gobs of memory otherwise.
437
+ On error, tb->idx is left as NULL.
438
+ */
439
+ void twobitIndexRead(TwoBit *tb, int storeMasked) {
440
+ uint32_t i, data[2];
441
+ TwoBitMaskedIdx *idx = calloc(1, sizeof(TwoBitMaskedIdx));
442
+
443
+ //Allocation and error checking
444
+ if(!idx) return;
445
+ idx->size = malloc(tb->hdr->nChroms * sizeof(uint32_t));
446
+ idx->nBlockCount = calloc(tb->hdr->nChroms, sizeof(uint32_t));
447
+ idx->nBlockStart = calloc(tb->hdr->nChroms, sizeof(uint32_t*));
448
+ idx->nBlockSizes = calloc(tb->hdr->nChroms, sizeof(uint32_t*));
449
+ if(!idx->size) goto error;
450
+ if(!idx->nBlockCount) goto error;
451
+ if(!idx->nBlockStart) goto error;
452
+ if(!idx->nBlockSizes) goto error;
453
+ idx->maskBlockCount = calloc(tb->hdr->nChroms, sizeof(uint32_t));
454
+ if(!idx->maskBlockCount) goto error;
455
+ if(storeMasked) {
456
+ idx->maskBlockStart = calloc(tb->hdr->nChroms, sizeof(uint32_t*));
457
+ idx->maskBlockSizes = calloc(tb->hdr->nChroms, sizeof(uint32_t*));
458
+ if(!idx->maskBlockStart) goto error;
459
+ if(!idx->maskBlockSizes) goto error;
460
+ }
461
+ idx->offset = malloc(tb->hdr->nChroms * sizeof(uint64_t));
462
+ if(!idx->offset) goto error;
463
+
464
+ //Read in each chromosome/contig
465
+ for(i=0; i<tb->hdr->nChroms; i++) {
466
+ if(twobitSeek(tb, tb->cl->offset[i]) != 0) goto error;
467
+ if(twobitRead(data, sizeof(uint32_t), 2, tb) != 2) goto error;
468
+ idx->size[i] = data[0];
469
+ idx->nBlockCount[i] = data[1];
470
+
471
+ //Allocate the nBlock starts/sizes and fill them in
472
+ idx->nBlockStart[i] = malloc(idx->nBlockCount[i] * sizeof(uint32_t));
473
+ idx->nBlockSizes[i] = malloc(idx->nBlockCount[i] * sizeof(uint32_t));
474
+ if(!idx->nBlockStart[i]) goto error;
475
+ if(!idx->nBlockSizes[i]) goto error;
476
+ if(twobitRead(idx->nBlockStart[i], sizeof(uint32_t), idx->nBlockCount[i], tb) != idx->nBlockCount[i]) goto error;
477
+ if(twobitRead(idx->nBlockSizes[i], sizeof(uint32_t), idx->nBlockCount[i], tb) != idx->nBlockCount[i]) goto error;
478
+
479
+ //Get the masked block information
480
+ if(twobitRead(idx->maskBlockCount + i, sizeof(uint32_t), 1, tb) != 1) goto error;
481
+
482
+ //Allocate the maskBlock starts/sizes and fill them in
483
+ if(storeMasked) {
484
+ idx->maskBlockStart[i] = malloc(idx->maskBlockCount[i] * sizeof(uint32_t));
485
+ idx->maskBlockSizes[i] = malloc(idx->maskBlockCount[i] * sizeof(uint32_t));
486
+ if(!idx->maskBlockStart[i]) goto error;
487
+ if(!idx->maskBlockSizes[i]) goto error;
488
+ if(twobitRead(idx->maskBlockStart[i], sizeof(uint32_t), idx->maskBlockCount[i], tb) != idx->maskBlockCount[i]) goto error;
489
+ if(twobitRead(idx->maskBlockSizes[i], sizeof(uint32_t), idx->maskBlockCount[i], tb) != idx->maskBlockCount[i]) goto error;
490
+ } else {
491
+ if(twobitSeek(tb, twobitTell(tb) + 8 * idx->maskBlockCount[i]) != 0) goto error;
492
+ }
493
+
494
+ //Reserved
495
+ if(twobitRead(data, sizeof(uint32_t), 1, tb) != 1) goto error;
496
+
497
+ idx->offset[i] = twobitTell(tb);
498
+ }
499
+
500
+ tb->idx = idx;
501
+ return;
502
+
503
+ error:
504
+ if(idx) {
505
+ if(idx->size) free(idx->size);
506
+
507
+ if(idx->nBlockCount) free(idx->nBlockCount);
508
+ if(idx->nBlockStart) {
509
+ for(i=0; i<tb->hdr->nChroms; i++) {
510
+ if(idx->nBlockStart[i]) free(idx->nBlockStart[i]);
511
+ }
512
+ free(idx->nBlockStart[i]);
513
+ }
514
+ if(idx->nBlockSizes) {
515
+ for(i=0; i<tb->hdr->nChroms; i++) {
516
+ if(idx->nBlockSizes[i]) free(idx->nBlockSizes[i]);
517
+ }
518
+ free(idx->nBlockSizes[i]);
519
+ }
520
+
521
+ if(idx->maskBlockCount) free(idx->maskBlockCount);
522
+ if(idx->maskBlockStart) {
523
+ for(i=0; i<tb->hdr->nChroms; i++) {
524
+ if(idx->maskBlockStart[i]) free(idx->maskBlockStart[i]);
525
+ }
526
+ free(idx->maskBlockStart[i]);
527
+ }
528
+ if(idx->maskBlockSizes) {
529
+ for(i=0; i<tb->hdr->nChroms; i++) {
530
+ if(idx->maskBlockSizes[i]) free(idx->maskBlockSizes[i]);
531
+ }
532
+ free(idx->maskBlockSizes[i]);
533
+ }
534
+
535
+ if(idx->offset) free(idx->offset);
536
+
537
+ free(idx);
538
+ }
539
+ }
540
+
541
+ void twobitIndexDestroy(TwoBit *tb) {
542
+ uint32_t i;
543
+
544
+ if(tb->idx) {
545
+ if(tb->idx->size) free(tb->idx->size);
546
+
547
+ if(tb->idx->nBlockCount) free(tb->idx->nBlockCount);
548
+ if(tb->idx->nBlockStart) {
549
+ for(i=0; i<tb->hdr->nChroms; i++) {
550
+ if(tb->idx->nBlockStart[i]) free(tb->idx->nBlockStart[i]);
551
+ }
552
+ free(tb->idx->nBlockStart);
553
+ }
554
+ if(tb->idx->nBlockSizes) {
555
+ for(i=0; i<tb->hdr->nChroms; i++) {
556
+ if(tb->idx->nBlockSizes[i]) free(tb->idx->nBlockSizes[i]);
557
+ }
558
+ free(tb->idx->nBlockSizes);
559
+ }
560
+
561
+ if(tb->idx->maskBlockCount) free(tb->idx->maskBlockCount);
562
+ if(tb->idx->maskBlockStart) {
563
+ for(i=0; i<tb->hdr->nChroms; i++) {
564
+ if(tb->idx->maskBlockStart[i]) free(tb->idx->maskBlockStart[i]);
565
+ }
566
+ free(tb->idx->maskBlockStart);
567
+ }
568
+ if(tb->idx->maskBlockSizes) {
569
+ for(i=0; i<tb->hdr->nChroms; i++) {
570
+ if(tb->idx->maskBlockSizes[i]) free(tb->idx->maskBlockSizes[i]);
571
+ }
572
+ free(tb->idx->maskBlockSizes);
573
+ }
574
+
575
+ if(tb->idx->offset) free(tb->idx->offset);
576
+
577
+ free(tb->idx);
578
+ }
579
+ }
580
+
581
+ void twobitChromListRead(TwoBit *tb) {
582
+ uint32_t i;
583
+ uint8_t byte;
584
+ char *str = NULL;
585
+ TwoBitCL *cl = calloc(1, sizeof(TwoBitCL));
586
+
587
+ //Allocate cl and do error checking
588
+ if(!cl) goto error;
589
+ cl->chrom = calloc(tb->hdr->nChroms, sizeof(char*));
590
+ cl->offset = malloc(sizeof(uint32_t) * tb->hdr->nChroms);
591
+ if(!cl->chrom) goto error;
592
+ if(!cl->offset) goto error;
593
+
594
+ for(i=0; i<tb->hdr->nChroms; i++) {
595
+ //Get the string size (not null terminated!)
596
+ if(twobitRead(&byte, 1, 1, tb) != 1) goto error;
597
+
598
+ //Read in the string
599
+ str = calloc(1 + byte, sizeof(char));
600
+ if(!str) goto error;
601
+ if(twobitRead(str, 1, byte, tb) != byte) goto error;
602
+ cl->chrom[i] = str;
603
+ str = NULL;
604
+
605
+ //Read in the size
606
+ if(twobitRead(cl->offset + i, sizeof(uint32_t), 1, tb) != 1) goto error;
607
+ }
608
+
609
+ tb->cl = cl;
610
+ return;
611
+
612
+ error:
613
+ if(str) free(str);
614
+ if(cl) {
615
+ if(cl->offset) free(cl->offset);
616
+ if(cl->chrom) {
617
+ for(i=0; i<tb->hdr->nChroms; i++) {
618
+ if(cl->chrom[i]) free(cl->chrom[i]);
619
+ }
620
+ free(cl->chrom);
621
+ }
622
+ free(cl);
623
+ }
624
+ }
625
+
626
+ void twobitChromListDestroy(TwoBit *tb) {
627
+ uint32_t i;
628
+
629
+ if(tb->cl) {
630
+ if(tb->cl->offset) free(tb->cl->offset);
631
+ if(tb->cl->chrom) {
632
+ for(i=0; i<tb->hdr->nChroms; i++) {
633
+ if(tb->cl->chrom[i]) free(tb->cl->chrom[i]);
634
+ }
635
+ free(tb->cl->chrom);
636
+ }
637
+ free(tb->cl);
638
+ }
639
+ }
640
+
641
+ void twobitHdrRead(TwoBit *tb) {
642
+ //Read the first 16 bytes
643
+ uint32_t data[4];
644
+ TwoBitHeader *hdr = calloc(1, sizeof(TwoBitHeader));
645
+
646
+ if(!hdr) return;
647
+
648
+ if(twobitRead(data, 4, 4, tb) != 4) goto error;
649
+
650
+ //Magic
651
+ hdr->magic = data[0];
652
+ if(hdr->magic != 0x1A412743) {
653
+ fprintf(stderr, "[twobitHdrRead] Received an invalid file magic number (0x%"PRIx32")!\n", hdr->magic);
654
+ goto error;
655
+ }
656
+
657
+ //Version
658
+ hdr->version = data[1];
659
+ if(hdr->version != 0) {
660
+ fprintf(stderr, "[twobitHdrRead] The file version is %"PRIu32" while only version 0 is defined!\n", hdr->version);
661
+ goto error;
662
+ }
663
+
664
+ //Sequence Count
665
+ hdr->nChroms = data[2];
666
+ if(hdr->nChroms == 0) {
667
+ fprintf(stderr, "[twobitHdrRead] There are apparently no chromosomes/contigs in this file!\n");
668
+ goto error;
669
+ }
670
+
671
+ tb->hdr = hdr;
672
+ return;
673
+
674
+ error:
675
+ if(hdr) free(hdr);
676
+ }
677
+
678
+ void twobitHdrDestroy(TwoBit *tb) {
679
+ if(tb->hdr) free(tb->hdr);
680
+ }
681
+
682
+ void twobitClose(TwoBit *tb) {
683
+ if(tb) {
684
+ if(tb->fp) fclose(tb->fp);
685
+ if(tb->data) munmap(tb->data, tb->sz);
686
+ twobitChromListDestroy(tb);
687
+ twobitIndexDestroy(tb);
688
+ //N.B., this needs to be called last
689
+ twobitHdrDestroy(tb);
690
+ free(tb);
691
+ }
692
+ }
693
+
694
+ TwoBit* twobitOpen(char *fname, int storeMasked) {
695
+ int fd;
696
+ struct stat fs;
697
+ TwoBit *tb = calloc(1, sizeof(TwoBit));
698
+ if(!tb) return NULL;
699
+
700
+ tb->fp = fopen(fname, "rb");
701
+ if(!tb->fp) goto error;
702
+
703
+ //Try to memory map the whole thing, since these aren't terribly large
704
+ //Since we might be multithreading this in python, use shared memory
705
+ fd = fileno(tb->fp);
706
+ if(fstat(fd, &fs) == 0) {
707
+ tb->sz = (uint64_t) fs.st_size;
708
+ tb->data = mmap(NULL, fs.st_size, PROT_READ, MAP_SHARED, fd, 0);
709
+ if(tb->data) {
710
+ if(madvise(tb->data, fs.st_size, MADV_RANDOM) != 0) {
711
+ munmap(tb->data, fs.st_size);
712
+ tb->data = NULL;
713
+ }
714
+ }
715
+ }
716
+
717
+ //Attempt to read in the fixed header
718
+ twobitHdrRead(tb);
719
+ if(!tb->hdr) goto error;
720
+
721
+ //Read in the chromosome list
722
+ twobitChromListRead(tb);
723
+ if(!tb->cl) goto error;
724
+
725
+ //Read in the mask index
726
+ twobitIndexRead(tb, storeMasked);
727
+ if(!tb->idx) goto error;
728
+
729
+ return tb;
730
+
731
+ error:
732
+ twobitClose(tb);
733
+ return NULL;
734
+ }