bio-twobit 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,734 @@
1
+ #include <sys/types.h>
2
+ #include <sys/stat.h>
3
+ #include <sys/mman.h>
4
+ #include <unistd.h>
5
+ #include <inttypes.h>
6
+ #include <stdio.h>
7
+ #include <stdlib.h>
8
+ #include <string.h>
9
+ #include <ctype.h>
10
+ #include "2bit.h"
11
+
12
+ uint64_t twobitTell(TwoBit *tb);
13
+
14
+ /*
15
+ Read nmemb elements, each of size sz from the current file offset
16
+ into data. Return the number of elements read. On error, the return
17
+ value is either 0 or less than nmemb
18
+ */
19
+ size_t twobitRead(void *data, size_t sz, size_t nmemb, TwoBit *tb) {
20
+ if(tb->data) {
21
+ if(memcpy(data, tb->data + tb->offset, nmemb * sz) == NULL) return 0;
22
+ tb->offset += nmemb * sz;
23
+ return nmemb;
24
+ } else {
25
+ return fread(data, sz, nmemb, tb->fp);
26
+ }
27
+ }
28
+
29
+ /*
30
+ Seek to a specific position, which is essentially trivial for memmaped stuff
31
+
32
+ Returns: 0 on success, -1 on error
33
+ */
34
+ int twobitSeek(TwoBit *tb, uint64_t offset) {
35
+ if(offset >= tb->sz) return -1;
36
+ if(tb->data) {
37
+ tb->offset = offset;
38
+ return 0;
39
+ } else {
40
+ return fseek(tb->fp, (long) offset, SEEK_SET);
41
+ }
42
+ }
43
+
44
+ /*
45
+ Like ftell, but generalized to handle memmaped files
46
+
47
+ Returns the offset
48
+ */
49
+ uint64_t twobitTell(TwoBit *tb) {
50
+ if(tb->data) return tb->offset;
51
+ return (uint64_t) ftell(tb->fp);
52
+ }
53
+
/*
    Decode one base from a packed byte. A byte holds 4 bases, 2 bits each,
    with base 0 in the two most significant bits. offset (0-3) selects the base.
    The 2-bit codes 0,1,2,3 map to T,C,A,G.
*/
char byte2base(uint8_t byte, int offset) {
    static const char alphabet[4] = {'T', 'C', 'A', 'G'};
    int shift = 2 * (3 - offset);

    return alphabet[(byte >> shift) & 3];
}
64
+
/*
    Decode sz bases from the packed array byte into seq.

    seq:    output buffer with room for at least sz characters (no terminator is written)
    byte:   packed input, 4 bases per byte, first base in the most significant bits
    sz:     number of bases to produce
    offset: index (0-3) of the first wanted base inside byte[0]

    Exactly sz characters are written, even when the first byte is only
    partially used and sz is smaller than the bases remaining in it.
*/
void bytes2bases(char *seq, uint8_t *byte, uint32_t sz, int offset) {
    static const char bases[4] = {'T', 'C', 'A', 'G'};
    uint32_t pos = 0, remainder = 0, i = 0, k;
    uint8_t foo;

    if(sz == 0) return;

    // Deal with the first partial byte, never emitting more than sz bases
    if(offset != 0) {
        foo = byte[i++];
        while(offset < 4 && pos < sz) {
            seq[pos++] = bases[(foo >> (2 * (3 - offset))) & 3];
            offset++;
        }
        if(pos >= sz) return;
    }

    // Deal with everything else, with the possible exception of the last fractional byte
    remainder = (sz - pos) % 4;
    while(pos < sz - remainder) {
        foo = byte[i++];
        //Bases are unpacked from the least significant pair upwards, so fill back to front
        seq[pos + 3] = bases[foo & 3];
        foo >>= 2;
        seq[pos + 2] = bases[foo & 3];
        foo >>= 2;
        seq[pos + 1] = bases[foo & 3];
        foo >>= 2;
        seq[pos] = bases[foo & 3];
        pos += 4;
    }

    // Deal with the last partial byte
    if(remainder > 0) {
        foo = byte[i];
        for(k = 0; k < remainder; k++) {
            seq[pos++] = bases[(foo >> (2 * (3 - k))) & 3];
        }
    }
}
100
+
101
+ /*
102
+ Replace Ts (or whatever else is being used) with N as appropriate
103
+ */
104
+ void NMask(char *seq, TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end) {
105
+ uint32_t i, width, pos = 0;
106
+ uint32_t blockStart, blockEnd;
107
+
108
+ for(i=0; i<tb->idx->nBlockCount[tid]; i++) {
109
+ blockStart = tb->idx->nBlockStart[tid][i];
110
+ blockEnd = blockStart + tb->idx->nBlockSizes[tid][i];
111
+ if(blockEnd <= start) continue;
112
+ if(blockStart >= end) break;
113
+ if(blockStart < start) {
114
+ blockEnd = (blockEnd < end) ? blockEnd : end;
115
+ pos = 0;
116
+ width = blockEnd - start;
117
+ } else {
118
+ blockEnd = (blockEnd < end) ? blockEnd : end;
119
+ pos = blockStart - start;
120
+ width = blockEnd - blockStart;
121
+ }
122
+ width += pos;
123
+ for(; pos < width; pos++) seq[pos] = 'N';
124
+ }
125
+ }
126
+
127
+ /*
128
+ Replace uppercase with lower-case letters, if required
129
+ */
130
+ void softMask(char *seq, TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end) {
131
+ uint32_t i, width, pos = 0;
132
+ uint32_t blockStart, blockEnd;
133
+
134
+ if(!tb->idx->maskBlockStart) return;
135
+
136
+ for(i=0; i<tb->idx->maskBlockCount[tid]; i++) {
137
+ blockStart = tb->idx->maskBlockStart[tid][i];
138
+ blockEnd = blockStart + tb->idx->maskBlockSizes[tid][i];
139
+ if(blockEnd <= start) continue;
140
+ if(blockStart >= end) break;
141
+ if(blockStart < start) {
142
+ blockEnd = (blockEnd < end) ? blockEnd : end;
143
+ pos = 0;
144
+ width = blockEnd - start;
145
+ } else {
146
+ blockEnd = (blockEnd < end) ? blockEnd : end;
147
+ pos = blockStart - start;
148
+ width = blockEnd - blockStart;
149
+ }
150
+ width += pos;
151
+ for(; pos < width; pos++) {
152
+ if(seq[pos] != 'N') seq[pos] = tolower(seq[pos]);
153
+ }
154
+ }
155
+ }
156
+
157
+ /*
158
+ This is the worker function for twobitSequence, which mostly does error checking
159
+ */
160
+ char *constructSequence(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end) {
161
+ uint32_t sz = end - start + 1;
162
+ uint32_t blockStart, blockEnd;
163
+ char *seq = malloc(sz * sizeof(char));
164
+ uint8_t *bytes = NULL;
165
+ int offset;
166
+ if(!seq) return NULL;
167
+
168
+ //There are 4 bases/byte
169
+ blockStart = start/4;
170
+ offset = start % 4;
171
+ blockEnd = end/4 + ((end % 4) ? 1 : 0);
172
+ bytes = malloc(blockEnd - blockStart);
173
+ if(!bytes) goto error;
174
+
175
+ if(twobitSeek(tb, tb->idx->offset[tid] + blockStart) != 0) goto error;
176
+ if(twobitRead(bytes, blockEnd - blockStart, 1, tb) != 1) goto error;
177
+ bytes2bases(seq, bytes, sz - 1, offset);
178
+ free(bytes);
179
+
180
+ //Null terminate the output
181
+ seq[sz - 1] = '\0';
182
+
183
+ //N-mask everything
184
+ NMask(seq, tb, tid, start, end);
185
+
186
+ //Soft-mask if requested
187
+ softMask(seq, tb, tid, start, end);
188
+
189
+ return seq;
190
+
191
+ error:
192
+ if(seq) free(seq);
193
+ if(bytes) free(bytes);
194
+ return NULL;
195
+ }
196
+
197
+ /*
198
+ Given a chromosome, name, and optional range, return the corresponding sequence.
199
+
200
+ The start and end or 0-based half-open, so end-start is the number of bases.
201
+ If both start and end are 0, then the whole chromosome is used.
202
+
203
+ On error (e.g., a missing chromosome), NULL is returned.
204
+ */
205
+ char *twobitSequence(TwoBit *tb, char *chrom, uint32_t start, uint32_t end) {
206
+ uint32_t i, tid=0;
207
+
208
+ //Get the chromosome ID
209
+ for(i=0; i<tb->hdr->nChroms; i++) {
210
+ if(strcmp(tb->cl->chrom[i], chrom) == 0) {
211
+ tid = i;
212
+ break;
213
+ }
214
+ }
215
+ if(tid == 0 && strcmp(tb->cl->chrom[i], chrom) != 0) return NULL;
216
+
217
+ //Get the start/end if not specified
218
+ if(start == end && end == 0) {
219
+ end = tb->idx->size[tid];
220
+ }
221
+
222
+ //Sanity check the bounds
223
+ if(end > tb->idx->size[tid]) return NULL;
224
+ if(start >= end) return NULL;
225
+
226
+ return constructSequence(tb, tid, start, end);
227
+ }
228
+
229
+ /*
230
+ Given a tid and a position, set the various mask variables to an appropriate block of Ns.
231
+
232
+ * If maskIdx is not -1, these are set to the first overlapping block (or maskIdx is set to the number of N blocks).
233
+ * If maskIdx is not -1 then it's incremented and maskStart/maskEnd set appropriately.
234
+
235
+ If the returned interval doesn't overlap the start/end range, then both values will be -1.
236
+ */
237
+ void getMask(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end, uint32_t *maskIdx, uint32_t *maskStart, uint32_t *maskEnd) {
238
+ if(*maskIdx == (uint32_t) -1) {
239
+ for((*maskIdx)=0; (*maskIdx)<tb->idx->nBlockCount[tid]; (*maskIdx)++) {
240
+ *maskStart = tb->idx->nBlockStart[tid][*maskIdx];
241
+ *maskEnd = (*maskStart) + tb->idx->nBlockSizes[tid][*maskIdx];
242
+ if(*maskEnd < start) continue;
243
+ if(*maskEnd >= start) break;
244
+ }
245
+ } else if(*maskIdx >= tb->idx->nBlockCount[tid]) {
246
+ *maskStart = (uint32_t) -1;
247
+ *maskEnd = (uint32_t) -1;
248
+ } else {
249
+ *maskIdx += 1;
250
+ if(*maskIdx >= tb->idx->nBlockCount[tid]) {
251
+ *maskStart = (uint32_t) -1;
252
+ *maskEnd = (uint32_t) -1;
253
+ } else {
254
+ *maskStart = tb->idx->nBlockStart[tid][*maskIdx];
255
+ *maskEnd = (*maskStart) + tb->idx->nBlockSizes[tid][*maskIdx];
256
+ }
257
+ }
258
+
259
+ //maskStart = maskEnd = -1 if no overlap
260
+ if(*maskIdx >= tb->idx->nBlockCount[tid] || *maskStart >= end) {
261
+ *maskStart = (uint32_t) -1;
262
+ *maskEnd = (uint32_t) -1;
263
+ }
264
+ }
265
+
/*
    Return the 4-bit mask selecting the bases of a byte from offset onward:
    0 -> 15 (all four), 1 -> 7, 2 -> 3, anything else -> 1.
    The lowest bit corresponds to the last base of the byte.
*/
uint8_t getByteMaskFromOffset(int offset) {
    if(offset >= 0 && offset <= 2) return (uint8_t) (15 >> offset);
    return 1;
}
277
+
278
+ void *twobitBasesWorker(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end, int fraction) {
279
+ void *out;
280
+ uint32_t tmp[4] = {0, 0, 0, 0}, len = end - start + (start % 4), i = 0, j = 0;
281
+ uint32_t blockStart, blockEnd, maskIdx = (uint32_t) -1, maskStart, maskEnd, foo;
282
+ uint8_t *bytes = NULL, mask = 0, offset;
283
+
284
+ if(fraction) {
285
+ out = malloc(4 * sizeof(double));
286
+ } else {
287
+ out = malloc(4 * sizeof(uint32_t));
288
+ }
289
+ if(!out) return NULL;
290
+
291
+ //There are 4 bases/byte
292
+ blockStart = start/4;
293
+ offset = start % 4;
294
+ blockEnd = end/4 + ((end % 4) ? 1 : 0);
295
+ bytes = malloc(blockEnd - blockStart);
296
+ if(!bytes) goto error;
297
+
298
+ //Set the initial mask, reset start/offset so we always deal with full bytes
299
+ mask = getByteMaskFromOffset(offset);
300
+ start = 4 * blockStart;
301
+ offset = 0;
302
+
303
+ if(twobitSeek(tb, tb->idx->offset[tid] + blockStart) != 0) goto error;
304
+ if(twobitRead(bytes, blockEnd - blockStart, 1, tb) != 1) goto error;
305
+
306
+ //Get the index/start/end of the next N-mask block
307
+ getMask(tb, tid, start, end, &maskIdx, &maskStart, &maskEnd);
308
+
309
+ while(i < len) {
310
+ // Check if we need to jump
311
+ if(maskIdx != -1 && start + i + 4 >= maskStart) {
312
+ if(start + i >= maskStart || start + i + 4 - offset > maskStart) {
313
+ //Jump iff the whole byte is inside an N block
314
+ if(start + i >= maskStart && start + i + 4 - offset < maskEnd) {
315
+ //iff we're fully in an N block then jump
316
+ i = maskEnd - start;
317
+ getMask(tb, tid, i, end, &maskIdx, &maskStart, &maskEnd);
318
+ offset = (start + i) % 4;
319
+ j = i / 4;
320
+ mask = getByteMaskFromOffset(offset);
321
+ i = 4 * j; //Now that the mask has been set, reset i to byte offsets
322
+ offset = 0;
323
+ continue;
324
+ }
325
+
326
+ //Set the mask, if appropriate
327
+ foo = 4*j + 4*blockStart; // The smallest position in the byte
328
+ if(mask & 1 && (foo + 3 >= maskStart && foo + 3 < maskEnd)) mask -= 1;
329
+ if(mask & 2 && (foo + 2 >= maskStart && foo + 2 < maskEnd)) mask -= 2;
330
+ if(mask & 4 && (foo + 1 >= maskStart && foo + 1 < maskEnd)) mask -= 4;
331
+ if(mask & 8 && (foo >= maskStart && foo < maskEnd)) mask -= 8;
332
+ if(foo + 4 > maskEnd) {
333
+ getMask(tb, tid, i, end, &maskIdx, &maskStart, &maskEnd);
334
+ continue;
335
+ }
336
+ }
337
+ }
338
+
339
+ //Ensure that anything after then end is masked
340
+ if(i+4>=len) {
341
+ if((mask & 1) && i+3>=len) mask -=1;
342
+ if((mask & 2) && i+2>=len) mask -=2;
343
+ if((mask & 4) && i+1>=len) mask -=4;
344
+ if((mask & 8) && i>=len) mask -=8;
345
+ }
346
+
347
+ foo = bytes[j++];
348
+ //Offset 3
349
+ if(mask & 1) {
350
+ tmp[foo & 3]++;
351
+ }
352
+ foo >>= 2;
353
+ mask >>= 1;
354
+ //Offset 2
355
+ if(mask & 1) {
356
+ tmp[foo & 3]++;
357
+ }
358
+ foo >>= 2;
359
+ mask >>= 1;
360
+ //Offset 1
361
+ if(mask & 1) {
362
+ tmp[foo & 3]++;
363
+ }
364
+ foo >>= 2;
365
+ mask >>= 1;
366
+ //Offset 0
367
+ if(mask & 1) {
368
+ tmp[foo & 3]++; // offset 0
369
+ }
370
+ i += 4;
371
+ mask = 15;
372
+ }
373
+ free(bytes);
374
+
375
+ //out is in TCAG order, since that's how 2bit is stored.
376
+ //However, for whatever reason I went with ACTG in the first release...
377
+ if(fraction) {
378
+ ((double*) out)[0] = ((double) tmp[2])/((double) len);
379
+ ((double*) out)[1] = ((double) tmp[1])/((double) len);
380
+ ((double*) out)[2] = ((double) tmp[0])/((double) len);
381
+ ((double*) out)[3] = ((double) tmp[3])/((double) len);
382
+ } else {
383
+ ((uint32_t*) out)[0] = tmp[2];
384
+ ((uint32_t*) out)[1] = tmp[1];
385
+ ((uint32_t*) out)[2] = tmp[0];
386
+ ((uint32_t*) out)[3] = tmp[3];
387
+ }
388
+
389
+ return out;
390
+
391
+ error:
392
+ if(out) free(out);
393
+ if(bytes) free(bytes);
394
+ return NULL;
395
+ }
396
+
397
+ void *twobitBases(TwoBit *tb, char *chrom, uint32_t start, uint32_t end, int fraction) {
398
+ uint32_t tid = 0, i;
399
+
400
+ //Get the chromosome ID
401
+ for(i=0; i<tb->hdr->nChroms; i++) {
402
+ if(strcmp(tb->cl->chrom[i], chrom) == 0) {
403
+ tid = i;
404
+ break;
405
+ }
406
+ }
407
+
408
+ if(tid == 0 && strcmp(tb->cl->chrom[i], chrom) != 0) return NULL;
409
+
410
+ //Get the start/end if not specified
411
+ if(start == end && end == 0) {
412
+ end = tb->idx->size[tid];
413
+ }
414
+
415
+ //Sanity check the bounds
416
+ if(end > tb->idx->size[tid]) return NULL;
417
+ if(start >= end) return NULL;
418
+
419
+ return twobitBasesWorker(tb, tid, start, end, fraction);
420
+ }
421
+
422
+ /*
423
+ Given a chromosome, chrom, return it's length. 0 is used if the chromosome isn't present.
424
+ */
425
+ uint32_t twobitChromLen(TwoBit *tb, char *chrom) {
426
+ uint32_t i;
427
+ for(i=0; i<tb->hdr->nChroms; i++) {
428
+ if(strcmp(tb->cl->chrom[i], chrom) == 0) return tb->idx->size[i];
429
+ }
430
+ return 0;
431
+ }
432
+
433
+ /*
434
+ Fill in tb->idx.
435
+
436
+ Note that the masked stuff will only be stored if storeMasked == 1, since it uses gobs of memory otherwise.
437
+ On error, tb->idx is left as NULL.
438
+ */
439
+ void twobitIndexRead(TwoBit *tb, int storeMasked) {
440
+ uint32_t i, data[2];
441
+ TwoBitMaskedIdx *idx = calloc(1, sizeof(TwoBitMaskedIdx));
442
+
443
+ //Allocation and error checking
444
+ if(!idx) return;
445
+ idx->size = malloc(tb->hdr->nChroms * sizeof(uint32_t));
446
+ idx->nBlockCount = calloc(tb->hdr->nChroms, sizeof(uint32_t));
447
+ idx->nBlockStart = calloc(tb->hdr->nChroms, sizeof(uint32_t*));
448
+ idx->nBlockSizes = calloc(tb->hdr->nChroms, sizeof(uint32_t*));
449
+ if(!idx->size) goto error;
450
+ if(!idx->nBlockCount) goto error;
451
+ if(!idx->nBlockStart) goto error;
452
+ if(!idx->nBlockSizes) goto error;
453
+ idx->maskBlockCount = calloc(tb->hdr->nChroms, sizeof(uint32_t));
454
+ if(!idx->maskBlockCount) goto error;
455
+ if(storeMasked) {
456
+ idx->maskBlockStart = calloc(tb->hdr->nChroms, sizeof(uint32_t*));
457
+ idx->maskBlockSizes = calloc(tb->hdr->nChroms, sizeof(uint32_t*));
458
+ if(!idx->maskBlockStart) goto error;
459
+ if(!idx->maskBlockSizes) goto error;
460
+ }
461
+ idx->offset = malloc(tb->hdr->nChroms * sizeof(uint64_t));
462
+ if(!idx->offset) goto error;
463
+
464
+ //Read in each chromosome/contig
465
+ for(i=0; i<tb->hdr->nChroms; i++) {
466
+ if(twobitSeek(tb, tb->cl->offset[i]) != 0) goto error;
467
+ if(twobitRead(data, sizeof(uint32_t), 2, tb) != 2) goto error;
468
+ idx->size[i] = data[0];
469
+ idx->nBlockCount[i] = data[1];
470
+
471
+ //Allocate the nBlock starts/sizes and fill them in
472
+ idx->nBlockStart[i] = malloc(idx->nBlockCount[i] * sizeof(uint32_t));
473
+ idx->nBlockSizes[i] = malloc(idx->nBlockCount[i] * sizeof(uint32_t));
474
+ if(!idx->nBlockStart[i]) goto error;
475
+ if(!idx->nBlockSizes[i]) goto error;
476
+ if(twobitRead(idx->nBlockStart[i], sizeof(uint32_t), idx->nBlockCount[i], tb) != idx->nBlockCount[i]) goto error;
477
+ if(twobitRead(idx->nBlockSizes[i], sizeof(uint32_t), idx->nBlockCount[i], tb) != idx->nBlockCount[i]) goto error;
478
+
479
+ //Get the masked block information
480
+ if(twobitRead(idx->maskBlockCount + i, sizeof(uint32_t), 1, tb) != 1) goto error;
481
+
482
+ //Allocate the maskBlock starts/sizes and fill them in
483
+ if(storeMasked) {
484
+ idx->maskBlockStart[i] = malloc(idx->maskBlockCount[i] * sizeof(uint32_t));
485
+ idx->maskBlockSizes[i] = malloc(idx->maskBlockCount[i] * sizeof(uint32_t));
486
+ if(!idx->maskBlockStart[i]) goto error;
487
+ if(!idx->maskBlockSizes[i]) goto error;
488
+ if(twobitRead(idx->maskBlockStart[i], sizeof(uint32_t), idx->maskBlockCount[i], tb) != idx->maskBlockCount[i]) goto error;
489
+ if(twobitRead(idx->maskBlockSizes[i], sizeof(uint32_t), idx->maskBlockCount[i], tb) != idx->maskBlockCount[i]) goto error;
490
+ } else {
491
+ if(twobitSeek(tb, twobitTell(tb) + 8 * idx->maskBlockCount[i]) != 0) goto error;
492
+ }
493
+
494
+ //Reserved
495
+ if(twobitRead(data, sizeof(uint32_t), 1, tb) != 1) goto error;
496
+
497
+ idx->offset[i] = twobitTell(tb);
498
+ }
499
+
500
+ tb->idx = idx;
501
+ return;
502
+
503
+ error:
504
+ if(idx) {
505
+ if(idx->size) free(idx->size);
506
+
507
+ if(idx->nBlockCount) free(idx->nBlockCount);
508
+ if(idx->nBlockStart) {
509
+ for(i=0; i<tb->hdr->nChroms; i++) {
510
+ if(idx->nBlockStart[i]) free(idx->nBlockStart[i]);
511
+ }
512
+ free(idx->nBlockStart[i]);
513
+ }
514
+ if(idx->nBlockSizes) {
515
+ for(i=0; i<tb->hdr->nChroms; i++) {
516
+ if(idx->nBlockSizes[i]) free(idx->nBlockSizes[i]);
517
+ }
518
+ free(idx->nBlockSizes[i]);
519
+ }
520
+
521
+ if(idx->maskBlockCount) free(idx->maskBlockCount);
522
+ if(idx->maskBlockStart) {
523
+ for(i=0; i<tb->hdr->nChroms; i++) {
524
+ if(idx->maskBlockStart[i]) free(idx->maskBlockStart[i]);
525
+ }
526
+ free(idx->maskBlockStart[i]);
527
+ }
528
+ if(idx->maskBlockSizes) {
529
+ for(i=0; i<tb->hdr->nChroms; i++) {
530
+ if(idx->maskBlockSizes[i]) free(idx->maskBlockSizes[i]);
531
+ }
532
+ free(idx->maskBlockSizes[i]);
533
+ }
534
+
535
+ if(idx->offset) free(idx->offset);
536
+
537
+ free(idx);
538
+ }
539
+ }
540
+
541
+ void twobitIndexDestroy(TwoBit *tb) {
542
+ uint32_t i;
543
+
544
+ if(tb->idx) {
545
+ if(tb->idx->size) free(tb->idx->size);
546
+
547
+ if(tb->idx->nBlockCount) free(tb->idx->nBlockCount);
548
+ if(tb->idx->nBlockStart) {
549
+ for(i=0; i<tb->hdr->nChroms; i++) {
550
+ if(tb->idx->nBlockStart[i]) free(tb->idx->nBlockStart[i]);
551
+ }
552
+ free(tb->idx->nBlockStart);
553
+ }
554
+ if(tb->idx->nBlockSizes) {
555
+ for(i=0; i<tb->hdr->nChroms; i++) {
556
+ if(tb->idx->nBlockSizes[i]) free(tb->idx->nBlockSizes[i]);
557
+ }
558
+ free(tb->idx->nBlockSizes);
559
+ }
560
+
561
+ if(tb->idx->maskBlockCount) free(tb->idx->maskBlockCount);
562
+ if(tb->idx->maskBlockStart) {
563
+ for(i=0; i<tb->hdr->nChroms; i++) {
564
+ if(tb->idx->maskBlockStart[i]) free(tb->idx->maskBlockStart[i]);
565
+ }
566
+ free(tb->idx->maskBlockStart);
567
+ }
568
+ if(tb->idx->maskBlockSizes) {
569
+ for(i=0; i<tb->hdr->nChroms; i++) {
570
+ if(tb->idx->maskBlockSizes[i]) free(tb->idx->maskBlockSizes[i]);
571
+ }
572
+ free(tb->idx->maskBlockSizes);
573
+ }
574
+
575
+ if(tb->idx->offset) free(tb->idx->offset);
576
+
577
+ free(tb->idx);
578
+ }
579
+ }
580
+
581
+ void twobitChromListRead(TwoBit *tb) {
582
+ uint32_t i;
583
+ uint8_t byte;
584
+ char *str = NULL;
585
+ TwoBitCL *cl = calloc(1, sizeof(TwoBitCL));
586
+
587
+ //Allocate cl and do error checking
588
+ if(!cl) goto error;
589
+ cl->chrom = calloc(tb->hdr->nChroms, sizeof(char*));
590
+ cl->offset = malloc(sizeof(uint32_t) * tb->hdr->nChroms);
591
+ if(!cl->chrom) goto error;
592
+ if(!cl->offset) goto error;
593
+
594
+ for(i=0; i<tb->hdr->nChroms; i++) {
595
+ //Get the string size (not null terminated!)
596
+ if(twobitRead(&byte, 1, 1, tb) != 1) goto error;
597
+
598
+ //Read in the string
599
+ str = calloc(1 + byte, sizeof(char));
600
+ if(!str) goto error;
601
+ if(twobitRead(str, 1, byte, tb) != byte) goto error;
602
+ cl->chrom[i] = str;
603
+ str = NULL;
604
+
605
+ //Read in the size
606
+ if(twobitRead(cl->offset + i, sizeof(uint32_t), 1, tb) != 1) goto error;
607
+ }
608
+
609
+ tb->cl = cl;
610
+ return;
611
+
612
+ error:
613
+ if(str) free(str);
614
+ if(cl) {
615
+ if(cl->offset) free(cl->offset);
616
+ if(cl->chrom) {
617
+ for(i=0; i<tb->hdr->nChroms; i++) {
618
+ if(cl->chrom[i]) free(cl->chrom[i]);
619
+ }
620
+ free(cl->chrom);
621
+ }
622
+ free(cl);
623
+ }
624
+ }
625
+
626
+ void twobitChromListDestroy(TwoBit *tb) {
627
+ uint32_t i;
628
+
629
+ if(tb->cl) {
630
+ if(tb->cl->offset) free(tb->cl->offset);
631
+ if(tb->cl->chrom) {
632
+ for(i=0; i<tb->hdr->nChroms; i++) {
633
+ if(tb->cl->chrom[i]) free(tb->cl->chrom[i]);
634
+ }
635
+ free(tb->cl->chrom);
636
+ }
637
+ free(tb->cl);
638
+ }
639
+ }
640
+
641
+ void twobitHdrRead(TwoBit *tb) {
642
+ //Read the first 16 bytes
643
+ uint32_t data[4];
644
+ TwoBitHeader *hdr = calloc(1, sizeof(TwoBitHeader));
645
+
646
+ if(!hdr) return;
647
+
648
+ if(twobitRead(data, 4, 4, tb) != 4) goto error;
649
+
650
+ //Magic
651
+ hdr->magic = data[0];
652
+ if(hdr->magic != 0x1A412743) {
653
+ fprintf(stderr, "[twobitHdrRead] Received an invalid file magic number (0x%"PRIx32")!\n", hdr->magic);
654
+ goto error;
655
+ }
656
+
657
+ //Version
658
+ hdr->version = data[1];
659
+ if(hdr->version != 0) {
660
+ fprintf(stderr, "[twobitHdrRead] The file version is %"PRIu32" while only version 0 is defined!\n", hdr->version);
661
+ goto error;
662
+ }
663
+
664
+ //Sequence Count
665
+ hdr->nChroms = data[2];
666
+ if(hdr->nChroms == 0) {
667
+ fprintf(stderr, "[twobitHdrRead] There are apparently no chromosomes/contigs in this file!\n");
668
+ goto error;
669
+ }
670
+
671
+ tb->hdr = hdr;
672
+ return;
673
+
674
+ error:
675
+ if(hdr) free(hdr);
676
+ }
677
+
678
+ void twobitHdrDestroy(TwoBit *tb) {
679
+ if(tb->hdr) free(tb->hdr);
680
+ }
681
+
682
+ void twobitClose(TwoBit *tb) {
683
+ if(tb) {
684
+ if(tb->fp) fclose(tb->fp);
685
+ if(tb->data) munmap(tb->data, tb->sz);
686
+ twobitChromListDestroy(tb);
687
+ twobitIndexDestroy(tb);
688
+ //N.B., this needs to be called last
689
+ twobitHdrDestroy(tb);
690
+ free(tb);
691
+ }
692
+ }
693
+
694
+ TwoBit* twobitOpen(char *fname, int storeMasked) {
695
+ int fd;
696
+ struct stat fs;
697
+ TwoBit *tb = calloc(1, sizeof(TwoBit));
698
+ if(!tb) return NULL;
699
+
700
+ tb->fp = fopen(fname, "rb");
701
+ if(!tb->fp) goto error;
702
+
703
+ //Try to memory map the whole thing, since these aren't terribly large
704
+ //Since we might be multithreading this in python, use shared memory
705
+ fd = fileno(tb->fp);
706
+ if(fstat(fd, &fs) == 0) {
707
+ tb->sz = (uint64_t) fs.st_size;
708
+ tb->data = mmap(NULL, fs.st_size, PROT_READ, MAP_SHARED, fd, 0);
709
+ if(tb->data) {
710
+ if(madvise(tb->data, fs.st_size, MADV_RANDOM) != 0) {
711
+ munmap(tb->data, fs.st_size);
712
+ tb->data = NULL;
713
+ }
714
+ }
715
+ }
716
+
717
+ //Attempt to read in the fixed header
718
+ twobitHdrRead(tb);
719
+ if(!tb->hdr) goto error;
720
+
721
+ //Read in the chromosome list
722
+ twobitChromListRead(tb);
723
+ if(!tb->cl) goto error;
724
+
725
+ //Read in the mask index
726
+ twobitIndexRead(tb, storeMasked);
727
+ if(!tb->idx) goto error;
728
+
729
+ return tb;
730
+
731
+ error:
732
+ twobitClose(tb);
733
+ return NULL;
734
+ }