bio-twobit 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.rubocop.yml +13 -0
- data/Gemfile +14 -0
- data/LICENSE.txt +21 -0
- data/README.md +98 -0
- data/Rakefile +25 -0
- data/bio-twobit.gemspec +29 -0
- data/ext/bio/twobit/2bit.c +734 -0
- data/ext/bio/twobit/2bit.h +134 -0
- data/ext/bio/twobit/LICENSE +24 -0
- data/ext/bio/twobit/extconf.rb +5 -0
- data/ext/bio/twobit/twobit.c +524 -0
- data/ext/bio/twobit/twobit.h +7 -0
- data/lib/bio/twobit/version.rb +7 -0
- data/lib/bio/twobit.rb +40 -0
- metadata +59 -0
@@ -0,0 +1,734 @@
|
|
1
|
+
#include <sys/types.h>
|
2
|
+
#include <sys/stat.h>
|
3
|
+
#include <sys/mman.h>
|
4
|
+
#include <unistd.h>
|
5
|
+
#include <inttypes.h>
|
6
|
+
#include <stdio.h>
|
7
|
+
#include <stdlib.h>
|
8
|
+
#include <string.h>
|
9
|
+
#include <ctype.h>
|
10
|
+
#include "2bit.h"
|
11
|
+
|
12
|
+
uint64_t twobitTell(TwoBit *tb);
|
13
|
+
|
14
|
+
/*
|
15
|
+
Read nmemb elements, each of size sz from the current file offset
|
16
|
+
into data. Return the number of elements read. On error, the return
|
17
|
+
value is either 0 or less than nmemb
|
18
|
+
*/
|
19
|
+
size_t twobitRead(void *data, size_t sz, size_t nmemb, TwoBit *tb) {
|
20
|
+
if(tb->data) {
|
21
|
+
if(memcpy(data, tb->data + tb->offset, nmemb * sz) == NULL) return 0;
|
22
|
+
tb->offset += nmemb * sz;
|
23
|
+
return nmemb;
|
24
|
+
} else {
|
25
|
+
return fread(data, sz, nmemb, tb->fp);
|
26
|
+
}
|
27
|
+
}
|
28
|
+
|
29
|
+
/*
|
30
|
+
Seek to a specific position, which is essentially trivial for memmaped stuff
|
31
|
+
|
32
|
+
Returns: 0 on success, -1 on error
|
33
|
+
*/
|
34
|
+
int twobitSeek(TwoBit *tb, uint64_t offset) {
|
35
|
+
if(offset >= tb->sz) return -1;
|
36
|
+
if(tb->data) {
|
37
|
+
tb->offset = offset;
|
38
|
+
return 0;
|
39
|
+
} else {
|
40
|
+
return fseek(tb->fp, (long) offset, SEEK_SET);
|
41
|
+
}
|
42
|
+
}
|
43
|
+
|
44
|
+
/*
|
45
|
+
Like ftell, but generalized to handle memmaped files
|
46
|
+
|
47
|
+
Returns the offset
|
48
|
+
*/
|
49
|
+
uint64_t twobitTell(TwoBit *tb) {
|
50
|
+
if(tb->data) return tb->offset;
|
51
|
+
return (uint64_t) ftell(tb->fp);
|
52
|
+
}
|
53
|
+
|
54
|
+
/*
    Decode one base from a packed byte.

    A byte holds 4 bases of 2 bits each; offset 0 selects the two most
    significant bits and offset 3 the two least significant ones. The 2-bit
    values 0, 1, 2, 3 decode to 'T', 'C', 'A', 'G' respectively.

    offset is expected to be in the range 0..3.
*/
char byte2base(uint8_t byte, int offset) {
    const char alphabet[4] = {'T', 'C', 'A', 'G'};
    int shift = 2 * (3 - offset); //Bit position of the requested pair

    return alphabet[(byte >> shift) & 3];
}
|
64
|
+
|
65
|
+
/*
    Decode exactly sz bases from the packed 2bit data in byte into seq.

    seq    Output buffer with room for at least sz characters. No NUL
           terminator is written.
    byte   Packed input, 4 bases per byte with the first base in the two most
           significant bits. Values 0, 1, 2, 3 decode to 'T', 'C', 'A', 'G'.
    sz     Number of bases to produce.
    offset Index (0..3) of the first wanted base inside byte[0].

    At most sz characters are ever written, including when the request is
    shorter than what remains of the first, partially used byte.
*/
void bytes2bases(char *seq, uint8_t *byte, uint32_t sz, int offset) {
    static const char bases[4] = {'T', 'C', 'A', 'G'};
    uint32_t pos = 0, remainder = 0, i = 0;
    uint8_t foo;

    if(sz == 0) return;

    // Deal with the first partial byte, stopping as soon as sz bases are out
    if(offset != 0) {
        foo = byte[i++];
        while(offset < 4 && pos < sz) {
            seq[pos++] = bases[(foo >> (2 * (3 - offset))) & 3];
            offset++;
        }
        if(pos >= sz) return;
    }

    // Deal with everything else, with the possible exception of the last fractional byte
    remainder = (sz - pos) % 4;
    while(pos < sz - remainder) {
        foo = byte[i++];
        seq[pos]     = bases[(foo >> 6) & 3];
        seq[pos + 1] = bases[(foo >> 4) & 3];
        seq[pos + 2] = bases[(foo >> 2) & 3];
        seq[pos + 3] = bases[foo & 3];
        pos += 4;
    }

    // Deal with the last partial byte
    if(remainder > 0) {
        foo = byte[i];
        for(offset = 0; (uint32_t) offset < remainder; offset++) {
            seq[pos++] = bases[(foo >> (2 * (3 - offset))) & 3];
        }
    }
}
|
100
|
+
|
101
|
+
/*
|
102
|
+
Replace Ts (or whatever else is being used) with N as appropriate
|
103
|
+
*/
|
104
|
+
void NMask(char *seq, TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end) {
|
105
|
+
uint32_t i, width, pos = 0;
|
106
|
+
uint32_t blockStart, blockEnd;
|
107
|
+
|
108
|
+
for(i=0; i<tb->idx->nBlockCount[tid]; i++) {
|
109
|
+
blockStart = tb->idx->nBlockStart[tid][i];
|
110
|
+
blockEnd = blockStart + tb->idx->nBlockSizes[tid][i];
|
111
|
+
if(blockEnd <= start) continue;
|
112
|
+
if(blockStart >= end) break;
|
113
|
+
if(blockStart < start) {
|
114
|
+
blockEnd = (blockEnd < end) ? blockEnd : end;
|
115
|
+
pos = 0;
|
116
|
+
width = blockEnd - start;
|
117
|
+
} else {
|
118
|
+
blockEnd = (blockEnd < end) ? blockEnd : end;
|
119
|
+
pos = blockStart - start;
|
120
|
+
width = blockEnd - blockStart;
|
121
|
+
}
|
122
|
+
width += pos;
|
123
|
+
for(; pos < width; pos++) seq[pos] = 'N';
|
124
|
+
}
|
125
|
+
}
|
126
|
+
|
127
|
+
/*
|
128
|
+
Replace uppercase with lower-case letters, if required
|
129
|
+
*/
|
130
|
+
void softMask(char *seq, TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end) {
|
131
|
+
uint32_t i, width, pos = 0;
|
132
|
+
uint32_t blockStart, blockEnd;
|
133
|
+
|
134
|
+
if(!tb->idx->maskBlockStart) return;
|
135
|
+
|
136
|
+
for(i=0; i<tb->idx->maskBlockCount[tid]; i++) {
|
137
|
+
blockStart = tb->idx->maskBlockStart[tid][i];
|
138
|
+
blockEnd = blockStart + tb->idx->maskBlockSizes[tid][i];
|
139
|
+
if(blockEnd <= start) continue;
|
140
|
+
if(blockStart >= end) break;
|
141
|
+
if(blockStart < start) {
|
142
|
+
blockEnd = (blockEnd < end) ? blockEnd : end;
|
143
|
+
pos = 0;
|
144
|
+
width = blockEnd - start;
|
145
|
+
} else {
|
146
|
+
blockEnd = (blockEnd < end) ? blockEnd : end;
|
147
|
+
pos = blockStart - start;
|
148
|
+
width = blockEnd - blockStart;
|
149
|
+
}
|
150
|
+
width += pos;
|
151
|
+
for(; pos < width; pos++) {
|
152
|
+
if(seq[pos] != 'N') seq[pos] = tolower(seq[pos]);
|
153
|
+
}
|
154
|
+
}
|
155
|
+
}
|
156
|
+
|
157
|
+
/*
|
158
|
+
This is the worker function for twobitSequence, which mostly does error checking
|
159
|
+
*/
|
160
|
+
char *constructSequence(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end) {
|
161
|
+
uint32_t sz = end - start + 1;
|
162
|
+
uint32_t blockStart, blockEnd;
|
163
|
+
char *seq = malloc(sz * sizeof(char));
|
164
|
+
uint8_t *bytes = NULL;
|
165
|
+
int offset;
|
166
|
+
if(!seq) return NULL;
|
167
|
+
|
168
|
+
//There are 4 bases/byte
|
169
|
+
blockStart = start/4;
|
170
|
+
offset = start % 4;
|
171
|
+
blockEnd = end/4 + ((end % 4) ? 1 : 0);
|
172
|
+
bytes = malloc(blockEnd - blockStart);
|
173
|
+
if(!bytes) goto error;
|
174
|
+
|
175
|
+
if(twobitSeek(tb, tb->idx->offset[tid] + blockStart) != 0) goto error;
|
176
|
+
if(twobitRead(bytes, blockEnd - blockStart, 1, tb) != 1) goto error;
|
177
|
+
bytes2bases(seq, bytes, sz - 1, offset);
|
178
|
+
free(bytes);
|
179
|
+
|
180
|
+
//Null terminate the output
|
181
|
+
seq[sz - 1] = '\0';
|
182
|
+
|
183
|
+
//N-mask everything
|
184
|
+
NMask(seq, tb, tid, start, end);
|
185
|
+
|
186
|
+
//Soft-mask if requested
|
187
|
+
softMask(seq, tb, tid, start, end);
|
188
|
+
|
189
|
+
return seq;
|
190
|
+
|
191
|
+
error:
|
192
|
+
if(seq) free(seq);
|
193
|
+
if(bytes) free(bytes);
|
194
|
+
return NULL;
|
195
|
+
}
|
196
|
+
|
197
|
+
/*
|
198
|
+
Given a chromosome, name, and optional range, return the corresponding sequence.
|
199
|
+
|
200
|
+
The start and end or 0-based half-open, so end-start is the number of bases.
|
201
|
+
If both start and end are 0, then the whole chromosome is used.
|
202
|
+
|
203
|
+
On error (e.g., a missing chromosome), NULL is returned.
|
204
|
+
*/
|
205
|
+
char *twobitSequence(TwoBit *tb, char *chrom, uint32_t start, uint32_t end) {
|
206
|
+
uint32_t i, tid=0;
|
207
|
+
|
208
|
+
//Get the chromosome ID
|
209
|
+
for(i=0; i<tb->hdr->nChroms; i++) {
|
210
|
+
if(strcmp(tb->cl->chrom[i], chrom) == 0) {
|
211
|
+
tid = i;
|
212
|
+
break;
|
213
|
+
}
|
214
|
+
}
|
215
|
+
if(tid == 0 && strcmp(tb->cl->chrom[i], chrom) != 0) return NULL;
|
216
|
+
|
217
|
+
//Get the start/end if not specified
|
218
|
+
if(start == end && end == 0) {
|
219
|
+
end = tb->idx->size[tid];
|
220
|
+
}
|
221
|
+
|
222
|
+
//Sanity check the bounds
|
223
|
+
if(end > tb->idx->size[tid]) return NULL;
|
224
|
+
if(start >= end) return NULL;
|
225
|
+
|
226
|
+
return constructSequence(tb, tid, start, end);
|
227
|
+
}
|
228
|
+
|
229
|
+
/*
|
230
|
+
Given a tid and a position, set the various mask variables to an appropriate block of Ns.
|
231
|
+
|
232
|
+
* If maskIdx is not -1, these are set to the first overlapping block (or maskIdx is set to the number of N blocks).
|
233
|
+
* If maskIdx is not -1 then it's incremented and maskStart/maskEnd set appropriately.
|
234
|
+
|
235
|
+
If the returned interval doesn't overlap the start/end range, then both values will be -1.
|
236
|
+
*/
|
237
|
+
void getMask(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end, uint32_t *maskIdx, uint32_t *maskStart, uint32_t *maskEnd) {
|
238
|
+
if(*maskIdx == (uint32_t) -1) {
|
239
|
+
for((*maskIdx)=0; (*maskIdx)<tb->idx->nBlockCount[tid]; (*maskIdx)++) {
|
240
|
+
*maskStart = tb->idx->nBlockStart[tid][*maskIdx];
|
241
|
+
*maskEnd = (*maskStart) + tb->idx->nBlockSizes[tid][*maskIdx];
|
242
|
+
if(*maskEnd < start) continue;
|
243
|
+
if(*maskEnd >= start) break;
|
244
|
+
}
|
245
|
+
} else if(*maskIdx >= tb->idx->nBlockCount[tid]) {
|
246
|
+
*maskStart = (uint32_t) -1;
|
247
|
+
*maskEnd = (uint32_t) -1;
|
248
|
+
} else {
|
249
|
+
*maskIdx += 1;
|
250
|
+
if(*maskIdx >= tb->idx->nBlockCount[tid]) {
|
251
|
+
*maskStart = (uint32_t) -1;
|
252
|
+
*maskEnd = (uint32_t) -1;
|
253
|
+
} else {
|
254
|
+
*maskStart = tb->idx->nBlockStart[tid][*maskIdx];
|
255
|
+
*maskEnd = (*maskStart) + tb->idx->nBlockSizes[tid][*maskIdx];
|
256
|
+
}
|
257
|
+
}
|
258
|
+
|
259
|
+
//maskStart = maskEnd = -1 if no overlap
|
260
|
+
if(*maskIdx >= tb->idx->nBlockCount[tid] || *maskStart >= end) {
|
261
|
+
*maskStart = (uint32_t) -1;
|
262
|
+
*maskEnd = (uint32_t) -1;
|
263
|
+
}
|
264
|
+
}
|
265
|
+
|
266
|
+
/*
    Return a 4-bit mask selecting the bases of a packed byte from position
    offset onwards.

    Offsets 0, 1 and 2 yield 15, 7 and 3 respectively; every other offset
    yields 1.
*/
uint8_t getByteMaskFromOffset(int offset) {
    //Each step of offset drops one more leading bit from 0b1111
    if(offset >= 0 && offset <= 2) return (uint8_t) (15 >> offset);
    return 1;
}
|
277
|
+
|
278
|
+
/*
    Worker for twobitBases: count the A, C, T and G bases in the half-open
    range [start, end) of sequence tid, skipping positions inside N blocks.

    The packed data is processed one byte (4 bases) at a time starting from
    the byte containing start. A 4-bit mask (bit 8 = first base of the byte,
    bit 1 = last) selects which bases of the current byte are counted.

    Returns a malloc()ed array of 4 values in A, C, T, G order that the
    caller must free: uint32_t counts when fraction is 0, otherwise doubles
    holding each count divided by the number of requested bases (end - start).
    The caller must ensure start < end. NULL is returned if an allocation,
    the seek or the read fails.
*/
void *twobitBasesWorker(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end, int fraction) {
    void *out;
    //len is measured from the start of the first byte, so it includes the start % 4 padding bases
    uint32_t tmp[4] = {0, 0, 0, 0}, len = end - start + (start % 4), i = 0, j = 0;
    //seqLen is the number of bases actually requested; it is the denominator for fractions
    uint32_t seqLen = end - start;
    uint32_t blockStart, blockEnd, maskIdx = (uint32_t) -1, maskStart, maskEnd, foo;
    uint8_t *bytes = NULL, mask = 0, offset;
    int k;

    if(fraction) {
        out = malloc(4 * sizeof(double));
    } else {
        out = malloc(4 * sizeof(uint32_t));
    }
    if(!out) return NULL;

    //There are 4 bases/byte
    blockStart = start/4;
    offset = start % 4;
    blockEnd = end/4 + ((end % 4) ? 1 : 0);
    bytes = malloc(blockEnd - blockStart);
    if(!bytes) goto error;

    //Set the initial mask, reset start/offset so we always deal with full bytes
    mask = getByteMaskFromOffset(offset);
    start = 4 * blockStart;
    offset = 0;

    if(twobitSeek(tb, tb->idx->offset[tid] + blockStart) != 0) goto error;
    if(twobitRead(bytes, blockEnd - blockStart, 1, tb) != 1) goto error;

    //Get the index/start/end of the next N-mask block
    getMask(tb, tid, start, end, &maskIdx, &maskStart, &maskEnd);

    while(i < len) {
        // Check if we need to jump
        if(maskIdx != (uint32_t) -1 && start + i + 4 >= maskStart) {
            if(start + i >= maskStart || start + i + 4 - offset > maskStart) {
                //Jump iff the whole byte is inside an N block
                if(start + i >= maskStart && start + i + 4 - offset < maskEnd) {
                    //iff we're fully in an N block then jump
                    i = maskEnd - start;
                    getMask(tb, tid, i, end, &maskIdx, &maskStart, &maskEnd);
                    offset = (start + i) % 4;
                    j = i / 4;
                    mask = getByteMaskFromOffset(offset);
                    i = 4 * j; //Now that the mask has been set, reset i to byte offsets
                    offset = 0;
                    continue;
                }

                //Set the mask, if appropriate
                foo = 4*j + 4*blockStart; // The smallest position in the byte
                if(mask & 1 && (foo + 3 >= maskStart && foo + 3 < maskEnd)) mask -= 1;
                if(mask & 2 && (foo + 2 >= maskStart && foo + 2 < maskEnd)) mask -= 2;
                if(mask & 4 && (foo + 1 >= maskStart && foo + 1 < maskEnd)) mask -= 4;
                if(mask & 8 && (foo >= maskStart && foo < maskEnd)) mask -= 8;
                if(foo + 4 > maskEnd) {
                    getMask(tb, tid, i, end, &maskIdx, &maskStart, &maskEnd);
                    continue;
                }
            }
        }

        //Ensure that anything after the end is masked
        if(i+4>=len) {
            if((mask & 1) && i+3>=len) mask -=1;
            if((mask & 2) && i+2>=len) mask -=2;
            if((mask & 4) && i+1>=len) mask -=4;
            if((mask & 8) && i>=len) mask -=8;
        }

        //Count the selected bases, walking from the last base of the byte (offset 3) to the first (offset 0)
        foo = bytes[j++];
        for(k = 0; k < 4; k++) {
            if(mask & 1) {
                tmp[foo & 3]++;
            }
            foo >>= 2;
            mask >>= 1;
        }
        i += 4;
        mask = 15;
    }
    free(bytes);

    //tmp is in TCAG order, since that's how 2bit is stored.
    //The output is in ACTG order, as in the first release.
    if(fraction) {
        ((double*) out)[0] = ((double) tmp[2])/((double) seqLen);
        ((double*) out)[1] = ((double) tmp[1])/((double) seqLen);
        ((double*) out)[2] = ((double) tmp[0])/((double) seqLen);
        ((double*) out)[3] = ((double) tmp[3])/((double) seqLen);
    } else {
        ((uint32_t*) out)[0] = tmp[2];
        ((uint32_t*) out)[1] = tmp[1];
        ((uint32_t*) out)[2] = tmp[0];
        ((uint32_t*) out)[3] = tmp[3];
    }

    return out;

error:
    free(out);
    free(bytes);
    return NULL;
}
|
396
|
+
|
397
|
+
/*
    Given a chromosome name and optional range, return its base composition.

    start and end are 0-based half-open. If both are 0, the whole chromosome
    is used. The result comes from twobitBasesWorker: fractions when fraction
    is non-zero, counts otherwise.

    NULL is returned if the chromosome name is not in the file, if end is
    beyond the chromosome length, if start >= end, or if the worker fails.
*/
void *twobitBases(TwoBit *tb, char *chrom, uint32_t start, uint32_t end, int fraction) {
    uint32_t tid;

    //Get the chromosome ID
    for(tid = 0; tid < tb->hdr->nChroms; tid++) {
        if(strcmp(tb->cl->chrom[tid], chrom) == 0) break;
    }
    //Running off the end of the list means the name is unknown; do not index with it
    if(tid >= tb->hdr->nChroms) return NULL;

    //Get the start/end if not specified
    if(start == end && end == 0) {
        end = tb->idx->size[tid];
    }

    //Sanity check the bounds
    if(end > tb->idx->size[tid]) return NULL;
    if(start >= end) return NULL;

    return twobitBasesWorker(tb, tid, start, end, fraction);
}
|
421
|
+
|
422
|
+
/*
|
423
|
+
Given a chromosome, chrom, return it's length. 0 is used if the chromosome isn't present.
|
424
|
+
*/
|
425
|
+
uint32_t twobitChromLen(TwoBit *tb, char *chrom) {
|
426
|
+
uint32_t i;
|
427
|
+
for(i=0; i<tb->hdr->nChroms; i++) {
|
428
|
+
if(strcmp(tb->cl->chrom[i], chrom) == 0) return tb->idx->size[i];
|
429
|
+
}
|
430
|
+
return 0;
|
431
|
+
}
|
432
|
+
|
433
|
+
/*
|
434
|
+
Fill in tb->idx.
|
435
|
+
|
436
|
+
Note that the masked stuff will only be stored if storeMasked == 1, since it uses gobs of memory otherwise.
|
437
|
+
On error, tb->idx is left as NULL.
|
438
|
+
*/
|
439
|
+
void twobitIndexRead(TwoBit *tb, int storeMasked) {
|
440
|
+
uint32_t i, data[2];
|
441
|
+
TwoBitMaskedIdx *idx = calloc(1, sizeof(TwoBitMaskedIdx));
|
442
|
+
|
443
|
+
//Allocation and error checking
|
444
|
+
if(!idx) return;
|
445
|
+
idx->size = malloc(tb->hdr->nChroms * sizeof(uint32_t));
|
446
|
+
idx->nBlockCount = calloc(tb->hdr->nChroms, sizeof(uint32_t));
|
447
|
+
idx->nBlockStart = calloc(tb->hdr->nChroms, sizeof(uint32_t*));
|
448
|
+
idx->nBlockSizes = calloc(tb->hdr->nChroms, sizeof(uint32_t*));
|
449
|
+
if(!idx->size) goto error;
|
450
|
+
if(!idx->nBlockCount) goto error;
|
451
|
+
if(!idx->nBlockStart) goto error;
|
452
|
+
if(!idx->nBlockSizes) goto error;
|
453
|
+
idx->maskBlockCount = calloc(tb->hdr->nChroms, sizeof(uint32_t));
|
454
|
+
if(!idx->maskBlockCount) goto error;
|
455
|
+
if(storeMasked) {
|
456
|
+
idx->maskBlockStart = calloc(tb->hdr->nChroms, sizeof(uint32_t*));
|
457
|
+
idx->maskBlockSizes = calloc(tb->hdr->nChroms, sizeof(uint32_t*));
|
458
|
+
if(!idx->maskBlockStart) goto error;
|
459
|
+
if(!idx->maskBlockSizes) goto error;
|
460
|
+
}
|
461
|
+
idx->offset = malloc(tb->hdr->nChroms * sizeof(uint64_t));
|
462
|
+
if(!idx->offset) goto error;
|
463
|
+
|
464
|
+
//Read in each chromosome/contig
|
465
|
+
for(i=0; i<tb->hdr->nChroms; i++) {
|
466
|
+
if(twobitSeek(tb, tb->cl->offset[i]) != 0) goto error;
|
467
|
+
if(twobitRead(data, sizeof(uint32_t), 2, tb) != 2) goto error;
|
468
|
+
idx->size[i] = data[0];
|
469
|
+
idx->nBlockCount[i] = data[1];
|
470
|
+
|
471
|
+
//Allocate the nBlock starts/sizes and fill them in
|
472
|
+
idx->nBlockStart[i] = malloc(idx->nBlockCount[i] * sizeof(uint32_t));
|
473
|
+
idx->nBlockSizes[i] = malloc(idx->nBlockCount[i] * sizeof(uint32_t));
|
474
|
+
if(!idx->nBlockStart[i]) goto error;
|
475
|
+
if(!idx->nBlockSizes[i]) goto error;
|
476
|
+
if(twobitRead(idx->nBlockStart[i], sizeof(uint32_t), idx->nBlockCount[i], tb) != idx->nBlockCount[i]) goto error;
|
477
|
+
if(twobitRead(idx->nBlockSizes[i], sizeof(uint32_t), idx->nBlockCount[i], tb) != idx->nBlockCount[i]) goto error;
|
478
|
+
|
479
|
+
//Get the masked block information
|
480
|
+
if(twobitRead(idx->maskBlockCount + i, sizeof(uint32_t), 1, tb) != 1) goto error;
|
481
|
+
|
482
|
+
//Allocate the maskBlock starts/sizes and fill them in
|
483
|
+
if(storeMasked) {
|
484
|
+
idx->maskBlockStart[i] = malloc(idx->maskBlockCount[i] * sizeof(uint32_t));
|
485
|
+
idx->maskBlockSizes[i] = malloc(idx->maskBlockCount[i] * sizeof(uint32_t));
|
486
|
+
if(!idx->maskBlockStart[i]) goto error;
|
487
|
+
if(!idx->maskBlockSizes[i]) goto error;
|
488
|
+
if(twobitRead(idx->maskBlockStart[i], sizeof(uint32_t), idx->maskBlockCount[i], tb) != idx->maskBlockCount[i]) goto error;
|
489
|
+
if(twobitRead(idx->maskBlockSizes[i], sizeof(uint32_t), idx->maskBlockCount[i], tb) != idx->maskBlockCount[i]) goto error;
|
490
|
+
} else {
|
491
|
+
if(twobitSeek(tb, twobitTell(tb) + 8 * idx->maskBlockCount[i]) != 0) goto error;
|
492
|
+
}
|
493
|
+
|
494
|
+
//Reserved
|
495
|
+
if(twobitRead(data, sizeof(uint32_t), 1, tb) != 1) goto error;
|
496
|
+
|
497
|
+
idx->offset[i] = twobitTell(tb);
|
498
|
+
}
|
499
|
+
|
500
|
+
tb->idx = idx;
|
501
|
+
return;
|
502
|
+
|
503
|
+
error:
|
504
|
+
if(idx) {
|
505
|
+
if(idx->size) free(idx->size);
|
506
|
+
|
507
|
+
if(idx->nBlockCount) free(idx->nBlockCount);
|
508
|
+
if(idx->nBlockStart) {
|
509
|
+
for(i=0; i<tb->hdr->nChroms; i++) {
|
510
|
+
if(idx->nBlockStart[i]) free(idx->nBlockStart[i]);
|
511
|
+
}
|
512
|
+
free(idx->nBlockStart[i]);
|
513
|
+
}
|
514
|
+
if(idx->nBlockSizes) {
|
515
|
+
for(i=0; i<tb->hdr->nChroms; i++) {
|
516
|
+
if(idx->nBlockSizes[i]) free(idx->nBlockSizes[i]);
|
517
|
+
}
|
518
|
+
free(idx->nBlockSizes[i]);
|
519
|
+
}
|
520
|
+
|
521
|
+
if(idx->maskBlockCount) free(idx->maskBlockCount);
|
522
|
+
if(idx->maskBlockStart) {
|
523
|
+
for(i=0; i<tb->hdr->nChroms; i++) {
|
524
|
+
if(idx->maskBlockStart[i]) free(idx->maskBlockStart[i]);
|
525
|
+
}
|
526
|
+
free(idx->maskBlockStart[i]);
|
527
|
+
}
|
528
|
+
if(idx->maskBlockSizes) {
|
529
|
+
for(i=0; i<tb->hdr->nChroms; i++) {
|
530
|
+
if(idx->maskBlockSizes[i]) free(idx->maskBlockSizes[i]);
|
531
|
+
}
|
532
|
+
free(idx->maskBlockSizes[i]);
|
533
|
+
}
|
534
|
+
|
535
|
+
if(idx->offset) free(idx->offset);
|
536
|
+
|
537
|
+
free(idx);
|
538
|
+
}
|
539
|
+
}
|
540
|
+
|
541
|
+
/*
    Release everything owned by tb->idx, including the index itself.

    Does nothing when tb->idx is NULL. tb->hdr->nChroms is used to walk the
    per-chromosome tables, so the header must still be present. tb->idx is
    not reset to NULL afterwards.
*/
void twobitIndexDestroy(TwoBit *tb) {
    uint32_t c;

    if(!tb->idx) return;

    free(tb->idx->size);

    //N blocks
    free(tb->idx->nBlockCount);
    if(tb->idx->nBlockStart) {
        for(c = 0; c < tb->hdr->nChroms; c++) free(tb->idx->nBlockStart[c]);
        free(tb->idx->nBlockStart);
    }
    if(tb->idx->nBlockSizes) {
        for(c = 0; c < tb->hdr->nChroms; c++) free(tb->idx->nBlockSizes[c]);
        free(tb->idx->nBlockSizes);
    }

    //Soft-mask blocks
    free(tb->idx->maskBlockCount);
    if(tb->idx->maskBlockStart) {
        for(c = 0; c < tb->hdr->nChroms; c++) free(tb->idx->maskBlockStart[c]);
        free(tb->idx->maskBlockStart);
    }
    if(tb->idx->maskBlockSizes) {
        for(c = 0; c < tb->hdr->nChroms; c++) free(tb->idx->maskBlockSizes[c]);
        free(tb->idx->maskBlockSizes);
    }

    free(tb->idx->offset);

    free(tb->idx);
}
|
580
|
+
|
581
|
+
/*
    Read the chromosome list that follows the header and store it in tb->cl.

    For each of the tb->hdr->nChroms entries the file holds a one-byte name
    length, the name itself (not NUL-terminated in the file) and a 32-bit
    value that is kept in cl->offset. Names are stored as NUL-terminated
    strings.

    On error, everything allocated here is released and tb->cl is left
    untouched.
*/
void twobitChromListRead(TwoBit *tb) {
    uint32_t c;
    uint8_t nameLen;
    TwoBitCL *cl = calloc(1, sizeof(TwoBitCL));

    if(!cl) return;

    cl->chrom = calloc(tb->hdr->nChroms, sizeof(char*));
    cl->offset = malloc(sizeof(uint32_t) * tb->hdr->nChroms);
    if(!cl->chrom || !cl->offset) goto error;

    for(c = 0; c < tb->hdr->nChroms; c++) {
        //Get the string size (not null terminated!)
        if(twobitRead(&nameLen, 1, 1, tb) != 1) goto error;

        //Read the name into a zeroed buffer one byte longer, which supplies the terminator.
        //It is stored in the list right away so the error path releases it.
        cl->chrom[c] = calloc(1 + nameLen, sizeof(char));
        if(!cl->chrom[c]) goto error;
        if(twobitRead(cl->chrom[c], 1, nameLen, tb) != nameLen) goto error;

        //Read in the 32-bit value that follows the name
        if(twobitRead(&cl->offset[c], sizeof(uint32_t), 1, tb) != 1) goto error;
    }

    tb->cl = cl;
    return;

error:
    free(cl->offset);
    if(cl->chrom) {
        //Entries that were never reached are still NULL from calloc
        for(c = 0; c < tb->hdr->nChroms; c++) free(cl->chrom[c]);
        free(cl->chrom);
    }
    free(cl);
}
|
625
|
+
|
626
|
+
/*
    Release the chromosome list held in tb->cl: the offsets, every name, the
    name table and the list itself.

    Does nothing when tb->cl is NULL. tb->hdr->nChroms is used to walk the
    name table, so the header must still be present. tb->cl is not reset to
    NULL afterwards.
*/
void twobitChromListDestroy(TwoBit *tb) {
    uint32_t c;

    if(!tb->cl) return;

    free(tb->cl->offset);
    if(tb->cl->chrom) {
        for(c = 0; c < tb->hdr->nChroms; c++) free(tb->cl->chrom[c]);
        free(tb->cl->chrom);
    }
    free(tb->cl);
}
|
640
|
+
|
641
|
+
/*
    Read and validate the fixed 16-byte header, storing it in tb->hdr.

    The header is four 32-bit words: magic number, version, sequence count
    and a fourth word that is read but not used here. The header is rejected,
    with a message on stderr, if the magic is not 0x1A412743, the version is
    not 0 or the sequence count is 0.

    On any failure the header is freed and tb->hdr is left untouched.
*/
void twobitHdrRead(TwoBit *tb) {
    uint32_t words[4];
    TwoBitHeader *hdr = calloc(1, sizeof(TwoBitHeader));

    if(!hdr) return;

    //Read the first 16 bytes
    if(twobitRead(words, 4, 4, tb) != 4) goto error;

    hdr->magic = words[0];
    hdr->version = words[1];
    hdr->nChroms = words[2];

    //Magic
    if(hdr->magic != 0x1A412743) {
        fprintf(stderr, "[twobitHdrRead] Received an invalid file magic number (0x%"PRIx32")!\n", hdr->magic);
        goto error;
    }

    //Version
    if(hdr->version != 0) {
        fprintf(stderr, "[twobitHdrRead] The file version is %"PRIu32" while only version 0 is defined!\n", hdr->version);
        goto error;
    }

    //Sequence Count
    if(hdr->nChroms == 0) {
        fprintf(stderr, "[twobitHdrRead] There are apparently no chromosomes/contigs in this file!\n");
        goto error;
    }

    tb->hdr = hdr;
    return;

error:
    free(hdr);
}
|
677
|
+
|
678
|
+
/*
    Release the header held in tb->hdr. A NULL header is fine, since free()
    ignores NULL. tb->hdr is not reset to NULL afterwards.
*/
void twobitHdrDestroy(TwoBit *tb) {
    free(tb->hdr);
}
|
681
|
+
|
682
|
+
/*
    Close a 2bit file and release everything attached to tb, including tb
    itself. Passing NULL is allowed and does nothing.
*/
void twobitClose(TwoBit *tb) {
    if(!tb) return;

    if(tb->fp) fclose(tb->fp);
    if(tb->data) munmap(tb->data, tb->sz);

    twobitChromListDestroy(tb);
    twobitIndexDestroy(tb);
    //N.B., this needs to be called last: the two calls above read tb->hdr->nChroms
    twobitHdrDestroy(tb);

    free(tb);
}
|
693
|
+
|
694
|
+
TwoBit* twobitOpen(char *fname, int storeMasked) {
|
695
|
+
int fd;
|
696
|
+
struct stat fs;
|
697
|
+
TwoBit *tb = calloc(1, sizeof(TwoBit));
|
698
|
+
if(!tb) return NULL;
|
699
|
+
|
700
|
+
tb->fp = fopen(fname, "rb");
|
701
|
+
if(!tb->fp) goto error;
|
702
|
+
|
703
|
+
//Try to memory map the whole thing, since these aren't terribly large
|
704
|
+
//Since we might be multithreading this in python, use shared memory
|
705
|
+
fd = fileno(tb->fp);
|
706
|
+
if(fstat(fd, &fs) == 0) {
|
707
|
+
tb->sz = (uint64_t) fs.st_size;
|
708
|
+
tb->data = mmap(NULL, fs.st_size, PROT_READ, MAP_SHARED, fd, 0);
|
709
|
+
if(tb->data) {
|
710
|
+
if(madvise(tb->data, fs.st_size, MADV_RANDOM) != 0) {
|
711
|
+
munmap(tb->data, fs.st_size);
|
712
|
+
tb->data = NULL;
|
713
|
+
}
|
714
|
+
}
|
715
|
+
}
|
716
|
+
|
717
|
+
//Attempt to read in the fixed header
|
718
|
+
twobitHdrRead(tb);
|
719
|
+
if(!tb->hdr) goto error;
|
720
|
+
|
721
|
+
//Read in the chromosome list
|
722
|
+
twobitChromListRead(tb);
|
723
|
+
if(!tb->cl) goto error;
|
724
|
+
|
725
|
+
//Read in the mask index
|
726
|
+
twobitIndexRead(tb, storeMasked);
|
727
|
+
if(!tb->idx) goto error;
|
728
|
+
|
729
|
+
return tb;
|
730
|
+
|
731
|
+
error:
|
732
|
+
twobitClose(tb);
|
733
|
+
return NULL;
|
734
|
+
}
|