bio-bigwig 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/bio/bigwig/libBigWig/LICENSE +22 -0
- data/ext/bio/bigwig/libBigWig/bigWig.h +606 -0
- data/ext/bio/bigwig/libBigWig/bigWigIO.h +110 -0
- data/ext/bio/bigwig/libBigWig/bwCommon.h +74 -0
- data/ext/bio/bigwig/libBigWig/bwRead.c +438 -0
- data/ext/bio/bigwig/libBigWig/bwStats.c +537 -0
- data/ext/bio/bigwig/libBigWig/bwValues.c +803 -0
- data/ext/bio/bigwig/libBigWig/bwValues.h +77 -0
- data/ext/bio/bigwig/libBigWig/bwWrite.c +1333 -0
- data/ext/bio/bigwig/libBigWig/io.c +296 -0
- data/lib/bio/bigwig/version.rb +1 -1
- metadata +11 -1
@@ -0,0 +1,803 @@
|
|
1
|
+
#include "bigWig.h"
|
2
|
+
#include "bwCommon.h"
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include <math.h>
|
5
|
+
#include <string.h>
|
6
|
+
#include <zlib.h>
|
7
|
+
#include <errno.h>
|
8
|
+
|
9
|
+
//Round v up to the nearest power of two; a power of two is returned unchanged.
//An input of 0, or anything above 2^31, wraps around and yields 0.
static uint32_t roundup(uint32_t v) {
    uint32_t shift;

    v -= 1;
    //Smear the highest set bit into every lower bit position
    for(shift = 1; shift < 32; shift <<= 1) {
        v |= v >> shift;
    }
    v += 1;
    return v;
}
|
19
|
+
|
20
|
+
//Returns the root node on success and NULL on error
|
21
|
+
static bwRTree_t *readRTreeIdx(bigWigFile_t *fp, uint64_t offset) {
|
22
|
+
uint32_t magic;
|
23
|
+
bwRTree_t *node;
|
24
|
+
|
25
|
+
if(!offset) {
|
26
|
+
if(bwSetPos(fp, fp->hdr->indexOffset)) return NULL;
|
27
|
+
} else {
|
28
|
+
if(bwSetPos(fp, offset)) return NULL;
|
29
|
+
}
|
30
|
+
|
31
|
+
if(bwRead(&magic, sizeof(uint32_t), 1, fp) != 1) return NULL;
|
32
|
+
if(magic != IDX_MAGIC) {
|
33
|
+
fprintf(stderr, "[readRTreeIdx] Mismatch in the magic number!\n");
|
34
|
+
return NULL;
|
35
|
+
}
|
36
|
+
|
37
|
+
node = calloc(1, sizeof(bwRTree_t));
|
38
|
+
if(!node) return NULL;
|
39
|
+
|
40
|
+
if(bwRead(&(node->blockSize), sizeof(uint32_t), 1, fp) != 1) goto error;
|
41
|
+
if(bwRead(&(node->nItems), sizeof(uint64_t), 1, fp) != 1) goto error;
|
42
|
+
if(bwRead(&(node->chrIdxStart), sizeof(uint32_t), 1, fp) != 1) goto error;
|
43
|
+
if(bwRead(&(node->baseStart), sizeof(uint32_t), 1, fp) != 1) goto error;
|
44
|
+
if(bwRead(&(node->chrIdxEnd), sizeof(uint32_t), 1, fp) != 1) goto error;
|
45
|
+
if(bwRead(&(node->baseEnd), sizeof(uint32_t), 1, fp) != 1) goto error;
|
46
|
+
if(bwRead(&(node->idxSize), sizeof(uint64_t), 1, fp) != 1) goto error;
|
47
|
+
if(bwRead(&(node->nItemsPerSlot), sizeof(uint32_t), 1, fp) != 1) goto error;
|
48
|
+
//Padding
|
49
|
+
if(bwRead(&(node->blockSize), sizeof(uint32_t), 1, fp) != 1) goto error;
|
50
|
+
node->rootOffset = bwTell(fp);
|
51
|
+
|
52
|
+
//For remote files, libCurl sometimes sets errno to 115 and doesn't clear it
|
53
|
+
errno = 0;
|
54
|
+
|
55
|
+
return node;
|
56
|
+
|
57
|
+
error:
|
58
|
+
free(node);
|
59
|
+
return NULL;
|
60
|
+
}
|
61
|
+
|
62
|
+
//Returns a bwRTreeNode_t on success and NULL on an error
|
63
|
+
//For the root node, set offset to 0
|
64
|
+
static bwRTreeNode_t *bwGetRTreeNode(bigWigFile_t *fp, uint64_t offset) {
|
65
|
+
bwRTreeNode_t *node = NULL;
|
66
|
+
uint8_t padding;
|
67
|
+
uint16_t i;
|
68
|
+
if(offset) {
|
69
|
+
if(bwSetPos(fp, offset)) return NULL;
|
70
|
+
} else {
|
71
|
+
//seek
|
72
|
+
if(bwSetPos(fp, fp->idx->rootOffset)) return NULL;
|
73
|
+
}
|
74
|
+
|
75
|
+
node = calloc(1, sizeof(bwRTreeNode_t));
|
76
|
+
if(!node) return NULL;
|
77
|
+
|
78
|
+
if(bwRead(&(node->isLeaf), sizeof(uint8_t), 1, fp) != 1) goto error;
|
79
|
+
if(bwRead(&padding, sizeof(uint8_t), 1, fp) != 1) goto error;
|
80
|
+
if(bwRead(&(node->nChildren), sizeof(uint16_t), 1, fp) != 1) goto error;
|
81
|
+
|
82
|
+
node->chrIdxStart = malloc(sizeof(uint32_t)*(node->nChildren));
|
83
|
+
if(!node->chrIdxStart) goto error;
|
84
|
+
node->baseStart = malloc(sizeof(uint32_t)*(node->nChildren));
|
85
|
+
if(!node->baseStart) goto error;
|
86
|
+
node->chrIdxEnd = malloc(sizeof(uint32_t)*(node->nChildren));
|
87
|
+
if(!node->chrIdxEnd) goto error;
|
88
|
+
node->baseEnd = malloc(sizeof(uint32_t)*(node->nChildren));
|
89
|
+
if(!node->baseEnd) goto error;
|
90
|
+
node->dataOffset = malloc(sizeof(uint64_t)*(node->nChildren));
|
91
|
+
if(!node->dataOffset) goto error;
|
92
|
+
if(node->isLeaf) {
|
93
|
+
node->x.size = malloc(node->nChildren * sizeof(uint64_t));
|
94
|
+
if(!node->x.size) goto error;
|
95
|
+
} else {
|
96
|
+
node->x.child = calloc(node->nChildren, sizeof(struct bwRTreeNode_t *));
|
97
|
+
if(!node->x.child) goto error;
|
98
|
+
}
|
99
|
+
for(i=0; i<node->nChildren; i++) {
|
100
|
+
if(bwRead(&(node->chrIdxStart[i]), sizeof(uint32_t), 1, fp) != 1) goto error;
|
101
|
+
if(bwRead(&(node->baseStart[i]), sizeof(uint32_t), 1, fp) != 1) goto error;
|
102
|
+
if(bwRead(&(node->chrIdxEnd[i]), sizeof(uint32_t), 1, fp) != 1) goto error;
|
103
|
+
if(bwRead(&(node->baseEnd[i]), sizeof(uint32_t), 1, fp) != 1) goto error;
|
104
|
+
if(bwRead(&(node->dataOffset[i]), sizeof(uint64_t), 1, fp) != 1) goto error;
|
105
|
+
if(node->isLeaf) {
|
106
|
+
if(bwRead(&(node->x.size[i]), sizeof(uint64_t), 1, fp) != 1) goto error;
|
107
|
+
}
|
108
|
+
}
|
109
|
+
|
110
|
+
return node;
|
111
|
+
|
112
|
+
error:
|
113
|
+
if(node->chrIdxStart) free(node->chrIdxStart);
|
114
|
+
if(node->baseStart) free(node->baseStart);
|
115
|
+
if(node->chrIdxEnd) free(node->chrIdxEnd);
|
116
|
+
if(node->baseEnd) free(node->baseEnd);
|
117
|
+
if(node->dataOffset) free(node->dataOffset);
|
118
|
+
if(node->isLeaf && node->x.size) free(node->x.size);
|
119
|
+
else if((!node->isLeaf) && node->x.child) free(node->x.child);
|
120
|
+
free(node);
|
121
|
+
return NULL;
|
122
|
+
}
|
123
|
+
|
124
|
+
//Frees an overlap block list along with its offset and size arrays.
//Passing NULL is allowed and does nothing.
void destroyBWOverlapBlock(bwOverlapBlock_t *b) {
    if(b) {
        free(b->offset);
        free(b->size);
        free(b);
    }
}
|
130
|
+
|
131
|
+
//Returns a bwOverlapBlock_t * object or NULL on error.
|
132
|
+
static bwOverlapBlock_t *overlapsLeaf(bwRTreeNode_t *node, uint32_t tid, uint32_t start, uint32_t end) {
|
133
|
+
uint16_t i, idx = 0;
|
134
|
+
bwOverlapBlock_t *o = calloc(1, sizeof(bwOverlapBlock_t));
|
135
|
+
if(!o) return NULL;
|
136
|
+
|
137
|
+
for(i=0; i<node->nChildren; i++) {
|
138
|
+
if(tid < node->chrIdxStart[i]) break;
|
139
|
+
if(tid > node->chrIdxEnd[i]) continue;
|
140
|
+
|
141
|
+
/*
|
142
|
+
The individual blocks can theoretically span multiple contigs.
|
143
|
+
So if we treat the first/last contig in the range as special
|
144
|
+
but anything in the middle is a guaranteed match
|
145
|
+
*/
|
146
|
+
if(node->chrIdxStart[i] != node->chrIdxEnd[i]) {
|
147
|
+
if(tid == node->chrIdxStart[i]) {
|
148
|
+
if(node->baseStart[i] >= end) break;
|
149
|
+
} else if(tid == node->chrIdxEnd[i]) {
|
150
|
+
if(node->baseEnd[i] <= start) continue;
|
151
|
+
}
|
152
|
+
} else {
|
153
|
+
if(node->baseStart[i] >= end || node->baseEnd[i] <= start) continue;
|
154
|
+
}
|
155
|
+
o->n++;
|
156
|
+
}
|
157
|
+
|
158
|
+
if(o->n) {
|
159
|
+
o->offset = malloc(sizeof(uint64_t) * (o->n));
|
160
|
+
if(!o->offset) goto error;
|
161
|
+
o->size = malloc(sizeof(uint64_t) * (o->n));
|
162
|
+
if(!o->size) goto error;
|
163
|
+
|
164
|
+
for(i=0; i<node->nChildren; i++) {
|
165
|
+
if(tid < node->chrIdxStart[i]) break;
|
166
|
+
if(tid < node->chrIdxStart[i] || tid > node->chrIdxEnd[i]) continue;
|
167
|
+
if(node->chrIdxStart[i] != node->chrIdxEnd[i]) {
|
168
|
+
if(tid == node->chrIdxStart[i]) {
|
169
|
+
if(node->baseStart[i] >= end) continue;
|
170
|
+
} else if(tid == node->chrIdxEnd[i]) {
|
171
|
+
if(node->baseEnd[i] <= start) continue;
|
172
|
+
}
|
173
|
+
} else {
|
174
|
+
if(node->baseStart[i] >= end || node->baseEnd[i] <= start) continue;
|
175
|
+
}
|
176
|
+
o->offset[idx] = node->dataOffset[i];
|
177
|
+
o->size[idx++] = node->x.size[i];
|
178
|
+
if(idx >= o->n) break;
|
179
|
+
}
|
180
|
+
}
|
181
|
+
|
182
|
+
if(idx != o->n) { //This should never happen
|
183
|
+
fprintf(stderr, "[overlapsLeaf] Mismatch between number of overlaps calculated and found!\n");
|
184
|
+
goto error;
|
185
|
+
}
|
186
|
+
|
187
|
+
return o;
|
188
|
+
|
189
|
+
error:
|
190
|
+
if(o) destroyBWOverlapBlock(o);
|
191
|
+
return NULL;
|
192
|
+
}
|
193
|
+
|
194
|
+
//This will free l2 unless there's an error!
|
195
|
+
//Returns NULL on error, otherwise the merged lists
|
196
|
+
static bwOverlapBlock_t *mergeOverlapBlocks(bwOverlapBlock_t *b1, bwOverlapBlock_t *b2) {
|
197
|
+
uint64_t i,j;
|
198
|
+
if(!b2) return b1;
|
199
|
+
if(!b2->n) {
|
200
|
+
destroyBWOverlapBlock(b2);
|
201
|
+
return b1;
|
202
|
+
}
|
203
|
+
if(!b1->n) {
|
204
|
+
destroyBWOverlapBlock(b1);
|
205
|
+
return b2;
|
206
|
+
}
|
207
|
+
j = b1->n;
|
208
|
+
b1->n += b2->n;
|
209
|
+
b1->offset = realloc(b1->offset, sizeof(uint64_t) * (b1->n+b2->n));
|
210
|
+
if(!b1->offset) goto error;
|
211
|
+
b1->size = realloc(b1->size, sizeof(uint64_t) * (b1->n+b2->n));
|
212
|
+
if(!b1->size) goto error;
|
213
|
+
|
214
|
+
for(i=0; i<b2->n; i++) {
|
215
|
+
b1->offset[j+i] = b2->offset[i];
|
216
|
+
b1->size[j+i] = b2->size[i];
|
217
|
+
}
|
218
|
+
destroyBWOverlapBlock(b2);
|
219
|
+
return b1;
|
220
|
+
|
221
|
+
error:
|
222
|
+
destroyBWOverlapBlock(b1);
|
223
|
+
return NULL;
|
224
|
+
}
|
225
|
+
|
226
|
+
//Returns NULL and sets nOverlaps to >0 on error, otherwise nOverlaps is the number of file offsets returned
|
227
|
+
//The output needs to be free()d if not NULL (likewise with *sizes)
|
228
|
+
static bwOverlapBlock_t *overlapsNonLeaf(bigWigFile_t *fp, bwRTreeNode_t *node, uint32_t tid, uint32_t start, uint32_t end) {
|
229
|
+
uint16_t i;
|
230
|
+
bwOverlapBlock_t *nodeBlocks, *output = calloc(1, sizeof(bwOverlapBlock_t));
|
231
|
+
if(!output) return NULL;
|
232
|
+
|
233
|
+
for(i=0; i<node->nChildren; i++) {
|
234
|
+
if(tid < node->chrIdxStart[i]) break;
|
235
|
+
if(tid < node->chrIdxStart[i] || tid > node->chrIdxEnd[i]) continue;
|
236
|
+
if(node->chrIdxStart[i] != node->chrIdxEnd[i]) { //child spans contigs
|
237
|
+
if(tid == node->chrIdxStart[i]) {
|
238
|
+
if(node->baseStart[i] >= end) continue;
|
239
|
+
} else if(tid == node->chrIdxEnd[i]) {
|
240
|
+
if(node->baseEnd[i] <= start) continue;
|
241
|
+
}
|
242
|
+
} else {
|
243
|
+
if(end <= node->baseStart[i] || start >= node->baseEnd[i]) continue;
|
244
|
+
}
|
245
|
+
|
246
|
+
//We have an overlap!
|
247
|
+
if(!node->x.child[i])
|
248
|
+
node->x.child[i] = bwGetRTreeNode(fp, node->dataOffset[i]);
|
249
|
+
if(!node->x.child[i]) goto error;
|
250
|
+
|
251
|
+
if(node->x.child[i]->isLeaf) { //leaf
|
252
|
+
nodeBlocks = overlapsLeaf(node->x.child[i], tid, start, end);
|
253
|
+
} else { //non-leaf
|
254
|
+
nodeBlocks = overlapsNonLeaf(fp, node->x.child[i], tid, start, end);
|
255
|
+
}
|
256
|
+
|
257
|
+
//The output is processed the same regardless of leaf/non-leaf
|
258
|
+
if(!nodeBlocks) goto error;
|
259
|
+
else {
|
260
|
+
output = mergeOverlapBlocks(output, nodeBlocks);
|
261
|
+
if(!output) {
|
262
|
+
destroyBWOverlapBlock(nodeBlocks);
|
263
|
+
goto error;
|
264
|
+
}
|
265
|
+
}
|
266
|
+
}
|
267
|
+
|
268
|
+
return output;
|
269
|
+
|
270
|
+
error:
|
271
|
+
destroyBWOverlapBlock(output);
|
272
|
+
return NULL;
|
273
|
+
}
|
274
|
+
|
275
|
+
//Returns NULL and sets nOverlaps to >0 on error, otherwise nOverlaps is the number of file offsets returned
|
276
|
+
//The output must be free()d
|
277
|
+
bwOverlapBlock_t *walkRTreeNodes(bigWigFile_t *bw, bwRTreeNode_t *root, uint32_t tid, uint32_t start, uint32_t end) {
|
278
|
+
if(root->isLeaf) return overlapsLeaf(root, tid, start, end);
|
279
|
+
return overlapsNonLeaf(bw, root, tid, start, end);
|
280
|
+
}
|
281
|
+
|
282
|
+
//In reality, a hash or some sort of tree structure is probably faster...
|
283
|
+
//Return -1 (AKA 0xFFFFFFFF...) on "not there", so we can hold (2^32)-1 items.
|
284
|
+
uint32_t bwGetTid(const bigWigFile_t *fp, const char *chrom) {
|
285
|
+
uint32_t i;
|
286
|
+
if(!chrom) return -1;
|
287
|
+
for(i=0; i<fp->cl->nKeys; i++) {
|
288
|
+
if(strcmp(chrom, fp->cl->chrom[i]) == 0) return i;
|
289
|
+
}
|
290
|
+
return -1;
|
291
|
+
}
|
292
|
+
|
293
|
+
//Looks up the data blocks overlapping chrom:[start, end).
//The index header and root node are loaded on demand and cached in fp->idx.
//Returns NULL if the contig is unknown (a message is printed to stderr), if
//the index cannot be read, or if walking the tree fails.
static bwOverlapBlock_t *bwGetOverlappingBlocks(bigWigFile_t *fp, const char *chrom, uint32_t start, uint32_t end) {
    uint32_t tid = bwGetTid(fp, chrom);

    if(tid == (uint32_t) -1) {
        fprintf(stderr, "[bwGetOverlappingBlocks] Non-existent contig: %s\n", chrom);
        return NULL;
    }

    //Get the index header if needed
    if(!fp->idx) {
        fp->idx = readRTreeIdx(fp, fp->hdr->indexOffset);
    }
    if(!fp->idx) return NULL;

    //Likewise for the root node
    if(!fp->idx->root) {
        fp->idx->root = bwGetRTreeNode(fp, 0);
    }
    if(!fp->idx->root) return NULL;

    return walkRTreeNodes(fp, fp->idx->root, tid, start, end);
}
|
314
|
+
|
315
|
+
//Decodes the header at the start of a data block into hdr.
//b must point to at least 24 readable bytes: five 32-bit fields (tid, start,
//end, step, span), the type in byte 20 and the 16-bit item count in bytes
//22-23.
void bwFillDataHdr(bwDataHeader_t *hdr, void *b) {
    uint32_t *words = b;
    uint16_t *halves = b;
    uint8_t *bytes = b;

    hdr->tid = words[0];
    hdr->start = words[1];
    hdr->end = words[2];
    hdr->step = words[3];
    hdr->span = words[4];
    hdr->type = bytes[20];
    hdr->nItems = halves[11];
}
|
324
|
+
|
325
|
+
//Frees an interval list along with its start, end and value arrays.
//Passing NULL is allowed and does nothing.
void bwDestroyOverlappingIntervals(bwOverlappingIntervals_t *o) {
    if(o) {
        free(o->value);
        free(o->end);
        free(o->start);
        free(o);
    }
}
|
332
|
+
|
333
|
+
//Frees a bigBed entry list: the first o->l strings (if a string array is
//present), then the arrays, then the struct itself.
//Passing NULL is allowed and does nothing.
void bbDestroyOverlappingEntries(bbOverlappingEntries_t *o) {
    uint32_t k;

    if(!o) return;
    if(o->str) {
        for(k=0; k<o->l; k++) {
            free(o->str[k]);
        }
        free(o->str);
    }
    free(o->end);
    free(o->start);
    free(o);
}
|
346
|
+
|
347
|
+
//Returns NULL on error, in which case o has been free()d
|
348
|
+
static bwOverlappingIntervals_t *pushIntervals(bwOverlappingIntervals_t *o, uint32_t start, uint32_t end, float value) {
|
349
|
+
if(o->l+1 >= o->m) {
|
350
|
+
o->m = roundup(o->l+1);
|
351
|
+
o->start = realloc(o->start, o->m * sizeof(uint32_t));
|
352
|
+
if(!o->start) goto error;
|
353
|
+
o->end = realloc(o->end, o->m * sizeof(uint32_t));
|
354
|
+
if(!o->end) goto error;
|
355
|
+
o->value = realloc(o->value, o->m * sizeof(float));
|
356
|
+
if(!o->value) goto error;
|
357
|
+
}
|
358
|
+
o->start[o->l] = start;
|
359
|
+
o->end[o->l] = end;
|
360
|
+
o->value[o->l++] = value;
|
361
|
+
return o;
|
362
|
+
|
363
|
+
error:
|
364
|
+
bwDestroyOverlappingIntervals(o);
|
365
|
+
return NULL;
|
366
|
+
}
|
367
|
+
|
368
|
+
//Appends one bigBed entry to o, growing the arrays to the next power of two
//when needed. If withString is non-zero a copy of str made with bwStrdup() is
//stored as well; otherwise the string array is left untouched.
//Returns o on success. Returns NULL on allocation failure, in which case o
//(including everything it owned) has been free()d and must not be used again.
static bbOverlappingEntries_t *pushBBIntervals(bbOverlappingEntries_t *o, uint32_t start, uint32_t end, char *str, int withString) {
    uint32_t *grownPos;
    char **grownStr;

    if(o->l+1 >= o->m) {
        o->m = roundup(o->l+1);
        //realloc() into a temporary: on failure the old array is still owned
        //by o and gets released by the error path instead of leaking
        grownPos = realloc(o->start, o->m * sizeof(uint32_t));
        if(!grownPos) goto error;
        o->start = grownPos;
        grownPos = realloc(o->end, o->m * sizeof(uint32_t));
        if(!grownPos) goto error;
        o->end = grownPos;
        if(withString) {
            grownStr = realloc(o->str, o->m * sizeof(char*));
            if(!grownStr) goto error;
            o->str = grownStr;
        }
    }
    o->start[o->l] = start;
    o->end[o->l] = end;
    if(withString) o->str[o->l] = bwStrdup(str);
    o->l++;
    return o;

error:
    bbDestroyOverlappingEntries(o);
    return NULL;
}
|
390
|
+
|
391
|
+
//Returns NULL on error
|
392
|
+
bwOverlappingIntervals_t *bwGetOverlappingIntervalsCore(bigWigFile_t *fp, bwOverlapBlock_t *o, uint32_t tid, uint32_t ostart, uint32_t oend) {
|
393
|
+
uint64_t i;
|
394
|
+
uint16_t j;
|
395
|
+
int compressed = 0, rv;
|
396
|
+
uLongf sz = fp->hdr->bufSize, tmp;
|
397
|
+
void *buf = NULL, *compBuf = NULL;
|
398
|
+
uint32_t start = 0, end , *p;
|
399
|
+
float value;
|
400
|
+
bwDataHeader_t hdr;
|
401
|
+
bwOverlappingIntervals_t *output = calloc(1, sizeof(bwOverlappingIntervals_t));
|
402
|
+
|
403
|
+
if(!output) goto error;
|
404
|
+
|
405
|
+
if(!o) return output;
|
406
|
+
if(!o->n) return output;
|
407
|
+
|
408
|
+
if(sz) {
|
409
|
+
compressed = 1;
|
410
|
+
buf = malloc(sz);
|
411
|
+
}
|
412
|
+
sz = 0; //This is now the size of the compressed buffer
|
413
|
+
|
414
|
+
for(i=0; i<o->n; i++) {
|
415
|
+
if(bwSetPos(fp, o->offset[i])) goto error;
|
416
|
+
|
417
|
+
if(sz < o->size[i]) {
|
418
|
+
compBuf = realloc(compBuf, o->size[i]);
|
419
|
+
sz = o->size[i];
|
420
|
+
}
|
421
|
+
if(!compBuf) goto error;
|
422
|
+
|
423
|
+
if(bwRead(compBuf, o->size[i], 1, fp) != 1) goto error;
|
424
|
+
if(compressed) {
|
425
|
+
tmp = fp->hdr->bufSize; //This gets over-written by uncompress
|
426
|
+
rv = uncompress(buf, (uLongf *) &tmp, compBuf, o->size[i]);
|
427
|
+
if(rv != Z_OK) goto error;
|
428
|
+
} else {
|
429
|
+
buf = compBuf;
|
430
|
+
}
|
431
|
+
|
432
|
+
//TODO: ensure that tmp is large enough!
|
433
|
+
bwFillDataHdr(&hdr, buf);
|
434
|
+
|
435
|
+
p = ((uint32_t*) buf);
|
436
|
+
p += 6;
|
437
|
+
if(hdr.tid != tid) continue;
|
438
|
+
|
439
|
+
if(hdr.type == 3) start = hdr.start - hdr.step;
|
440
|
+
|
441
|
+
//FIXME: We should ensure that sz is large enough to hold nItems of the given type
|
442
|
+
for(j=0; j<hdr.nItems; j++) {
|
443
|
+
switch(hdr.type) {
|
444
|
+
case 1:
|
445
|
+
start = *p;
|
446
|
+
p++;
|
447
|
+
end = *p;
|
448
|
+
p++;
|
449
|
+
value = *((float *)p);
|
450
|
+
p++;
|
451
|
+
break;
|
452
|
+
case 2:
|
453
|
+
start = *p;
|
454
|
+
p++;
|
455
|
+
end = start + hdr.span;
|
456
|
+
value = *((float *)p);
|
457
|
+
p++;
|
458
|
+
break;
|
459
|
+
case 3:
|
460
|
+
start += hdr.step;
|
461
|
+
end = start+hdr.span;
|
462
|
+
value = *((float *)p);
|
463
|
+
p++;
|
464
|
+
break;
|
465
|
+
default :
|
466
|
+
goto error;
|
467
|
+
break;
|
468
|
+
}
|
469
|
+
|
470
|
+
if(end <= ostart || start >= oend) continue;
|
471
|
+
//Push the overlap
|
472
|
+
if(!pushIntervals(output, start, end, value)) goto error;
|
473
|
+
}
|
474
|
+
}
|
475
|
+
|
476
|
+
if(compressed && buf) free(buf);
|
477
|
+
if(compBuf) free(compBuf);
|
478
|
+
return output;
|
479
|
+
|
480
|
+
error:
|
481
|
+
fprintf(stderr, "[bwGetOverlappingIntervalsCore] Got an error\n");
|
482
|
+
if(output) bwDestroyOverlappingIntervals(output);
|
483
|
+
if(compressed && buf) free(buf);
|
484
|
+
if(compBuf) free(compBuf);
|
485
|
+
return NULL;
|
486
|
+
}
|
487
|
+
|
488
|
+
//Reads the data blocks listed in o and returns every bigBed entry on contig
//tid that overlaps [ostart, oend). If withString is non-zero the text that
//follows each entry's coordinates is copied into the result too.
//A NULL or empty o yields an empty (non-NULL) result.
//Each block is inflated with zlib when fp->hdr->bufSize is non-zero and is
//parsed as a sequence of records: three 32-bit values (tid, start, end)
//followed by a NUL-terminated string.
//Returns NULL on error (I/O, decompression, allocation or a truncated record).
bbOverlappingEntries_t *bbGetOverlappingEntriesCore(bigWigFile_t *fp, bwOverlapBlock_t *o, uint32_t tid, uint32_t ostart, uint32_t oend, int withString) {
    const size_t fixedSize = 3 * sizeof(uint32_t); //tid, start, end
    uint64_t i;
    int compressed = 0, rv;
    uLongf sz = fp->hdr->bufSize, tmp = 0;
    void *buf = NULL, *compBuf = NULL, *grown;
    char *cur, *bufEnd, *str, *nul;
    uint32_t entryTid = 0, start = 0, end = 0;
    bbOverlappingEntries_t *output = calloc(1, sizeof(bbOverlappingEntries_t));

    if(!output) goto error;

    if(!o) return output;
    if(!o->n) return output;

    if(sz) {
        compressed = 1;
        buf = malloc(sz);
        if(!buf) goto error;
    }
    sz = 0; //This is now the size of the compressed buffer

    for(i=0; i<o->n; i++) {
        if(bwSetPos(fp, o->offset[i])) goto error;

        if(sz < o->size[i]) {
            //Keep the old buffer reachable so the error path can free it
            grown = realloc(compBuf, o->size[i]);
            if(!grown) goto error;
            compBuf = grown;
            sz = o->size[i];
        }
        if(!compBuf) goto error;

        if(bwRead(compBuf, o->size[i], 1, fp) != 1) goto error;
        if(compressed) {
            tmp = fp->hdr->bufSize; //This gets over-written by uncompress
            rv = uncompress(buf, &tmp, compBuf, o->size[i]);
            if(rv != Z_OK) goto error;
        } else {
            buf = compBuf;
            tmp = o->size[i]; //TODO: Is this correct? Do non-gzipped bigBeds exist?
        }

        //Walk the block with a separate cursor; buf always keeps pointing at
        //the start of the allocation so it can be free()d on any path
        cur = buf;
        bufEnd = cur + tmp;
        while(cur < bufEnd) {
            if((size_t)(bufEnd - cur) < fixedSize) goto error; //truncated record

            //Records are not necessarily 4-byte aligned, so copy the fields out
            memcpy(&entryTid, cur, sizeof(uint32_t));
            memcpy(&start, cur + sizeof(uint32_t), sizeof(uint32_t));
            memcpy(&end, cur + 2 * sizeof(uint32_t), sizeof(uint32_t));
            str = cur + fixedSize;

            //The string must be terminated inside the block
            nul = memchr(str, '\0', (size_t)(bufEnd - str));
            if(!nul) goto error;
            cur = nul + 1;

            if(entryTid < tid) continue;
            if(entryTid > tid) break;
            if(end <= ostart) continue;
            if(start >= oend) break;

            //Push the overlap
            if(!pushBBIntervals(output, start, end, str, withString)) {
                //pushBBIntervals() already destroyed output; don't free it again
                output = NULL;
                goto error;
            }
        }
    }

    if(compressed && buf) free(buf);
    if(compBuf) free(compBuf);
    return output;

error:
    fprintf(stderr, "[bbGetOverlappingEntriesCore] Got an error\n");
    if(output) bbDestroyOverlappingEntries(output);
    //In the uncompressed case buf aliases compBuf, so only free it once
    if(compressed && buf) free(buf);
    if(compBuf) free(compBuf);
    return NULL;
}
|
561
|
+
|
562
|
+
//Returns NULL on error OR no intervals, which is a bad design...
|
563
|
+
bwOverlappingIntervals_t *bwGetOverlappingIntervals(bigWigFile_t *fp, const char *chrom, uint32_t start, uint32_t end) {
|
564
|
+
bwOverlappingIntervals_t *output;
|
565
|
+
uint32_t tid = bwGetTid(fp, chrom);
|
566
|
+
if(tid == (uint32_t) -1) return NULL;
|
567
|
+
bwOverlapBlock_t *blocks = bwGetOverlappingBlocks(fp, chrom, start, end);
|
568
|
+
if(!blocks) return NULL;
|
569
|
+
output = bwGetOverlappingIntervalsCore(fp, blocks, tid, start, end);
|
570
|
+
destroyBWOverlapBlock(blocks);
|
571
|
+
return output;
|
572
|
+
}
|
573
|
+
|
574
|
+
//Like above, but for bigBed files
|
575
|
+
bbOverlappingEntries_t *bbGetOverlappingEntries(bigWigFile_t *fp, const char *chrom, uint32_t start, uint32_t end, int withString) {
|
576
|
+
bbOverlappingEntries_t *output;
|
577
|
+
uint32_t tid = bwGetTid(fp, chrom);
|
578
|
+
if(tid == (uint32_t) -1) return NULL;
|
579
|
+
bwOverlapBlock_t *blocks = bwGetOverlappingBlocks(fp, chrom, start, end);
|
580
|
+
if(!blocks) return NULL;
|
581
|
+
output = bbGetOverlappingEntriesCore(fp, blocks, tid, start, end, withString);
|
582
|
+
destroyBWOverlapBlock(blocks);
|
583
|
+
return output;
|
584
|
+
}
|
585
|
+
|
586
|
+
//Returns NULL on error
|
587
|
+
bwOverlapIterator_t *bwOverlappingIntervalsIterator(bigWigFile_t *fp, const char *chrom, uint32_t start, uint32_t end, uint32_t blocksPerIteration) {
|
588
|
+
bwOverlapIterator_t *output = NULL;
|
589
|
+
uint64_t n;
|
590
|
+
uint32_t tid = bwGetTid(fp, chrom);
|
591
|
+
if(tid == (uint32_t) -1) return output;
|
592
|
+
output = calloc(1, sizeof(bwOverlapIterator_t));
|
593
|
+
if(!output) return output;
|
594
|
+
bwOverlapBlock_t *blocks = bwGetOverlappingBlocks(fp, chrom, start, end);
|
595
|
+
|
596
|
+
output->bw = fp;
|
597
|
+
output->tid = tid;
|
598
|
+
output->start = start;
|
599
|
+
output->end = end;
|
600
|
+
output->blocks = blocks;
|
601
|
+
output->blocksPerIteration = blocksPerIteration;
|
602
|
+
|
603
|
+
if(blocks) {
|
604
|
+
n = blocks->n;
|
605
|
+
if(n>blocksPerIteration) blocks->n = blocksPerIteration;
|
606
|
+
output->intervals = bwGetOverlappingIntervalsCore(fp, blocks,tid, start, end);
|
607
|
+
blocks->n = n;
|
608
|
+
output->offset = blocksPerIteration;
|
609
|
+
}
|
610
|
+
output->data = output->intervals;
|
611
|
+
return output;
|
612
|
+
}
|
613
|
+
|
614
|
+
//Returns NULL on error
|
615
|
+
bwOverlapIterator_t *bbOverlappingEntriesIterator(bigWigFile_t *fp, const char *chrom, uint32_t start, uint32_t end, int withString, uint32_t blocksPerIteration) {
|
616
|
+
bwOverlapIterator_t *output = NULL;
|
617
|
+
uint64_t n;
|
618
|
+
uint32_t tid = bwGetTid(fp, chrom);
|
619
|
+
if(tid == (uint32_t) -1) return output;
|
620
|
+
output = calloc(1, sizeof(bwOverlapIterator_t));
|
621
|
+
if(!output) return output;
|
622
|
+
bwOverlapBlock_t *blocks = bwGetOverlappingBlocks(fp, chrom, start, end);
|
623
|
+
|
624
|
+
output->bw = fp;
|
625
|
+
output->tid = tid;
|
626
|
+
output->start = start;
|
627
|
+
output->end = end;
|
628
|
+
output->blocks = blocks;
|
629
|
+
output->blocksPerIteration = blocksPerIteration;
|
630
|
+
output->withString = withString;
|
631
|
+
|
632
|
+
if(blocks) {
|
633
|
+
n = blocks->n;
|
634
|
+
if(n>blocksPerIteration) blocks->n = blocksPerIteration;
|
635
|
+
output->entries = bbGetOverlappingEntriesCore(fp, blocks,tid, start, end, withString);
|
636
|
+
blocks->n = n;
|
637
|
+
output->offset = blocksPerIteration;
|
638
|
+
}
|
639
|
+
output->data = output->entries;
|
640
|
+
return output;
|
641
|
+
}
|
642
|
+
|
643
|
+
//Frees an iterator together with its block list and whichever batch
//(intervals or entries) it currently holds.
//Passing NULL is allowed and does nothing.
void bwIteratorDestroy(bwOverlapIterator_t *iter) {
    if(iter) {
        //Each destroy function ignores a NULL argument
        destroyBWOverlapBlock((bwOverlapBlock_t*) iter->blocks);
        bwDestroyOverlappingIntervals(iter->intervals);
        bbDestroyOverlappingEntries(iter->entries);
        free(iter);
    }
}
|
650
|
+
|
651
|
+
//On error, points to NULL and destroys the input
|
652
|
+
bwOverlapIterator_t *bwIteratorNext(bwOverlapIterator_t *iter) {
|
653
|
+
uint64_t n, *offset, *size;
|
654
|
+
bwOverlapBlock_t *blocks = iter->blocks;
|
655
|
+
|
656
|
+
if(iter->intervals) {
|
657
|
+
bwDestroyOverlappingIntervals(iter->intervals);
|
658
|
+
iter->intervals = NULL;
|
659
|
+
}
|
660
|
+
if(iter->entries) {
|
661
|
+
bbDestroyOverlappingEntries(iter->entries);
|
662
|
+
iter->entries = NULL;
|
663
|
+
}
|
664
|
+
iter->data = NULL;
|
665
|
+
|
666
|
+
if(iter->offset < blocks->n) {
|
667
|
+
//store the previous values
|
668
|
+
n = blocks->n;
|
669
|
+
offset = blocks->offset;
|
670
|
+
size = blocks->size;
|
671
|
+
|
672
|
+
//Move the start of the blocks
|
673
|
+
blocks->offset += iter->offset;
|
674
|
+
blocks->size += iter->offset;
|
675
|
+
if(iter->offset + iter->blocksPerIteration > n) {
|
676
|
+
blocks->n = blocks->n - iter->offset;
|
677
|
+
} else {
|
678
|
+
blocks->n = iter->blocksPerIteration;
|
679
|
+
}
|
680
|
+
|
681
|
+
//Get the intervals or entries, as appropriate
|
682
|
+
if(iter->bw->type == 0) {
|
683
|
+
//bigWig
|
684
|
+
iter->intervals = bwGetOverlappingIntervalsCore(iter->bw, blocks, iter->tid, iter->start, iter->end);
|
685
|
+
iter->data = iter->intervals;
|
686
|
+
} else {
|
687
|
+
//bigBed
|
688
|
+
iter->entries = bbGetOverlappingEntriesCore(iter->bw, blocks, iter->tid, iter->start, iter->end, iter->withString);
|
689
|
+
iter->data = iter->entries;
|
690
|
+
}
|
691
|
+
iter->offset += iter->blocksPerIteration;
|
692
|
+
|
693
|
+
//reset the values in iter->blocks
|
694
|
+
blocks->n = n;
|
695
|
+
blocks->offset = offset;
|
696
|
+
blocks->size = size;
|
697
|
+
|
698
|
+
//Check for error
|
699
|
+
if(!iter->intervals && !iter->entries) goto error;
|
700
|
+
}
|
701
|
+
|
702
|
+
return iter;
|
703
|
+
|
704
|
+
error:
|
705
|
+
bwIteratorDestroy(iter);
|
706
|
+
return NULL;
|
707
|
+
}
|
708
|
+
|
709
|
+
//This is like bwGetOverlappingIntervals, except it returns 1 base windows. If includeNA is not 0, then a value will be returned for every position in the range (defaulting to NAN).
|
710
|
+
//The ->end member is NULL
|
711
|
+
//If includeNA is not 0 then ->start is also NULL, since it's implied
|
712
|
+
//Note that bwDestroyOverlappingIntervals() will work in either case
|
713
|
+
bwOverlappingIntervals_t *bwGetValues(bigWigFile_t *fp, const char *chrom, uint32_t start, uint32_t end, int includeNA) {
|
714
|
+
uint32_t i, j, n;
|
715
|
+
bwOverlappingIntervals_t *output = NULL;
|
716
|
+
bwOverlappingIntervals_t *intermediate = bwGetOverlappingIntervals(fp, chrom, start, end);
|
717
|
+
if(!intermediate) return NULL;
|
718
|
+
|
719
|
+
output = calloc(1, sizeof(bwOverlappingIntervals_t));
|
720
|
+
if(!output) goto error;
|
721
|
+
if(includeNA) {
|
722
|
+
output->l = end-start;
|
723
|
+
output->value = malloc(output->l*sizeof(float));
|
724
|
+
if(!output->value) goto error;
|
725
|
+
for(i=0; i<output->l; i++) output->value[i] = NAN;
|
726
|
+
for(i=0; i<intermediate->l; i++) {
|
727
|
+
for(j=intermediate->start[i]; j<intermediate->end[i]; j++) {
|
728
|
+
if(j < start || j >= end) continue;
|
729
|
+
output->value[j-start] = intermediate->value[i];
|
730
|
+
}
|
731
|
+
}
|
732
|
+
} else {
|
733
|
+
n = 0;
|
734
|
+
for(i=0; i<intermediate->l; i++) {
|
735
|
+
if(intermediate->start[i] < start) intermediate->start[i] = start;
|
736
|
+
if(intermediate->end[i] > end) intermediate->end[i] = end;
|
737
|
+
n += intermediate->end[i]-intermediate->start[i];
|
738
|
+
}
|
739
|
+
output->l = n;
|
740
|
+
output->start = malloc(sizeof(uint32_t)*n);
|
741
|
+
if(!output->start) goto error;
|
742
|
+
output->value = malloc(sizeof(float)*n);
|
743
|
+
if(!output->value) goto error;
|
744
|
+
n = 0; //this is now the index
|
745
|
+
for(i=0; i<intermediate->l; i++) {
|
746
|
+
for(j=intermediate->start[i]; j<intermediate->end[i]; j++) {
|
747
|
+
if(j < start || j >= end) continue;
|
748
|
+
output->start[n] = j;
|
749
|
+
output->value[n++] = intermediate->value[i];
|
750
|
+
}
|
751
|
+
}
|
752
|
+
}
|
753
|
+
|
754
|
+
bwDestroyOverlappingIntervals(intermediate);
|
755
|
+
return output;
|
756
|
+
|
757
|
+
error:
|
758
|
+
if(intermediate) bwDestroyOverlappingIntervals(intermediate);
|
759
|
+
if(output) bwDestroyOverlappingIntervals(output);
|
760
|
+
return NULL;
|
761
|
+
}
|
762
|
+
|
763
|
+
//Recursively frees an R-tree node, its arrays and, for a non-leaf node, every
//child that has been loaded. Passing NULL is allowed and does nothing.
void bwDestroyIndexNode(bwRTreeNode_t *node) {
    uint16_t k;

    if(!node) return;

    if(node->isLeaf) {
        free(node->x.size);
    } else {
        //Children that were never loaded are NULL and are skipped by the recursion
        for(k=0; k<node->nChildren; k++) {
            bwDestroyIndexNode(node->x.child[k]);
        }
        free(node->x.child);
    }
    free(node->dataOffset);
    free(node->baseEnd);
    free(node->chrIdxEnd);
    free(node->baseStart);
    free(node->chrIdxStart);
    free(node);
}
|
783
|
+
|
784
|
+
//Frees an index: the tree of nodes hanging off idx->root, then idx itself.
//Passing NULL is allowed and does nothing, like the other destroy functions
//in this file.
void bwDestroyIndex(bwRTree_t *idx) {
    if(!idx) return;
    bwDestroyIndexNode(idx->root);
    free(idx);
}
|
788
|
+
|
789
|
+
//Returns a pointer to the requested index (@offset, unless it's 0, in which case the index for the values is returned
|
790
|
+
//Returns NULL on error
|
791
|
+
bwRTree_t *bwReadIndex(bigWigFile_t *fp, uint64_t offset) {
|
792
|
+
bwRTree_t *idx = readRTreeIdx(fp, offset);
|
793
|
+
if(!idx) return NULL;
|
794
|
+
|
795
|
+
//Read in the root node
|
796
|
+
idx->root = bwGetRTreeNode(fp, idx->rootOffset);
|
797
|
+
|
798
|
+
if(!idx->root) {
|
799
|
+
bwDestroyIndex(idx);
|
800
|
+
return NULL;
|
801
|
+
}
|
802
|
+
return idx;
|
803
|
+
}
|